# youtube parser

# -*- coding:Utf-8 -*-
import urllib2

def format_gdata(url):
    """Build the gdata uploads-feed URL for a channel page URL.

    The channel identifier is the fifth '/'-separated component of *url*
    (index 4), e.g. ``NAME`` in ``http://www.youtube.com/user/NAME``.

    Raises:
        IndexError: if *url* has fewer than five '/'-separated components.
    """
    feed_prefix = 'http://gdata.youtube.com/feeds/base/users/'
    feed_suffix = '/uploads'

    components = url.split('/')
    user = components[4]
    return ''.join([feed_prefix, user, feed_suffix])

def open_url(url):
    """Fetch *url* with urllib2 and return the whole response body.

    The response object is always closed, even when reading fails, so the
    underlying connection is not leaked.

    Exceptions raised by ``urllib2.urlopen`` or by ``read`` propagate to the
    caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def parse_html_source(htmlSource, url_channel):
    """Extract the channel name and (link, title) pairs from a feed source.

    The source is scanned with plain string splitting (no XML parser). The
    first ``<link rel='alternate' ...>`` match and the first
    ``<title type='text'>`` match are treated as belonging to the feed itself
    rather than to a video; the remaining matches are paired up in order.

    The channel name is printed to stdout as a side effect.

    Args:
        htmlSource: text of the uploads feed.
        url_channel: channel URL, stored as-is in the first returned tuple.

    Returns:
        A list of 2-tuples. Element 0 is ``(url_channel, channel_name)``;
        every following element is ``(video_link, video_title)``.

    Raises:
        IndexError: if the source contains no ``<title type='text'>`` element.
    """
    # Video links: text between the link marker and the gdata tracking suffix.
    start_link = "<link rel='alternate' type='text/html' href='"
    end_link = "&amp;feature=youtube_gdata"
    links = [chunk.split(end_link)[0]
             for chunk in htmlSource.split(start_link)[1:]]

    # Titles: text between the title marker and the start of <content>.
    start_title = "<title type='text'>"
    end_title = "</title><content"
    titles = [chunk.split(end_title)[0]
              for chunk in htmlSource.split(start_title)[1:]]

    # The first title is the feed title; it is not followed by <content>, so
    # it still has to be cut at its closing tag.
    feed_title = titles[0].split('</title>')[0]

    # Strip the 'Uploads by ' prefix when present; otherwise keep the raw
    # title instead of failing on a feed with a different heading.
    _, marker, name = feed_title.partition('Uploads by ')
    channel_name = name if marker else feed_title
    print(channel_name)

    # NOTE: thumbnails are deliberately not extracted; nothing in the
    # returned value uses them.

    # Skip the feed-level link/title and pair up the per-video ones.
    videos = list(zip(links[1:], titles[1:]))
    return [(url_channel, channel_name)] + videos

def check_parsed_channel(channel, first_video_link,
                         path='C:/Users/name/Desktop/parsed_channel.txt'):
    """Tell whether a channel needs to be (re)recorded.

    The record file is searched for the first line containing *channel*; the
    line right after it is expected to hold the most recent video that was
    recorded for that channel.

    Args:
        channel: text identifying the channel line (substring match).
        first_video_link: link of the newest video currently in the feed.
        path: location of the record file.

    Returns:
        0 if the line following the channel line contains
        *first_video_link* (nothing new), 1 in every other case: the file is
        missing or empty, the channel is unknown, or the recorded video
        differs.

    Raises:
        IOError: if the file exists but cannot be opened or read.
    """
    import errno

    try:
        fichier = open(path, 'r')
    except IOError as err:
        # A missing file means nothing has been recorded yet (first run).
        if err.errno != errno.ENOENT:
            raise
        return 1

    with fichier:
        data = fichier.readlines()

    for position, line in enumerate(data):
        if channel not in line:
            continue

        # The channel line may be the last one in the file, in which case
        # there is no recorded video to compare against.
        has_next = position + 1 < len(data)
        if has_next and first_video_link in data[position + 1]:
            print('channel already updated')
            return 0

        print('new video availaible')
        return 1

    return 1

def write_parsed_channel(list_video,
                         path='C:/Users/name/Desktop/parsed_channel.txt'):
    """Append the given (link, label) pairs to the record file.

    Each pair is written as ``link label`` on its own line. An entry whose
    link contains 'user' but not 'watch' is treated as a channel header: a
    blank delimiter line is written before it, except for the very first
    entry. A final blank line closes the batch.

    Args:
        list_video: sequence of 2-item sequences of strings.
        path: location of the record file; it is created if missing.
    """
    with open(path, 'a+') as fichier:
        # Use the position, not list.index(): index() returns the first equal
        # entry, so a header repeated later in the list would be mistaken for
        # the first entry and lose its delimiter.
        for position, video in enumerate(list_video):
            is_channel_header = 'user' in video[0] and 'watch' not in video[0]
            if is_channel_header and position != 0:
                fichier.write('\n')  # delimiter between channels
            fichier.write(video[0] + ' ' + video[1] + '\n')

        fichier.write('\n')  # delimiter after the last channel of the batch


# Load the channel queue: one channel link per line.
with open('C:/Users/name/Desktop/chaine.txt', 'r') as fichier:
    data = [line.rstrip('\r\n') for line in fichier]

# Drop blank lines: an empty entry cannot be turned into a feed URL.
data = [line for line in data if line]

#links = list(set(data))
links = data

# Handle at most three links per run. They are moved to the end of the queue
# afterwards, so successive runs rotate through the whole file.
batch_size = min(3, len(links))
first_links = links[:batch_size]
links = links[batch_size:]

liste_videos = []
video_list_checked = []

for link in first_links:
    url_channel = link

    # Links that already point at the gdata feed are fetched as they are;
    # anything else is converted first. `url` is assigned on both paths so a
    # gdata link never reuses the URL of the previous channel.
    if 'gdata' in link:
        url = link
    else:
        url = format_gdata(link)

    # Fetch the feed source and parse it.
    liste_videos = parse_html_source(open_url(url), url_channel)

    # Only the channel entry came back: there is no video to compare or record.
    if len(liste_videos) < 2:
        continue

    # channel URL + first video link of this channel
    check = check_parsed_channel(liste_videos[0][0], liste_videos[1][0])

    # Single-argument form of: print url, '\n', link, '\n\n'
    print('%s \n%s \n\n' % (liste_videos[0][0], liste_videos[1][0]))
    if check:
        video_list_checked += liste_videos

# Record every channel that has something new.
if video_list_checked:
    write_parsed_channel(video_list_checked)

# Rotate the queue: the links handled in this run go to the end.
links = links + first_links

with open('C:/Users/name/Desktop/chaine.txt', 'w') as fichier:
    for link in links:
        fichier.write(link + '\n')