# -*- coding:Utf-8 -*-
import urllib2

def format_gdata(url):
    channel = url.split('/')[4]
    channel = 'http://gdata.youtube.com/feeds/base/users/' + channel + '/uploads'
    return channel
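
# A minimal sketch of the expected input/output (the example channel URL is an
# assumption, not from the original paste): for a channel page such as
#   http://www.youtube.com/user/SomeChannel
# split('/')[4] yields 'SomeChannel', so format_gdata() returns
#   http://gdata.youtube.com/feeds/base/users/SomeChannel/uploads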

def open_url(url):
    data = urllib2.urlopen(url)
    return data.read()

def parse_html_source(htmlSource, url_channel):
    #get video URLs --------------------------------
    start_link = "<link rel='alternate' type='text/html' href='"
    end_link = "&feature=youtube_gdata"
    video = []
    lien = htmlSource.split(start_link)[1:]
    for element in lien:
        video.append(element.split(end_link)[0])
    #get video titles --------------------------------
    start_title = "<title type='text'>"
    end_title = "</title><content"
    titre = htmlSource.split(start_title)[1:]
    title = []
    for element in titre:
        title.append(element.split(end_title)[0])
    #the first title is the feed description; pop it and keep the channel name
    channel_name = title.pop(0).split('</title>')[0]
    channel_name = channel_name.split('Uploads by ')[1]
    print channel_name
    #get video thumbnails --------------------------------
    start_image = 'alt="" src="'  #used in the second split
    end_image = '"></'  #used in the first split
    images = htmlSource.split(end_image)
    image = []  #collected but not used further
    for img in images:
        if '.jpg' in img:
            image.append(img.split(start_image)[1])
    #drop the first extracted link so the lists line up, then pair videos with titles
    video.pop(0)
    liste = zip(video, title)
    #index 0 holds (channel URL, channel name), the rest are (video URL, title) pairs
    liste.insert(0, (url_channel, channel_name))
    return liste

def check_parsed_channel(channel, first_video_link):
    #first, get the previously parsed data in memory
    fichier = open('C:/Users/name/Desktop/parsed_channel.txt', 'r')
    data = fichier.readlines()
    fichier.close()
    #if the file is empty (first run)
    if len(data) < 1:
        return 1
    for line in data:
        if channel in line:
            #print data[data.index(line)+1]
            if first_video_link in data[data.index(line)+1]:
                print 'channel already updated'
                return 0
            else:
                print 'new video available'
                return 1
    return 1

def write_parsed_channel(list_video):
    fichier = open('C:/Users/name/Desktop/parsed_channel.txt', 'a+')
    for video in list_video:
        if 'user' in video[0] and 'watch' not in video[0] and list_video.index(video) != 0:
            fichier.write('\n')  #blank-line delimiter before each new channel
        fichier.write(video[0] + ' ' + video[1] + '\n')
    fichier.write('\n')  #delimiter between different channels at end of file
    fichier.close()

fichier = open('C:/Users/name/Desktop/chaine.txt', 'r')
#read every line of the file into a list
data = fichier.readlines()
fichier.close()
i = 0
for element in data:
    data[i] = element.replace('\n', '')
    i += 1
#links = list(set(data))
links = data
first_links = []
i = 0
#take the first three links; after processing they are appended back onto the main list 'links'
while i < 3:  #true limit is 3
    a = links.pop(0)
    first_links.append(a)
    i += 1
liste_videos = []
video_list_checked = []
for link in first_links:
    url_channel = link
    if 'gdata' not in link:
        url = format_gdata(link)
        #first_links[first_links.index(link)] = url
        #link = url
    else:
        url = link
    #open the URL and parse the returned feed source
    liste_videos = parse_html_source(open_url(url), url_channel)
    #print liste_videos
    #channel URL + first video link of this channel
    check = check_parsed_channel(liste_videos[0][0], liste_videos[1][0])
    print liste_videos[0][0], '\n', liste_videos[1][0], '\n\n'
    if check:
        video_list_checked += liste_videos

#write parsed_channel
if len(video_list_checked):
    write_parsed_channel(video_list_checked)

links = links + first_links
fichier = open('C:/Users/name/Desktop/chaine.txt', 'w')
for link in links:
    fichier.write(link + '\n')
fichier.close()
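
# A rough sketch (assumed example values, not from the original paste) of the two
# text files the script works with:
#
# chaine.txt -- one channel page URL per line, e.g.
#   http://www.youtube.com/user/SomeChannel
#   http://www.youtube.com/user/AnotherChannel
#
# parsed_channel.txt -- what write_parsed_channel() appends: the channel line
# followed by one "video_url title" line per upload, with channels separated by
# a blank line, e.g.
#   http://www.youtube.com/user/SomeChannel SomeChannel
#   http://www.youtube.com/watch?v=XXXXXXXXXXX Some video title
#
# check_parsed_channel() compares the line after a channel's entry with the
# first video link from the freshly parsed feed to decide whether the channel
# has anything new.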