Guest User

youtube parser

a guest
Dec 22nd, 2014
220
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.92 KB | None | 0 0
  1. # -*- coding:Utf-8 -*-
  2. import urllib2
  3.  
  4. def format_gdata(url):
  5.     channel = url.split('/')[4]
  6.     channel = 'http://gdata.youtube.com/feeds/base/users/'+channel+'/uploads'
  7.  
  8.     return channel
  9.  
  10. def open_url(url):
  11.     data = urllib2.urlopen(url)
  12.     return data.read()
  13.  
  14.  
  15. def parse_html_source(htmlSource, url_channel):
  16.     #get url video --------------------------------
  17.     start_link = "<link rel='alternate' type='text/html' href='"
  18.     end_link = "&amp;feature=youtube_gdata"
  19.     video = []
  20.  
  21.     lien = htmlSource.split(start_link)[1:]
  22.  
  23.     for element in lien:
  24.         video.append(element.split(end_link)[0])
  25.  
  26.     #get title video --------------------------------
  27.     start_title = "<title type='text'>"
  28.     end_title = "</title><content"
  29.  
  30.     titre = htmlSource.split(start_title)[1:]
  31.     title = []
  32.  
  33.     for element in titre:
  34.         title.append(element.split(end_title)[0])
  35.  
  36.     #remove first element description
  37.     channel_name = title.pop(0).split('</title>')[0]
  38.     channel_name = channel_name.split('Uploads by ')[1]
  39.     print channel_name
  40.  
  41.     #get image video --------------------------------
  42.     start_image = 'alt="" src="' #use in second
  43.     end_image = '"&gt;&lt;/' #use in first
  44.  
  45.     images = htmlSource.split(end_image)
  46.     image = []
  47.  
  48.     for img in images:
  49.         if '.jpg' in img:
  50.             image.append(img.split(start_image)[1])
  51.     video.pop(0)
  52.  
  53.  
  54.     liste = zip(video, title)
  55.     liste.insert(0, (url_channel, channel_name))
  56.  
  57.     return liste
  58.  
  59. def check_parsed_channel(channel, first_video_link):
  60.     #first, get data in memory
  61.     fichier = open('C:/Users/name/Desktop/parsed_channel.txt', 'r')
  62.     data = fichier.readlines()
  63.  
  64.     #if file is empty (first time used)
  65.     if len(data)<1:
  66.         return 1
  67.  
  68.     for line in data:
  69.         if channel in line:
  70.             #print data[data.index(line)+1]
  71.             if first_video_link in data[data.index(line)+1]:
  72.                 print 'channel already updated'
  73.                 return 0
  74.             else:
  75.                 print 'new video availaible'
  76.                 return 1
  77.  
  78.     return 1
  79.  
  80. def write_parsed_channel(list_video):
  81.     fichier = open('C:/Users/name/Desktop/parsed_channel.txt', 'a+')
  82.  
  83.     for video in list_video:
  84.         if 'user' in video[0] and not 'watch' in video[0] and list_video.index(video) != 0:
  85.             fichier.write('\n') #add delimiter
  86.         fichier.write(video[0]+' '+video[1]+'\n')
  87.  
  88.     fichier.write('\n') #add delimiter between different channel in eof
  89.     fichier.close()
  90.  
  91.  
  92.  
  93. fichier = open('C:/Users/name/Desktop/chaine.txt', 'r')
  94.  
  95. #récupérer toutes les lignes du fichiers sous forme d'une liste
  96. data = fichier.readlines()
  97. fichier.close()
  98.  
  99. i = 0
  100. for element in data:
  101.     data[i] = element.replace('\n', '')
  102.     i+=1
  103.  
  104. #links = list(set(data))
  105. links = data
  106. first_links = []
  107.  
  108. i=0
  109.  
  110. #take first three links, after some operation, they will append on the main list 'links'
  111. while i<3: #true limit is 3
  112.     a = links.pop(0)
  113.     first_links.append(a)
  114.     i+=1
  115.  
  116. liste_videos = []
  117. video_list_checked = []
  118.  
  119. for link in first_links:
  120.     url_channel = link
  121.  
  122.     if not 'gdata' in link:
  123.         url = format_gdata(link)
  124.         #first_links[first_links.index(link)] = url
  125.         #link = url
  126.  
  127.     #open_url(link) and get hmlt source to parse the data
  128.     liste_videos = parse_html_source(open_url(url), url_channel)
  129.     #print liste_videos
  130.     #print a[0],'\n',a[1]
  131.  
  132.     #channel name + first video link of this channel
  133.     check = check_parsed_channel(liste_videos[0][0], liste_videos[1][0])
  134.  
  135.     print liste_videos[0][0],'\n', liste_videos[1][0],'\n\n'
  136.     if check:
  137.         video_list_checked +=liste_videos
  138.  
  139. #write parsed_channel
  140. if len(video_list_checked):
  141.     write_parsed_channel(video_list_checked)
  142.  
  143.  
  144. links = links + first_links
  145.  
  146. fichier = open('C:/Users/name/Desktop/chaine.txt', 'w')
  147.  
  148. for link in links:
  149.     fichier.write(link+'\n')
  150.  
  151. fichier.close()
Advertisement
Add Comment
Please, Sign In to add comment