import urllib2, htmllib, formatter
import multiprocessing.dummy as multiprocessing
import xml.dom.minidom
import os
import string, random
from urlparse import parse_qs, urlparse

from useful_util import retry
import config
from logger import log

class LinksExtractor(htmllib.HTMLParser):
    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []
        self.ignoredSites = config.WebParser_ignoredSites

    def start_a(self, attrs):
        # Collect every <a href> that points at an .mp3 file,
        # skipping links that belong to an ignored site.
        for attr in attrs:
            if attr[0] == "href" and attr[1].endswith(".mp3"):
                if not any(site in attr[1] for site in self.ignoredSites):
                    self.links.append(attr[1])

    def get_links(self):
        return self.links


def GetLinks(url, returnMetaUrlObj=False):
    '''
    Function gathers .mp3 links from a url.
    @param url: Url address.
    @param returnMetaUrlObj: If True, returns a list of MetaUrl objects.
                             Else, returns a list of strings. Default is False.

    @return links: The unique .mp3 links found on the page.
    '''
    htmlparser = LinksExtractor(formatter.NullFormatter())

    try:
        data = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError) as e:
        log.error(e)
        return []
    htmlparser.feed(data.read())
    htmlparser.close()

    links = list(set(htmlparser.get_links()))

    if returnMetaUrlObj:
        links = map(MetaUrl, links)

    return links

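# Example (sketch; hypothetical URL): scrape a page for .mp3 links.
#
#   for metaUrl in GetLinks("http://example.com/songs.html", returnMetaUrlObj=True):
#       print metaUrl.url
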
def isAscii(s):
    "Function checks if the string is ASCII."
    try:
        s.decode('ascii')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return False
    return True

@retry(Exception, logger=log)
def parse(song, source):
    '''
    Function parses the source search page and returns the .mp3 links in it.
    @param song: Search string.
    @param source: Search website source. Value can be dilandau, mp3skull, youtube or seekasong.

    @return links: .mp3 url links.
    '''
    source = source.lower()
    if source == "dilandau":
        return parse_dilandau(song)
    elif source == "mp3skull":
        return parse_Mp3skull(song)
    elif source == "seekasong": # compare against the lowercased name; "SeekASong" could never match
        return parse_SeekASong(song)
    elif source == "youtube":
        return parse_Youtube(song)

    log.error('no source "%s". (from parse function in WebParser)' % source)
    return []

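# Example (sketch): dispatch a search to a single source by name.
#
#   links = parse("how i met your mother", "mp3skull")
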
def parse_dilandau(song, pages=1):
    "Function connects to Dilandau.eu and returns the .mp3 links in it"
    if not isAscii(song): # Dilandau doesn't like unicode.
        log.warning("Song is not ASCII. Skipping on dilandau")
        return []

    links = []
    song = urllib2.quote(song.encode("utf8"))

    for i in range(pages):
        url = 'http://en.dilandau.eu/download_music/%s-%d.html' % (song.replace('-','').replace(' ','-').replace('--','-').lower(), i+1)
        log.debug("[Dilandau] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Dilandau] found %d links" % len(links))

    for metaUrl in links:
        metaUrl.source = "Dilandau"

    return links

def parse_Mp3skull(song, pages=1):
    "Function connects to mp3skull.com and returns the .mp3 links in it"
    links = []
    song = urllib2.quote(song.encode("utf8"))

    for i in range(pages):
        # http://mp3skull.com/mp3/how_i_met_your_mother.html
        url = 'http://mp3skull.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
        log.debug("[Mp3skull] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Mp3skull] found %d links" % len(links))

    for metaUrl in links:
        metaUrl.source = "Mp3skull"

    return links

def parse_SeekASong(song):
    "Function connects to seekasong.com and returns the .mp3 links in it"
    song = urllib2.quote(song.encode("utf8"))

    url = 'http://www.seekasong.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
    log.debug("[SeekASong] Parsing %s... " % url)
    links = GetLinks(url, returnMetaUrlObj=True)
    for metaUrl in links:
        metaUrl.source = "SeekASong"
    log.debug("[SeekASong] found %d links" % len(links))

    return links

def parse_Youtube(song, amount=10):
    '''
    Function searches a song in youtube.com and returns the clips in it using Youtube API.
    @param song: The search string.
    @param amount: Amount of clips to obtain.

    @return links: List of links.
    '''
    song = urllib2.quote(song.encode("utf8"))
    url = r"http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=%d&v=2" % (song.replace(' ', '+'), amount)
    urlObj = urllib2.urlopen(url, timeout=4)
    data = urlObj.read()
    videos = xml.dom.minidom.parseString(data).getElementsByTagName('feed')[0].getElementsByTagName('entry')

    links = []
    for video in videos:
        youtube_watchurl = video.getElementsByTagName('link')[0].attributes.item(0).value
        links.append(get_youtube_hightest_quality_link(youtube_watchurl))

    return links

def get_youtube_hightest_quality_link(youtube_watchurl, priority=config.youtube_quality_priority):
    '''
    Function returns the highest quality link for a specific youtube clip.
    @param youtube_watchurl: The Youtube watch url.
    @param priority: A list representing the quality priority, best first.

    @return MetaUrlObj: MetaUrl Object.
    '''
    video_id = parse_qs(urlparse(youtube_watchurl).query)['v'][0]
    youtube_embedded_watchurl = "http://www.youtube.com/embed/%s?autoplay=1" % video_id

    d = get_youtube_dl_links(video_id)
    for quality in priority:
        if quality in d:
            return MetaUrl(d[quality][0], 'youtube', d['VideoName'], quality, youtube_embedded_watchurl)
    log.error("No Youtube link has been found in get_youtube_hightest_quality_link.")
    return ""

@retry(Exception, logger=log)
def get_youtube_dl_links(video_id):
    '''
    Function gets the download links for a youtube clip.
    This function parses the get_video_info format of youtube.

    @param video_id: Youtube Video ID.
    @return d: A dictionary of qualities as keys and urls as values.
    '''
    d = {}

    url = r"http://www.youtube.com/get_video_info?video_id=%s&el=vevo" % video_id

    urlObj = urllib2.urlopen(url, timeout=12)
    data = urlObj.read()
    data = urllib2.unquote(urllib2.unquote(urllib2.unquote(data)))
    data = data.replace(',url', '\nurl')
    data = data.split('\n')

    for line in data:
        if 'timedtext' in line or 'status=fail' in line or '<AdBreaks>' in line:
            continue

        try:
            url = line.split('&quality=')[0].split('url=')[1]
            quality = line.split('&quality=')[1].split('&')[0]
        except IndexError: # line holds no url/quality pair
            continue
        if quality in d:
            d[quality].append(url)
        else:
            d[quality] = [url]

    try:
        videoName = "|".join(data).split('&title=')[1].split('&')[0]
    except Exception, e:
        log.error("Could not parse VideoName out of get_video_info (%s)" % str(e))
        videoName = ""

    videoName = unicode(videoName, 'utf-8')
    d['VideoName'] = videoName.replace('+', ' ').replace('--', '-')
    return d

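# Sketch of the mapping get_youtube_dl_links returns (illustrative values;
# the actual quality names come from the get_video_info response):
#
#   {'hd720':     ['http://...videoplayback?...'],
#    'medium':    ['http://...videoplayback?...'],
#    'VideoName': u'Artist - Song'}
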
class NextList(object):
    "A list with a 'next' method."
    def __init__(self, l):
        self.l = l
        self.next_index = 0

    def next(self):
        # Returns the next item, or None once the list is exhausted.
        if self.next_index < len(self.l):
            value = self.l[self.next_index]
            self.next_index += 1
            return value
        else:
            return None

    def isEOF(self):
        "Checks if the list has reached the end."
        return self.next_index >= len(self.l)

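# Example (sketch): NextList hands out one item per call.
#
#   nl = NextList(['a', 'b'])
#   nl.next()   # -> 'a'
#   nl.next()   # -> 'b'
#   nl.isEOF()  # -> True
#   nl.next()   # -> None
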
class MetaUrl(object):
    "A url structure holding extra metadata."
    def __init__(self, url, source="", videoName="", quality="", youtube_watchurl=""):
        self.url = str(url)
        self.source = source
        self.videoName = videoName # Youtube links only
        self.quality = quality # Youtube links only
        self.youtube_watchurl = youtube_watchurl # Youtube links only

    def __repr__(self):
        return "<MetaUrl '%s' | %s>" % (self.url, self.source)


def search(song, n, processes=config.search_processes):
    '''
    Function searches song and returns n valid .mp3 links.
    @param song: Search string.
    @param n: Number of songs.
    @param processes: Number of processes to launch in the subprocessing pool.

    @return links: A list of up to n MetaUrl objects, taken round-robin from the sources.
    '''
    linksFromSources = []
    pool = multiprocessing.Pool(processes)

    args = [(song, source) for source in config.search_sources]
    imapObj = pool.imap_unordered(_parse_star, args)
    for i in range(len(args)):
        linksFromSources.append(NextList(imapObj.next(15)))
    pool.terminate()

    links = []
    next_source = 0
    # Round-robin over the sources until n links were collected or all sources ran dry.
    while len(links) < n and not all(x.isEOF() for x in linksFromSources):
        nextItem = linksFromSources[next_source].next()
        if nextItem:
            log.debug("added song %.80s from source ID %d (%s)" % (nextItem.url.split('/')[-1], next_source, nextItem.source))
            links.append(nextItem)

        if len(linksFromSources) == next_source + 1:
            next_source = 0
        else:
            next_source += 1

    return links

def _parse_star(args):
    # imap_unordered passes a single argument, so unpack the (song, source) tuple.
    return parse(*args)
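
# Minimal usage sketch: query the configured sources for a song and print
# the resulting MetaUrl objects (assumes config.search_sources lists names
# understood by parse(); the sites themselves may be long gone).
if __name__ == '__main__':
    for metaUrl in search("daft punk around the world", n=5):
        print metaUrl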