SHARE
TWEET

bio_gracenote.py

a guest Jul 2nd, 2010 2,647 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2. import encodings.utf_8
  3. import sys, urllib, re
  4.  
  5. class MyOpener(urllib.FancyURLopener):
  6.     version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
  7.  
  8. def strip_white(stp):
  9.     stp = stp.replace('amp;','')
  10.     stp = re.sub('\W+?','', stp.lower())
  11.     return stp
  12.  
  13. def levenshtein(s1, s2):
  14.   l1 = len(s1)
  15.   l2 = len(s2)
  16.   matrix = [list(range(l1 + 1))] * (l2 + 1)
  17.   for zz in range(l2 + 1):
  18.     matrix[zz] = list(range(zz,zz + l1 + 1))
  19.   for zz in range(0,l2):
  20.     for sz in range(0,l1):
  21.       if s1[sz] == s2[zz]:
  22.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz])
  23.       else:
  24.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz] + 1)
  25.   return matrix[l2][l1]
  26.  
  27. def go(artist):
  28.     req = MyOpener().open(url + '/search/?query=%s&search_type=artist' % (urllib.quote(artist))).read()
  29.     rx_src = re.compile('Artist:.*?artist_id=(.*?)">(.*?)</a>', re.M|re.S|re.I)
  30.     for match in rx_src.finditer(req):
  31.         if strip_white(artist) == strip_white(match.group(2)) or levenshtein(strip_white(artist), strip_white(match.group(2))) <= ld:
  32.             link = url + '/artist/artist.php?artist_id=' + match.group(1)
  33.             req = MyOpener().open(link).read()
  34.             bio = re.search('var originalBiography.*?"(.*?)";', req, re.M|re.S|re.I)
  35.             if bio: print re.sub('<.*?>', '', bio.group(1)).replace('\\','')
  36.             break
  37.  
  38. url = 'http://www.gracenote.com'
  39. ld = 3 # more info: http://en.wikipedia.org/wiki/Levenshtein_distance
  40.  
  41. try:
  42.     go(sys.argv[1])
  43. except Exception, e:
  44.     print e
RAW Paste Data
Top