SHARE
TWEET

AMG_Biography.py

a guest Jun 22nd, 2010 1,196 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2. import encodings.utf_8
  3. import sys, urllib, re
  4.  
  5. class MyOpener(urllib.FancyURLopener):
  6.     version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
  7.  
  8. def strip_white(stp):
  9.     stp = stp.replace('amp;','')
  10.     stp = re.sub('\W+?','', stp.lower())
  11.     return stp
  12.  
  13. def levenshtein(s1, s2):
  14.   l1 = len(s1)
  15.   l2 = len(s2)
  16.   matrix = [list(range(l1 + 1))] * (l2 + 1)
  17.   for zz in range(l2 + 1):
  18.     matrix[zz] = list(range(zz,zz + l1 + 1))
  19.   for zz in range(0,l2):
  20.     for sz in range(0,l1):
  21.       if s1[sz] == s2[zz]:
  22.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz])
  23.       else:
  24.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz] + 1)
  25.   return matrix[l2][l1]
  26.  
  27. def go(artist, album):
  28.     src = '/cg/amg.dll?p=amg&sql=%s&P=amg&opt1=1' % (urllib.quote(artist))
  29.     rx_src = re.compile('<tr class="visible".*?<td class="cell" style.*?<a href="(.*?)">(.*?)</a>', re.M|re.S|re.I)
  30.     req = MyOpener().open(url + src).read().decode('latin-1').encode('utf-8')
  31.     global rel
  32.     if req.find('~T2">Discography') > 0:
  33.         link = re.search('<a HREF="(.*?)">Discography</a>', req, re.I).group(1)
  34.         req = MyOpener().open(url + link).read().decode('latin-1').encode('utf-8')
  35.         if req.find(album) > 0:
  36.             req = MyOpener().open(url + link.replace('~T2','~T1')).read().decode('latin-1').encode('utf-8')
  37.             rel = re.search('<!--Begin Content-->(.*?)<!--ADBANNER-->', req, re.S|re.M)
  38.     else:
  39.         for match in rx_src.finditer(req):
  40.             if strip_white(artist) == strip_white(match.group(2)) or levenshtein(strip_white(artist), strip_white(match.group(2))) <= 3:
  41.                 link = match.group(1).replace('amp;','')
  42.                 req = MyOpener().open(url + link + '~T2').read().decode('latin-1').encode('utf-8')
  43.                 if req.find(album) > 0:
  44.                     req = MyOpener().open(url + link + '~T1').read().decode('latin-1').encode('utf-8')
  45.                     rel = re.search('<!--Begin Content-->(.*?)<!--ADBANNER-->', req, re.S|re.M)
  46.                     break
  47.  
  48. def get_bio():
  49.     bio = re.search('class="title">Biography</td><.*?>(by.*?)</td>.*?<p>(.*?)</p>', rel.group(1), re.M|re.S)
  50.     if bio:
  51.         amg_bio = re.sub('<.*?>', '', bio.group(2))
  52.         print amg_bio + '\r\n\r\nAMG Biography ' + bio.group(1).replace('  ',' ')
  53.  
  54. url = 'http://www.allmusic.com'
  55. rel = None;
  56.  
  57. try:
  58.     go(sys.argv[1], sys.argv[2])
  59.     if not rel == None: get_bio()
  60. except Exception, e:
  61.     print e
RAW Paste Data
Top