SHARE
TWEET

bio_amg.py

a guest Jun 21st, 2010 1,392 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2. import encodings.utf_8
  3. import sys, urllib, re
  4.  
  5. class MyOpener(urllib.FancyURLopener):
  6.     version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
  7.  
  8. def strip_white(stp):
  9.     stp = stp.replace('amp;','')
  10.     stp = re.sub('\W+?','', stp.lower())
  11.     return stp
  12.  
  13. def levenshtein(s1, s2):
  14.   l1 = len(s1)
  15.   l2 = len(s2)
  16.   matrix = [list(range(l1 + 1))] * (l2 + 1)
  17.   for zz in range(l2 + 1):
  18.     matrix[zz] = list(range(zz,zz + l1 + 1))
  19.   for zz in range(0,l2):
  20.     for sz in range(0,l1):
  21.       if s1[sz] == s2[zz]:
  22.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz])
  23.       else:
  24.         matrix[zz+1][sz+1] = min(matrix[zz+1][sz] + 1, matrix[zz][sz+1] + 1, matrix[zz][sz] + 1)
  25.   return matrix[l2][l1]
  26.  
  27. def re_parse(dt, rx):
  28.     match = re.search(rx, dt)
  29.     if match: return match.group(1)
  30.  
  31. def go(artist, album):
  32.     src = '/cg/amg.dll?p=amg&sql=%s&P=amg&opt1=2' % (urllib.quote(album))
  33.     rx_src = re.compile('(<tr class=\"visible\".*?<td class=\"cell\" style.*?>)(.*?)(<.*?;"><a href=\")(.*?)(\">)', re.M|re.S)
  34.     req = MyOpener().open(str(url + src)).read().decode('latin-1').encode('utf-8')
  35.     for match in rx_src.finditer(str(req)):
  36.         if strip_white(artist) == strip_white(match.group(2)) or levenshtein(strip_white(artist), strip_white(match.group(2))) <= 3:
  37.             global rel, link
  38.             link = match.group(4).replace('amp;','')
  39.             req = MyOpener().open(url + link).read().decode('latin-1').encode('utf-8')
  40.             rel = re.search('<!--Begin Content-->(.*?)<!--ADBANNER-->', req, re.S|re.M)
  41.             break
  42.  
  43. def get_review():
  44.     revw = re_parse(rel.group(1), '<td align=\"left\" class=\"title\">.*?\"author\">by\s.*?</td>.*?<p>(.*?)</p>')
  45.     if revw:
  46.         amg_review = re.sub('<.*?>', '', revw)
  47.         if amg_review.find('Read More...') > 0:
  48.             rx_more = re.compile('<td align="left" class="title">.*?"author">by\s.*?</td>.*?<p>(.*?)</p>', re.M|re.S)
  49.             more = re_parse(rel.group(1), '\.\.\. <a href="(.*?)\">Read More\.\.\.</a>')
  50.             more_review = MyOpener().open(url + more).read().decode('latin-1').encode('utf-8')
  51.             amg_review = re.sub('<.*?>', '', rx_more.search(more_review).group(1))
  52.         amg = amg_review + '\r\nAMG Review by ' + re_parse(rel.group(1), '<td align=\"left\" class=\"title\">.*?"author\">by\s.(.*?)<')
  53.         print amg
  54.  
  55. url = 'http://www.allmusic.com'
  56. rel = None;
  57.  
  58. try:
  59.     go(sys.argv[1], sys.argv[2])
  60.     if not rel == None: get_review()
  61. except Exception, e:
  62.     print e
RAW Paste Data
Top