# AMG_Biography.py

# -*- coding: utf-8 -*-
import encodings.utf_8
import sys, urllib, re

class MyOpener(urllib.FancyURLopener):
    """URL opener that identifies itself with a Firefox 3 User-Agent string.

    Used for every HTTP request made by this script instead of the default
    urllib opener.
    """
    # urllib's opener classes send the ``version`` attribute as the
    # User-Agent header of each request.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'

def strip_white(stp):
    """Reduce a name to a bare lowercase key suitable for comparison.

    Every literal ``amp;`` is removed first (so the ``&`` left over from an
    HTML ``&amp;`` entity is dropped together with the other punctuation),
    the text is lowercased, and finally every non-word character is deleted.

    Returns the normalised string.
    """
    without_entity_tail = stp.replace('amp;', '')
    # The lazy ``+?`` matches exactly one character at a time; the net effect
    # is simply that all non-word characters disappear.
    return re.sub(r'\W+?', '', without_entity_tail.lower())

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time:
    ``previous`` holds the distances for the prefix of ``s2`` processed so
    far, ``current`` is the row being built for one more element of ``s2``.
    """
    width = len(s1)
    # Row 0: turning the empty prefix of s2 into s1[:k] costs k insertions.
    previous = list(range(width + 1))
    for row_no, ch2 in enumerate(s2):
        # Column 0: turning s2[:row_no + 1] into the empty string.
        current = [row_no + 1]
        for col, ch1 in enumerate(s1):
            if ch1 == ch2:
                diagonal = previous[col]
            else:
                diagonal = previous[col] + 1
            current.append(min(current[col] + 1,       # step from the left
                               previous[col + 1] + 1,  # step from above
                               diagonal))              # match / substitute
        previous = current
    return previous[width]

def _fetch(path):
    """Download ``url + path`` and return the body re-encoded as UTF-8.

    The response is decoded as Latin-1 and encoded to UTF-8, exactly as every
    request in this script has always done. The response handle is closed
    explicitly instead of being left to the garbage collector.
    """
    handle = MyOpener().open(url + path)
    try:
        raw = handle.read()
    finally:
        handle.close()
    return raw.decode('latin-1').encode('utf-8')


def _find_content(page):
    """Return the regex match for the main content region of ``page``, or None."""
    return re.search('<!--Begin Content-->(.*?)<!--ADBANNER-->', page, re.S|re.M)


def go(artist, album):
    """Look up ``artist`` on the site and store the matching page in ``rel``.

    The artist name is submitted to the site's search URL. Two result shapes
    are handled:

    * The response already contains a ``Discography`` link (presumably the
      search landed directly on an artist page): the discography page
      (``~T2``) is fetched and, if it mentions ``album``, the ``~T1`` page is
      fetched and its content region is stored.
    * Otherwise the response is treated as a list of search hits. Each hit
      whose normalised name equals the normalised ``artist``, or is within a
      Levenshtein distance of 3 of it, is tried in order; the first one whose
      ``~T2`` page mentions ``album`` wins.

    The result is written to the module-level ``rel`` (a match object whose
    group 1 is the content region, or None if the content markers are
    missing). ``rel`` is left untouched when nothing matches. Returns None.

    Network errors raised by the opener propagate to the caller.
    """
    global rel
    search_page = _fetch('/cg/amg.dll?p=amg&sql=%s&P=amg&opt1=1' % (urllib.quote(artist)))

    # NOTE: the ``find(...) > 0`` tests are kept as-is on purpose; besides
    # ignoring a hit at offset 0 they make an empty ``album`` never match.
    if search_page.find('~T2">Discography') > 0:
        link_match = re.search('<a HREF="(.*?)">Discography</a>', search_page, re.I)
        if link_match is None:
            # The marker was present but the anchor markup was not what we
            # expected; give up quietly instead of failing on ``.group``.
            return
        link = link_match.group(1)
        if _fetch(link).find(album) > 0:
            rel = _find_content(_fetch(link.replace('~T2', '~T1')))
        return

    rx_src = re.compile('<tr class="visible".*?<td class="cell" style.*?<a href="(.*?)">(.*?)</a>', re.M|re.S|re.I)
    wanted = strip_white(artist)  # loop-invariant, computed once
    for match in rx_src.finditer(search_page):
        candidate = strip_white(match.group(2))
        # Equality is checked first only as a cheap shortcut; equal names
        # have distance 0 and would pass the distance test anyway.
        if wanted != candidate and levenshtein(wanted, candidate) > 3:
            continue
        link = match.group(1).replace('amp;', '')
        if _fetch(link + '~T2').find(album) > 0:
            rel = _find_content(_fetch(link + '~T1'))
            break

def get_bio():
    """Print the biography found in the page content stored in ``rel``.

    Searches group 1 of the module-level ``rel`` match for the biography
    title cell, the following cell starting with ``by`` and the first
    paragraph after it. When found, the paragraph is printed with its markup
    tags removed, followed by a blank line and an ``AMG Biography by ...``
    credit line. Nothing is printed when the section is not found.
    """
    found = re.search('class="title">Biography</td><.*?>(by.*?)</td>.*?<p>(.*?)</p>', rel.group(1), re.M|re.S)
    if not found:
        return
    # Drop every tag from the biography paragraph.
    text = re.sub('<.*?>', '', found.group(2))
    # Collapse double spaces in the credit ("by  Someone" -> "by Someone").
    byline = found.group(1).replace('  ', ' ')
    print(text + '\r\n\r\nAMG Biography ' + byline)

url = 'http://www.allmusic.com'
rel = None;

try:
    go(sys.argv[1], sys.argv[2])
    if not rel == None: get_bio()
except Exception, e:
    print e