StarDict extraction code

#!/usr/bin/env python
import codecs, re

# :: FilePath -> m Map Order (Expr,Read,Index)
def loadDatDict( filename, d=None ):
    """Parse one UTF-8 .dat word list and merge its entries into a dict.

    Every line holds two or three whitespace-separated fields: the word with
    furigana (optionally written ``expression^reading``), an optional plain
    form wrapped in one bracket character on each side, and an integer index.

    filename -- path of the .dat file to read.
    d        -- dict that receives the entries; a new one is made if omitted.

    Returns d, mapping each index to a dict with the keys 'id', 'word',
    'withFurigana', 'expression' and 'reading'.

    Raises AssertionError when a line has an unexpected number of fields or
    more than one '^', and ValueError when the index is not an integer.
    """
    # A literal {} default would be shared by every call that omits d.
    if d is None: d = {}
    with codecs.open( filename, encoding='utf-8' ) as f:
        lines = f.read().split('\n')
    # Discard only the empty tail produced by a trailing newline, so the last
    # record is kept when the file does not end with one.
    if lines and lines[-1] == '': lines.pop()
    for line in lines:
        fields = line.split() # word with furigana, [plain word], index
        assert len(fields) == 2 or len(fields) == 3
        wf = w = fields[0]
        i = int( fields[-1] )
        if len(fields) == 3: w = fields[1][1:-1] # strip the surrounding brackets
        e = r = wf # without a '^' the expression doubles as the reading
        halves = wf.split('^')
        assert len(halves) <= 2
        if len(halves) == 2: e, r = halves
        d[ i ] = { 'id':i, 'word':w, 'withFurigana':wf, 'expression':e, 'reading':r }
    return d

# :: Str -> Unsigned Int
def getUint( xs ): # network byte order
   """Decode four bytes as an unsigned big-endian integer.

   xs -- exactly four items: a 4-character string, or a bytes/bytearray of
         length 4.

   Raises ValueError when xs does not hold exactly four items.
   """
   # Iterating bytes on Python 3 (or a bytearray on any version) yields ints,
   # which ord() rejects, so convert only the items that are still characters.
   a, b, c, d = [ x if isinstance( x, int ) else ord( x ) for x in xs ]
   return a*256**3 + b*256**2 + c*256 + d

# :: FilePath -> Map Order IdxEntry
def loadIdx( path ):
   """Read an .idx file into a dict of index entries.

   The file is parsed as back-to-back records, each a NUL-terminated key
   followed by two 4-byte big-endian integers: data offset, then data size.

   path -- location of the .idx file.

   Returns a dict mapping each record's position in the file (0, 1, 2, ...)
   to {'key': ..., 'offset': ..., 'size': ...}. The key is kept exactly as
   read, without the terminator and without decoding.

   Raises ValueError when the data ends inside a record.
   """
   with open( path, 'rb' ) as f:
      b = f.read()
   d = {}
   s, maxS, i = 0, len(b), 0
   while s < maxS:
      nul = b.find( b'\x00', s )
      # Without this check a missing terminator gives find() == -1, which
      # rewinds the cursor to offset 8 and re-reads the file forever.
      if nul < 0:
         raise ValueError( 'unterminated key at byte %d of %s' % ( s, path ) )
      t = nul + 1
      word_str = b[s:nul] # null terminated utf-8 word_str (but we drop the null)
      word_data_offset = getUint( b[t:t+4] )
      word_data_size = getUint( b[t+4:t+8] )
      d[i] = { 'key':word_str, 'offset':word_data_offset, 'size':word_data_size }
      s = t+8
      i += 1
   return d

# :: DictStr -> IdxEntry -> DictData
def lookupDict( src, e ):
   """Return the part of src that the index entry e points at.

   src -- sliceable dictionary data.
   e   -- mapping with integer 'offset' and 'size' values.
   """
   start = e['offset']
   stop = start + e['size']
   return src[ start:stop ]

# :: Map Order IdxEntry -> FilePath -> Map Order IdxEntry'
def crossRef( idx, dic ):
   """Attach to every index entry the data it refers to.

   idx -- dict of index entries, each carrying 'offset' and 'size'.
   dic -- path of the data file those offsets point into.

   Stores the matching slice of the file under each entry's 'lookup' key and
   returns idx, which is modified in place.
   """
   # Read inside a with-block so the handle is closed immediately instead of
   # whenever the file object happens to be collected.
   with open( dic, 'rb' ) as f:
      data = f.read()
   for k in idx:
      idx[k]['lookup'] = lookupDict( data, idx[k] )
   return idx

# :: FilePath -> FilePath -> Map Order IdxEntry'
def load( idxPath, dictPath ):
   """Load an index file and pair each of its entries with its data.

   idxPath  -- path of the .idx file.
   dictPath -- path of the matching .dict file.

   Prints how many entries were loaded and returns the cross-referenced dict.
   """
   entries = crossRef( loadIdx( idxPath ), dictPath )
   # Single-argument parenthesised form: same output as the print statement.
   print( 'Loaded %d from %s' % ( len(entries), idxPath ) )
   return entries

def applyDat( dat, dic ):
   """Resolve each entry's comma-separated lookup text against dat.

   dat -- dict keyed by integer index; its values carry 'expression' and
          'reading'.
   dic -- dict of entries whose 'lookup' value is a comma-separated list of
          integers (surrounding whitespace is ignored).

   Each number is increased by one before it is used as a key into dat.
   Adds 'indicies', 'expressions' and 'readings' to every entry and returns
   dic, which is modified in place.
   """
   for entry in dic.values():
      numbers = entry['lookup'].strip().split(',')
      refs = []
      for number in numbers:
         refs.append( int(number) + 1 )
      entry['indicies'] = refs
      entry['expressions'] = [ dat[ref]['expression'] for ref in refs ]
      entry['readings'] = [ dat[ref]['reading'] for ref in refs ]
   return dic

def writeAudio( d, path ):
   """Write each entry's data to its own file inside a directory.

   d    -- dict of entries carrying 'key' (used as the file name) and
           'lookup' (the data to write).
   path -- directory to write into.

   Best effort: an entry that cannot be written is reported on stdout and
   the remaining entries are still processed.
   """
   # NOTE(review): the key is used as a file name unchecked; a key with '..'
   # components could resolve outside path -- confirm the index is trusted.
   for i in d:
      # Catch Exception rather than everything, so Ctrl-C and SystemExit
      # still stop the run instead of being reported as a failed write.
      try:
         with open( path+'/'+d[i]['key'], 'wb' ) as out:
            out.write( d[i]['lookup'] )
      except Exception:
         print( 'Failed to write %s, continuing...' % i )

# Merge the word lists A1.dat, B1.dat, A2.dat, ... B10.dat into one table.
dat = {}
for n in range(1,11):
   for m in 'AB':
      f = '%s%d.dat' % (m,n)
      loadDatDict( f, dat )
print( 'Loaded %d from .dat files' % len( dat ) )

# Dictionaries whose lookups are resolved against the word table.
da = load( 'JIB_ejA.idx', 'JIB_ejA.dict' )
applyDat( dat, da )
db = load( 'JIB_ejB.idx', 'JIB_ejB.dict' )
applyDat( dat, db )

# Dictionaries whose entries are written out as one file per key.
dw = load( 'headwords.idx', 'headwords.dict' )
writeAudio( dw, 'wordAudio' )
ds = load( 'sentences.idx', 'sentences.dict' )
writeAudio( ds, 'sentenceAudio' )

# Resource dictionaries: loaded only.
dar = load( 'JIB_ejA.res.idx', 'JIB_ejA.res.dict' )
dbr = load( 'JIB_ejB.res.idx', 'JIB_ejB.res.dict' )