Advertisement
Guest User

stardict extracting code

a guest
Jan 22nd, 2011
351
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.88 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import codecs, re
  3.  
  4. # :: FilePath -> m Map Order (Expr,Read,Index)
  5. def loadDatDict( filename, d={} ):
  6.     ls = codecs.open( filename, encoding='utf-8').read().split('\n')[:-1]
  7.     for l in ls:
  8.         ps = l.split() # First get word with furigana, without, and index
  9.         assert len(ps) == 2 or len(ps) == 3
  10.         wf = w = ps[0]
  11.         i = int( ps[-1] )
  12.         if len(ps) == 3: w = ps[1][1:][:-1] # extract from inside brackets
  13.         e = r = wf # Next separate furigana if applicable
  14.         ps = wf.split('^')
  15.         assert len(ps) <= 2
  16.         if len(ps) == 2: e,r = ps[0],ps[1]
  17.         d[ i ] = { 'id':i, 'word':w, 'withFurigana':wf, 'expression':e, 'reading':r }
  18.     return d
  19.  
  20. # :: Str -> Unsigned Int
  21. def getUint( xs ): # network byte order
  22.    [a,b,c,d] = [ ord(x) for x in xs ]
  23.    return a*256**3 + b*256**2 + c*256**1 + d*256**0
  24.  
  25. # :: Map Order IdxEntry
  26. def loadIdx( path ):
  27.    d = {}
  28.    b = open( path, 'rb' ).read()
  29.    s, maxS, i = 0, len(b), 0
  30.    while s < maxS:
  31.       t = b.find( '\x00', s ) +1
  32.       word_str = b[s:t-1] # null terminated utf-8 word_str (but we drop the null)
  33.       word_data_offset = getUint( b[t:t+4] )
  34.       word_data_size = getUint( b[t+4:t+8] )
  35.       d[i] = { 'key':word_str, 'offset':word_data_offset, 'size':word_data_size }
  36.       s = t+8
  37.       i += 1
  38.    return d
  39.  
  40. # :: DictStr -> IdxEntry -> DictData
  41. def lookupDict( src, e ): return src[ e['offset'] : e['offset']+e['size'] ]
  42.  
  43. # :: Map Order IdxEntry -> FilePath -> Map Order IdxEntry'
  44. def crossRef( idx, dic ):
  45.    dic = open( dic, 'rb' ).read()
  46.    for k in idx: idx[k]['lookup'] = lookupDict( dic, idx[k] )
  47.    return idx
  48.  
  49. # :: FilePath -> FilePath -> Map Order IdxEntry'
  50. def load( idxPath, dictPath ):
  51.    idx = loadIdx( idxPath )
  52.    d = crossRef( idx, dictPath )
  53.    print 'Loaded %d from %s' % ( len(d), idxPath )
  54.    return d
  55.  
  56. def applyDat( dat, dic ):
  57.    for k in dic:
  58.       ms = dic[k]['lookup'].strip().split(',')
  59.       dic[k]['indicies'] = [ int(m) +1 for m in ms ]
  60.       dic[k]['expressions'] = [ dat[i]['expression'] for i in dic[k]['indicies'] ]
  61.       dic[k]['readings'] = [ dat[i]['reading'] for i in dic[k]['indicies'] ]
  62.    return dic
  63.  
  64. def writeAudio( d, path ):
  65.    for i in d:
  66.       try: open( path+'/'+d[i]['key'], 'wb' ).write( d[i]['lookup'] )
  67.       except: print 'Failed to write %s, continuing...' % i
  68.  
  69. dat = {}
  70. for f in [ '%s%d.dat' % (m,n) for n in range(1,11) for m in 'AB' ]: loadDatDict(f, dat )
  71. print 'Loaded %d from .dat files' % len( dat )
  72. da = load( 'JIB_ejA.idx', 'JIB_ejA.dict' ); applyDat( dat, da )
  73. db = load( 'JIB_ejB.idx', 'JIB_ejB.dict' ); applyDat( dat, db )
  74. dw = load( 'headwords.idx', 'headwords.dict' ); writeAudio( dw, 'wordAudio' )
  75. ds = load( 'sentences.idx', 'sentences.dict' ); writeAudio( ds, 'sentenceAudio' )
  76. dar = load( 'JIB_ejA.res.idx', 'JIB_ejA.res.dict' )
  77. dbr = load( 'JIB_ejB.res.idx', 'JIB_ejB.res.dict' )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement