Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import codecs, re
# :: FilePath -> m Map Order (Expr,Read,Index)
def loadDatDict(filename, d=None):
    """Parse a UTF-8 .dat word list into a dict keyed by entry index.

    Every non-blank line is split on whitespace and must have either two
    fields (``word index``) or three (``wordWithFurigana [word] index``).
    The first field may hold ``expression^reading``; without a ``^`` the
    whole field is used as both expression and reading.

    filename -- path of the .dat file to read.
    d        -- optional dict to merge the entries into; a fresh dict is
                created when it is omitted (a shared default dict would
                leak entries from one call into the next).

    Returns the dict, mapping index -> { 'id', 'word', 'withFurigana',
    'expression', 'reading' }.  Raises ValueError on a malformed line.
    """
    if d is None:
        d = {}
    with codecs.open(filename, encoding='utf-8') as f:
        ls = f.read().split('\n')
    for l in ls:
        ps = l.split()  # First get word with furigana, without, and index
        if not ps:
            # Blank line (typically the empty tail after the final newline).
            continue
        if len(ps) not in (2, 3):
            raise ValueError('%s: expected 2 or 3 fields, got %r' % (filename, l))
        wf = w = ps[0]
        i = int(ps[-1])
        if len(ps) == 3:
            w = ps[1][1:-1]  # extract from inside brackets
        e = r = wf  # Next separate furigana if applicable
        parts = wf.split('^')
        if len(parts) > 2:
            raise ValueError('%s: more than one "^" in %r' % (filename, wf))
        if len(parts) == 2:
            e, r = parts
        d[i] = {'id': i, 'word': w, 'withFurigana': wf, 'expression': e, 'reading': r}
    return d
# :: Str -> Unsigned Int
def getUint(xs):  # network byte order
    """Decode exactly four bytes as a big-endian unsigned 32-bit integer.

    xs -- a 4-byte string (Python 2 ``str`` or Python 3 ``bytes``).

    Raises ValueError when xs does not hold exactly four bytes.
    """
    # bytearray yields integer byte values on both Python 2 and Python 3,
    # whereas ord() over the input only works for Python 2 str.
    a, b, c, d = bytearray(xs)
    return (a << 24) | (b << 16) | (c << 8) | d
# :: Map Order IdxEntry
def loadIdx(path):
    """Read a binary index file into a dict of records keyed by position.

    The file is a sequence of records, each made of a NUL-terminated key
    followed by two big-endian unsigned 32-bit integers (offset, size).

    path -- path of the index file.

    Returns { n: { 'key', 'offset', 'size' } } where n counts records from
    0 and 'key' is the raw key bytes without the terminating NUL.
    Raises ValueError if a key has no terminator or a record is cut short
    (a missing terminator previously reset the cursor and looped forever).
    """
    import struct  # local: the file's import block does not provide it

    with open(path, 'rb') as f:
        b = f.read()
    d = {}
    s, maxS, i = 0, len(b), 0
    while s < maxS:
        nul = b.find(b'\x00', s)
        if nul < 0:
            raise ValueError('%s: unterminated key at byte %d' % (path, s))
        t = nul + 1
        if t + 8 > maxS:
            raise ValueError('%s: truncated record at byte %d' % (path, s))
        word_str = b[s:nul]  # null terminated utf-8 word_str (but we drop the null)
        word_data_offset, word_data_size = struct.unpack_from('>II', b, t)
        d[i] = {'key': word_str, 'offset': word_data_offset, 'size': word_data_size}
        s = t + 8
        i += 1
    return d
# :: DictStr -> IdxEntry -> DictData
def lookupDict(src, e):
    """Return the part of src that the index entry e points at.

    src -- the sliceable dictionary data.
    e   -- an index entry carrying 'offset' and 'size'.
    """
    start = e['offset']
    stop = start + e['size']
    return src[start:stop]
# :: Map Order IdxEntry -> FilePath -> Map Order IdxEntry'
def crossRef(idx, dic):
    """Attach to every index entry the data it points at in a dict file.

    idx -- index entries, each carrying 'offset' and 'size'.
    dic -- path of the data file the offsets refer to.

    Each entry gains a 'lookup' key holding its slice of the file (see
    lookupDict).  idx is modified in place and also returned.
    """
    # Read the whole file once and close it promptly.
    with open(dic, 'rb') as f:
        data = f.read()
    for entry in idx.values():
        entry['lookup'] = lookupDict(data, entry)
    return idx
# :: FilePath -> FilePath -> Map Order IdxEntry'
def load(idxPath, dictPath):
    """Load an index file and resolve each entry against its data file.

    idxPath  -- path of the index file (parsed by loadIdx).
    dictPath -- path of the data file (sliced by crossRef).

    Prints the number of entries loaded and returns the index dict, each
    entry carrying 'key', 'offset', 'size' and 'lookup'.
    """
    idx = loadIdx(idxPath)
    d = crossRef(idx, dictPath)
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print('Loaded %d from %s' % (len(d), idxPath))
    return d
def applyDat(dat, dic):
    """Resolve each entry's comma-separated 'lookup' against dat.

    dat -- mapping of index -> { 'expression', 'reading', ... }.
    dic -- entries whose 'lookup' is a comma-separated list of integers
           (text, or ASCII bytes as read from a binary file).

    Each number is incremented by one to obtain the key used in dat.
    Every entry gains 'indicies' (the adjusted numbers), 'expressions'
    and 'readings'.  Empty fields (blank lookup, trailing comma) are
    skipped instead of failing in int().  dic is modified in place and
    also returned.  Raises KeyError if an index is missing from dat.
    """
    for entry in dic.values():
        raw = entry['lookup']
        # On Python 3 data sliced from a binary file is bytes; on Python 2
        # bytes is str, so this branch is skipped there.
        if isinstance(raw, bytes) and not isinstance(raw, str):
            raw = raw.decode('ascii')
        indices = [int(m) + 1 for m in raw.strip().split(',') if m.strip()]
        entry['indicies'] = indices
        entry['expressions'] = [dat[i]['expression'] for i in indices]
        entry['readings'] = [dat[i]['reading'] for i in indices]
    return dic
def writeAudio(d, path):
    """Write every entry's 'lookup' data to a file named after its 'key'.

    d    -- entries carrying 'key' (the file name) and 'lookup' (the
            bytes to write).
    path -- existing directory the files are created in.

    Best effort: an entry whose file cannot be written is reported on
    stdout and skipped; the remaining entries are still written.
    """
    for i in d:
        try:
            name = d[i]['key']
            # Keys read from a binary index are bytes on Python 3 and
            # cannot be joined to a str path; on Python 2 bytes is str
            # and the key is used unchanged.
            if isinstance(name, bytes) and not isinstance(name, str):
                name = name.decode('utf-8')
            with open(path + '/' + name, 'wb') as f:
                f.write(d[i]['lookup'])
        except (IOError, OSError, UnicodeError):
            # Only I/O and file-name failures are tolerated, so that
            # KeyboardInterrupt and programming errors still surface.
            print('Failed to write %s, continuing...' % i)
# Script body: merge the word lists A1.dat, B1.dat, ... A10.dat, B10.dat
# into one dict, then load each index/data pair from the current directory.
dat = {}
for f in ['%s%d.dat' % (m, n) for n in range(1, 11) for m in 'AB']:
    loadDatDict(f, dat)
print('Loaded %d from .dat files' % len(dat))

# The JIB_ejA / JIB_ejB lookups are index lists resolved against dat.
da = load('JIB_ejA.idx', 'JIB_ejA.dict')
applyDat(dat, da)
db = load('JIB_ejB.idx', 'JIB_ejB.dict')
applyDat(dat, db)

# Each headword / sentence entry is written out as its own file; the
# target directories must already exist (failed writes are only reported).
dw = load('headwords.idx', 'headwords.dict')
writeAudio(dw, 'wordAudio')
ds = load('sentences.idx', 'sentences.dict')
writeAudio(ds, 'sentenceAudio')

# Loaded but not processed further in this script.
dar = load('JIB_ejA.res.idx', 'JIB_ejA.res.dict')
dbr = load('JIB_ejB.res.idx', 'JIB_ejB.res.dict')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement