#!/usr/bin/env python
import codecs, re
# :: FilePath -> m Map Order (Expr,Read,Index)
def loadDatDict(filename, d=None):
    """Parse a UTF-8 ``.dat`` word list into a dict keyed by entry index.

    Every line holds two or three whitespace-separated fields: the word
    (optionally ``expression^reading``), an optional bracketed plain form of
    the word, and a trailing integer index.

    Args:
        filename: path of the ``.dat`` file to read.
        d: optional dict to merge the entries into; an existing entry with
            the same index is overwritten.  A fresh dict is created when
            omitted (a shared ``{}`` default would leak entries between
            unrelated calls).

    Returns:
        ``d``, mapping each int index to a dict with the keys ``id``,
        ``word``, ``withFurigana``, ``expression`` and ``reading``.

    Raises:
        AssertionError: a line does not have 2 or 3 fields, or its first
            field contains more than one ``^``.
        ValueError: the last field of a line is not an integer.
    """
    if d is None:
        d = {}

    with codecs.open(filename, encoding='utf-8') as fh:
        lines = fh.read().split('\n')
    # A trailing newline leaves one empty element behind; drop only that, so
    # a file without a final newline does not lose its last record.
    if lines[-1] == '':
        lines.pop()

    for line in lines:
        # First get word with furigana, without, and index.
        fields = line.split()
        assert len(fields) in (2, 3), 'expected 2 or 3 fields: %r' % line
        with_furigana = word = fields[0]
        index = int(fields[-1])
        if len(fields) == 3:
            # Middle field is the plain word wrapped in one bracket
            # character on each side; strip both.
            word = fields[1][1:-1]

        # Next separate furigana if applicable.
        expression = reading = with_furigana
        parts = with_furigana.split('^')
        assert len(parts) <= 2, 'more than one "^" in %r' % with_furigana
        if len(parts) == 2:
            expression, reading = parts

        d[index] = {
            'id': index,
            'word': word,
            'withFurigana': with_furigana,
            'expression': expression,
            'reading': reading,
        }
    return d
# :: Str -> Unsigned Int
def getUint(xs):  # network byte order
    """Decode exactly four bytes as an unsigned big-endian 32-bit integer.

    Args:
        xs: a 4-byte string (``str`` on Python 2, ``bytes`` or ``bytearray``
            elsewhere).

    Returns:
        The integer value, most significant byte first.

    Raises:
        ValueError: ``xs`` is not exactly four bytes long.
    """
    # bytearray yields integer octets for every byte-string type, whereas
    # ord() only works when iteration produces 1-character strings.
    octets = bytearray(xs)
    if len(octets) != 4:
        raise ValueError('expected exactly 4 bytes, got %d' % len(octets))
    a, b, c, d = octets
    return (a << 24) | (b << 16) | (c << 8) | d
# :: FilePath -> Map Order IdxEntry
def loadIdx(path):
    """Read a binary ``.idx`` file into a dict keyed by record position.

    The file is a sequence of records, each consisting of a NUL-terminated
    key followed by two unsigned big-endian 32-bit integers: the offset and
    the size of the key's data.

    Args:
        path: path of the index file.

    Returns:
        A dict mapping the 0-based record number to
        ``{'key': <raw key bytes without the NUL>, 'offset': int, 'size': int}``.
        The key is kept as raw bytes; it is not decoded here.

    Raises:
        ValueError: a record has no NUL terminator or its 8-byte trailer is
            cut short.
    """
    import struct  # local import: not part of this file's import header

    with open(path, 'rb') as fh:
        raw = fh.read()

    trailer = struct.Struct('>II')  # offset, size
    entries = {}
    pos, end = 0, len(raw)
    while pos < end:
        nul = raw.find(b'\x00', pos)
        if nul < 0:
            # find() returning -1 used to rewind the cursor to byte 8 and
            # loop forever; treat it as a malformed file instead.
            raise ValueError('%s: record %d at byte %d has no NUL terminator'
                             % (path, len(entries), pos))
        tail = nul + 1
        if tail + trailer.size > end:
            raise ValueError('%s: record %d at byte %d is truncated'
                             % (path, len(entries), pos))
        offset, size = trailer.unpack_from(raw, tail)
        entries[len(entries)] = {
            'key': raw[pos:nul],
            'offset': offset,
            'size': size,
        }
        pos = tail + trailer.size
    return entries
# :: DictStr -> IdxEntry -> DictData
def lookupDict(src, e):
    """Return the slice of ``src`` described by an index entry.

    Args:
        src: sliceable dictionary data (the contents of a ``.dict`` file).
        e: mapping with integer ``'offset'`` and ``'size'`` items.

    Returns:
        ``src[offset:offset + size]``.
    """
    start = e['offset']
    stop = start + e['size']
    return src[start:stop]
# :: Map Order IdxEntry -> FilePath -> Map Order IdxEntry'
def crossRef(idx, dic):
    """Attach to every index entry the data it points at in a ``.dict`` file.

    Args:
        idx: dict of index entries, each carrying ``'offset'`` and
            ``'size'``; it is modified in place.
        dic: path of the ``.dict`` file the offsets refer to.

    Returns:
        ``idx`` itself, each entry extended with a ``'lookup'`` item holding
        the corresponding slice of the file contents.
    """
    # Read the whole file once and close the handle deterministically.
    with open(dic, 'rb') as fh:
        blob = fh.read()
    for entry in idx.values():
        entry['lookup'] = lookupDict(blob, entry)
    return idx
# :: FilePath -> FilePath -> Map Order IdxEntry'
def load(idxPath, dictPath):
    """Load an index file and resolve every entry against its dict file.

    Args:
        idxPath: path of the ``.idx`` file.
        dictPath: path of the matching ``.dict`` file.

    Returns:
        The cross-referenced index dict produced by ``crossRef``.  The
        number of loaded entries is reported on stdout.
    """
    entries = crossRef(loadIdx(idxPath), dictPath)
    print('Loaded %d from %s' % (len(entries), idxPath))
    return entries
def applyDat(dat, dic):
    """Resolve each entry's comma-separated index list against ``dat``.

    Args:
        dat: dict mapping int index to a word record with ``'expression'``
            and ``'reading'`` items.
        dic: dict of entries whose ``'lookup'`` item is a comma-separated
            list of integers; modified in place.

    Returns:
        ``dic``, each entry extended with ``'indicies'`` (every listed
        number plus one), ``'expressions'`` and ``'readings'``.
    """
    for entry in dic.values():
        tokens = entry['lookup'].strip().split(',')
        # Listed numbers are shifted by one before being used as dat keys
        # (presumably 0-based references to 1-based ids -- confirm).
        ids = [int(tok) + 1 for tok in tokens]
        entry['indicies'] = ids
        entry['expressions'] = [dat[n]['expression'] for n in ids]
        entry['readings'] = [dat[n]['reading'] for n in ids]
    return dic
def writeAudio(d, path):
    """Write each entry's ``'lookup'`` data to ``path/<key>``, best effort.

    Args:
        d: dict of entries carrying ``'key'`` (used as the file name) and
            ``'lookup'`` (the data to write).
        path: existing directory that receives the files.

    An entry that cannot be written is reported on stdout and skipped; the
    remaining entries are still processed.
    """
    for i in d:
        try:
            # The context manager closes the handle even if write() fails.
            with open(path + '/' + d[i]['key'], 'wb') as out:
                out.write(d[i]['lookup'])
        except Exception:
            # Stay best-effort for ordinary failures, but let
            # KeyboardInterrupt / SystemExit through so the run can be
            # aborted (a bare ``except:`` swallowed those too).
            print('Failed to write %s, continuing...' % i)
# --- Driver: runs at import time and reads fixed file names from the CWD ---

# Merge A1, B1, A2, B2, ... A10, B10 into one word table; later files
# overwrite earlier entries that share an index.
dat = {}
for n in range(1, 11):
    for m in 'AB':
        f = '%s%d.dat' % (m, n)
        loadDatDict(f, dat)
print('Loaded %d from .dat files' % len(dat))

# The two dictionaries are cross-referenced against the word table.
da = load('JIB_ejA.idx', 'JIB_ejA.dict')
applyDat(dat, da)
db = load('JIB_ejB.idx', 'JIB_ejB.dict')
applyDat(dat, db)

# Headword and sentence payloads are dumped as one file per key.
dw = load('headwords.idx', 'headwords.dict')
writeAudio(dw, 'wordAudio')
ds = load('sentences.idx', 'sentences.dict')
writeAudio(ds, 'sentenceAudio')

# Resource dictionaries are loaded but not processed further in this file.
dar = load('JIB_ejA.res.idx', 'JIB_ejA.res.dict')
dbr = load('JIB_ejB.res.idx', 'JIB_ejB.res.dict')