Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import os, subprocess, sys, bz2
# Probe for the in-process MeCab bindings. `hasmecab` records whether the
# import worked; mecab() and interact() consult it / the MeCab module to choose
# between the bindings and an external `mecab` process.
try:
    import MeCab
except ImportError:
    # Only a failed import means "bindings unavailable"; anything else
    # (KeyboardInterrupt, SystemExit, ...) must not be swallowed here.
    hasmecab = False
else:
    hasmecab = True
# TODO: update db format to know about sources

################################################################################
## Lexical analysis
################################################################################
def mecab( fixPath=r'C:\Program Files\Anki\mecab' ): # :: Maybe Path -> IO MecabProc
    """Create the handle used to talk to mecab.

    Returns None when the MeCab python bindings were imported (`hasmecab` is
    true); interact() treats a falsy handle as "use the bindings".  Otherwise
    a `mecab` subprocess is started with piped stdin/stdout (stderr merged
    into stdout) and the Popen object is returned.

    fixPath -- mecab install directory.  When truthy, its `bin` directory is
               added to PATH and MECABRC is pointed at `etc/mecabrc` inside
               it.  Pass None to leave the environment untouched.
    """
    if hasmecab:
        return None

    # `si` is handed to Popen as `startupinfo`; it is optional and comes from
    # the `japanese` package.  NOTE(review): presumably a STARTUPINFO that
    # hides the console window on Windows -- confirm.  The import is
    # best-effort, but must not swallow KeyboardInterrupt/SystemExit.
    try:
        from japanese.reading import si
    except Exception:
        si = None

    if fixPath:
        binDir = os.path.join( fixPath, 'bin' )
        curPath = os.environ.get( 'PATH', '' )
        # Use the platform's PATH separator and do not append the same
        # directory again on every call.
        if binDir not in curPath.split( os.pathsep ):
            os.environ['PATH'] = ( curPath + os.pathsep + binDir ) if curPath else binDir
        os.environ['MECABRC'] = os.path.join( fixPath, 'etc', 'mecabrc' )

    # One '\r'-terminated record per morpheme (surface, feature 0, feature 1,
    # feature 8) and a single '\n' at the end of each sentence.
    mecabCmd = ['mecab', '--node-format=%m\t%f[0]\t%f[1]\t%f[8]\r', '--eos-format=\n', '--unk-format=%m\tUnknown\tUnknown\tUnknown\r']
    return subprocess.Popen( mecabCmd, bufsize=-1, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, startupinfo=si )
def escape( s ): # Str -> Str
    """Hook applied to text before it is sent to mecab.

    The default is the identity; replace this function to pre-process
    expressions.
    """
    return s
def interact( p, expr ): # MecabProc -> Str -> IO Str
    """Send mecab one input and return its one output as unicode.

    p    -- handle from mecab(): a Popen object, or a falsy value to analyse
            in-process with MeCab.Tagger.
    expr -- text to analyse.  It is passed through escape() and encoded as
            utf-8; characters that cannot be encoded are dropped.

    The result is a sequence of records "surface\\tf0\\tf1\\tf8" where
    f0/f1/f8 are features 0, 1 and 8 of the morpheme, or the literal
    "Unknown" when they are not available.  Records are '\\r'-separated.
    """
    expr = escape( expr ).encode( 'utf-8', 'ignore' )
    if p:
        p.stdin.write( expr + '\n' )
        p.stdin.flush()
        # Read back one output line for every '\n'-separated line that was sent.
        return u'\n'.join( [ unicode( p.stdout.readline().rstrip( '\r\n' ), 'utf-8' ) for _ in expr.split('\n') ] )

    o = []
    m = MeCab.Tagger()
    n = m.parseToNode( expr )
    n = n.next # skip the first node (NOTE(review): presumably the BOS marker -- confirm)
    while n:
        n2 = n.feature.split(',')
        try:
            o.append( "%s\t%s\t%s\t%s\r" % (n.surface, n2[0], n2[1], n2[8]) )
        except IndexError:
            # Fewer than 9 features: emit the same placeholder record the
            # subprocess uses for unknown words.
            o.append( "%s\tUnknown\tUnknown\tUnknown\r" % n.surface )
        n = n.next
    return u'\r'.join( unicode( l, 'utf-8' ) for l in o )
# MecabProc -> Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
def getMorphemes( p, e, ws=None, bs=None ):
    """Analyse `e` with interact() and return its morphemes as 4-tuples.

    p  -- mecab handle, passed straight to interact().
    e  -- expression to analyse.
    ws -- optional whitelist: keep only morphemes whose second field is in it.
    bs -- optional blacklist: drop morphemes whose second field is in it.

    Records that do not split into exactly four tab-separated fields are
    discarded.
    """
    morphemes = []
    for record in interact( p, e ).split( '\r' ):
        fields = tuple( record.split( '\t' ) )
        if len( fields ) != 4:
            continue
        if ws and fields[1] not in ws:
            continue
        if bs and fields[1] in bs:
            continue
        morphemes.append( fields )
    return morphemes
# Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
def getMorphemes1( e, ws=None, bs=None ):
    """One-shot variant of getMorphemes() that manages its own mecab handle.

    A handle is obtained with mecab(None) (environment left untouched) and,
    when it is a real subprocess, shut down again before returning so that
    repeated calls do not leave mecab processes and pipes behind.
    """
    p = mecab( None )
    try:
        return getMorphemes( p, e, ws, bs )
    finally:
        if p:
            # Closes stdin, drains stdout and waits for the process to exit.
            p.communicate()
################################################################################
## Morpheme db manipulation
################################################################################
def ms2db( ms, srcName='unknown' ): # :: [Morpheme] -> Maybe SrcName -> Map Morpheme Int
    """Build a db mapping each morpheme in `ms` to its number of occurrences.

    `srcName` is accepted but not used by this implementation.
    """
    counts = {}
    for morpheme in ms:
        counts[morpheme] = counts.get( morpheme, 0 ) + 1
    return counts
def saveDbC( db, path ): # :: Map Morpheme Int -> Path -> IO ()
    """Write `db` to `path` as bz2-compressed db2str() output."""
    # Serialize before opening so a failure cannot leave a truncated file,
    # and close the handle deterministically.
    data = bz2.compress( db2str( db ) )
    with open( path, 'wb' ) as f:
        f.write( data )
def loadDbC( path ): # :: Path -> IO Map Morpheme Int
    """Load a bz2-compressed db from `path`.

    The decompressed utf-8 text holds one entry per line: four tab-separated
    morpheme fields followed by an integer count.  Blank lines are skipped,
    so an empty db (or a trailing newline) loads cleanly.  A non-blank line
    that does not have exactly five fields raises ValueError.
    """
    with open( path, 'rb' ) as f:
        buf = bz2.decompress( f.read() ).decode( 'utf-8' )
    db = {}
    for row in buf.split( '\n' ):
        if not row:
            continue
        a, b, c, d, i = row.split( '\t' )
        db[ (a, b, c, d) ] = int( i )
    return db
# uncompressed/human readable versions
def saveDbU( db, path ): # :: Map Morpheme Int -> Path -> IO ()
    """Write `db` to `path` as plain db2str() output."""
    # Serialize before opening so a failure cannot leave a truncated file,
    # and close the handle deterministically.
    data = db2str( db )
    with open( path, 'wb' ) as f:
        f.write( data )
def loadDbU( path ): # :: Path -> IO Map Morpheme Int
    """Load an uncompressed db from `path`.

    The utf-8 text holds one entry per line: four tab-separated morpheme
    fields followed by an integer count.  Blank lines are skipped, so an
    empty db (or a trailing newline) loads cleanly.  A non-blank line that
    does not have exactly five fields raises ValueError.
    """
    with open( path, 'rb' ) as f:
        buf = f.read().decode( 'utf-8' )
    db = {}
    for row in buf.split( '\n' ):
        if not row:
            continue
        a, b, c, d, i = row.split( '\t' )
        db[ (a, b, c, d) ] = int( i )
    return db
# Default load/save pair: the uncompressed, human readable format.
loadDb, saveDb = loadDbU, saveDbU
# :: [Morpheme] -> Str
def ms2str( ms ):
    """Serialize morphemes as utf-8 text: tab-joined fields, one morpheme per line."""
    lines = []
    for m in ms:
        lines.append( u'\t'.join( m ) )
    text = u'\n'.join( lines )
    return text.encode( 'utf-8' )
# :: Map Morpheme Int -> Str
def db2str( db ):
    """Serialize a db as utf-8 text.

    Each entry becomes one line: the morpheme's fields joined by tabs,
    followed by a tab and the count.  Lines are joined with '\\n' (no
    trailing newline).
    """
    rows = []
    for m, i in db.items():
        rows.append( u'%s\t%d' % ( u'\t'.join( m ), i ) )
    return u'\n'.join( rows ).encode( 'utf-8' )
def diffDb( da, db ):
    """Compare the morpheme sets of two dbs.

    Morphemes whose second field is 記号 or 助詞 are left out of both sets
    before comparing.  Counts are ignored; only the keys matter.

    Returns a 6-tuple of sets:
    (A, B, A & B, A - B, B - A, symmetric difference of A and B).
    """
    ignored = [u'記号',u'助詞']
    sa = set( m for m in da.keys() if m[1] not in ignored )
    sb = set( m for m in db.keys() if m[1] not in ignored )
    return ( sa, sb, sa & sb, sa - sb, sb - sa, sa ^ sb )
def countByType( ms ):
    """Count morphemes grouped by their second field (index 1).

    Returns a dict mapping each distinct m[1] value to how many morphemes in
    `ms` carry it.
    """
    tally = {}
    for m in ms:
        kind = m[1]
        tally[kind] = tally.get( kind, 0 ) + 1
    return tally
def analyze( ms ):
    """Summarize a morpheme list.

    Returns a dict with two keys: 'posBreakdown' (the countByType() result)
    and 'count' (the number of morphemes).
    """
    return {
        'posBreakdown': countByType( ms ),
        'count': len( ms ),
    }
def analyze2str( ms ):
    """Render the analyze() summary of `ms` as human readable text.

    One line per 'posBreakdown' entry: count, percentage of the total
    (formatted with %d, so truncated to an integer) and the entry's key,
    tab-separated.
    """
    stats = analyze( ms )
    total = stats['count']
    rows = []
    for pos, cnt in stats['posBreakdown'].items():
        rows.append( '%d\t%d%%\t%s' % ( cnt, 100.*cnt/total, pos ) )
    posStr = u'\n'.join( rows )
    # NOTE: the label's spelling ("spech") is kept so the emitted text is unchanged.
    return '''Total morphemes: %d
By part of spech:
%s
''' % ( total, posStr )
def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
    """Read the utf-8 text file at `path` and return its morphemes.

    ws -- optional whitelist, forwarded to getMorphemes1().
    bs -- blacklist, forwarded to getMorphemes1(); the default drops 記号
          (punctuation).  The default list is only read, never modified.
    """
    # Close the input file deterministically instead of leaving it to the
    # garbage collector.
    with open( path, 'r' ) as f:
        raw = f.read()
    inp = unicode( raw, 'utf-8' )
    return getMorphemes1( inp, ws, bs )
def file2db( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
    """Build a morpheme-count db from the text file at `path`.

    The arguments are forwarded to file2ms(); the resulting morphemes are
    counted with ms2db().
    """
    return ms2db( file2ms( path, ws, bs ) )
def mergeDbs( a, b ): # :: Map Morpheme Int -> Map Morpheme Int -> Map Morpheme Int
    """Return a new db holding the summed counts of `a` and `b`.

    A morpheme present in only one input keeps its count from that input;
    one present in both gets the sum.  Neither input is modified.
    """
    merged = {}
    for src in ( a, b ):
        for m, i in src.items():
            # Start from 0 so the first occurrence contributes its real count
            # (previously it was recorded as 1, losing the count).
            merged[m] = merged.get( m, 0 ) + i
    return merged
def mergeFiles( aPath, bPath, destPath ):
    """Load the dbs at `aPath` and `bPath`, merge them and save the result to `destPath`."""
    first = loadDb( aPath )
    second = loadDb( bPath )
    saveDb( mergeDbs( first, second ), destPath )
################################################################################
## Standalone program
################################################################################
def test():
    """Ad-hoc check: diff 'known.morphdb' against 'fsnE01.morphdb'.

    Writes the shared morphemes to 'inter.txt' and the morphemes found only
    in the second db to 'B-A.txt', each followed by a per-type breakdown.
    """
    def f( xs ):
        # One "type: n/N = p%" line per distinct m[1] value in xs.
        N = len(xs)
        return u'\n'.join( [ '%s: %d/%d = %d%%' % (t,n,N,100*n/N) for t,n in countByType( xs ).items() ] ).encode('utf-8')
    k = loadDb('known.morphdb')
    fsn = loadDb('fsnE01.morphdb')
    sa,sb,i,AmB,BmA,sd = diffDb( k, fsn )
    # NOTE(review): no newline separates the morpheme list from the breakdown,
    # so the last morpheme line and the first breakdown line run together.
    for path, ms in ( ('inter.txt', i), ('B-A.txt', BmA) ):
        data = ms2str( list( ms ) ) + f( ms )
        # Close each output file deterministically.
        with open( path, 'wb' ) as out:
            out.write( data )
def test2():
    """Ad-hoc check: merge the kore sentence/word dbs and compare with 'dbs/known.db'.

    Prints how many morphemes are missing from either side, whether the key
    sets match and whether the total counts match, then saves the merged db
    to 'dbs/koreAll.db'.
    """
    known = loadDb('dbs/known.db')
    kall = mergeDbs( loadDb('dbs/koreSentences.db'), loadDb('dbs/koreWords.db') )
    onlyInOne = diffDb( known, kall )[5] # the symmetric difference
    print( '# missing from either: %d' % len( onlyInOne ) )
    print( 'Same morphemes? %s' % ( set( known.keys() ) == set( kall.keys() ) ) )
    print( 'Same # occurances? %s' % ( sum( known.values() ) == sum( kall.values() ) ) )
    saveDb( kall, 'dbs/koreAll.db' )
def main(): # :: IO ()
    """Command line entry point: `prog srcFile destFile`.

    Extracts the morphemes of srcFile and saves their count db to
    destFile + '.db'.  With any other number of arguments a usage line is
    printed and nothing else happens.
    """
    if len( sys.argv ) == 3:
        srcFile, destFile = sys.argv[1], sys.argv[2]
        saveDb( ms2db( file2ms( srcFile ) ), destFile + '.db' ) # save .db
    else:
        print( 'Usage: %s srcFile destFile' % sys.argv[0] )

if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement