Advertisement
Guest User

Untitled

a guest
Jun 15th, 2011
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.63 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import os, subprocess, sys, bz2
  3. try:
  4. import MeCab
  5. hasmecab = True
  6. except:
  7. hasmecab = False
  8.  
  9. #TODO: update db format to know about sources
  10.  
  11. ################################################################################
  12. ## Lexical analysis
  13. ################################################################################
  14.  
  15. # Creates an instance of mecab process
  16. def mecab( fixPath=r'C:\Program Files\Anki\mecab' ): # :: Maybe Path -> IO MecabProc
  17. if hasmecab:
  18. return None
  19. else:
  20. try: from japanese.reading import si
  21. except: si = None
  22. if fixPath:
  23. os.environ['PATH'] += ';%s\\bin' % fixPath
  24. os.environ['MECABRC'] = '%s\\etc\\mecabrc' % fixPath
  25. mecabCmd = ['mecab', '--node-format=%m\t%f[0]\t%f[1]\t%f[8]\r', '--eos-format=\n', '--unk-format=%m\tUnknown\tUnknown\tUnknown\r']
  26. return subprocess.Popen( mecabCmd, bufsize=-1, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, startupinfo=si )
  27.  
  28. # Used to escape all strings interacting with mecab (useful for hooking)
  29. def escape( s ): return s # Str -> Str
  30.  
  31. # Send mecab 1 input and receive 1 output
  32. def interact( p, expr ): # MecabProc -> Str -> IO Str
  33. expr = escape( expr ).encode( 'utf-8', 'ignore' )
  34. if p:
  35. p.stdin.write( expr + '\n' )
  36. p.stdin.flush()
  37. return u'\n'.join( [ unicode( p.stdout.readline().rstrip( '\r\n' ), 'utf-8' ) for l in expr.split('\n') ] )
  38. else:
  39. o = []
  40. m = MeCab.Tagger()
  41. n = m.parseToNode(expr)
  42. n = n.next
  43. while n:
  44. n2 = n.feature.split(',')
  45. try:
  46. o.append("%s\t%s\t%s\t%s\r" % (n.surface, n2[0], n2[1], n2[8]))
  47. except:
  48. o.append("%s\tUnknown\tUnknown\tUnknown\r" % n.surface)
  49. n = n.next
  50.  
  51. return u'\r'.join(unicode(l, 'utf-8') for l in o)
  52.  
  53.  
  54. # MecabProc -> Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
  55. def getMorphemes( p, e, ws=None, bs=None ):
  56. ms = [ tuple( m.split('\t') ) for m in interact( p, e ).split('\r') ] # morphemes
  57. ms = [ m for m in ms if len( m ) == 4 ]
  58. if ws: ms = [ m for m in ms if m[1] in ws ]
  59. if bs: ms = [ m for m in ms if not m[1] in bs ]
  60. return ms
  61.  
  62. # Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
  63. def getMorphemes1( e, ws=None, bs=None ): return getMorphemes( mecab(None), e, ws, bs )
  64.  
  65. ################################################################################
  66. ## Morpheme db manipulation
  67. ################################################################################
  68.  
  69. def ms2db( ms, srcName='unknown' ): # :: [Morpheme] -> Maybe SrcName -> Map Morpheme Int
  70. d = {}
  71. for m in ms:
  72. if m in d: d[m] += 1
  73. else: d[m] = 1
  74. return d
  75.  
  76. def saveDbC( db, path ): open( path, 'wb' ).write( bz2.compress( db2str( db ) ) ) # :: Map Morpheme Int -> Path -> IO ()
  77. def loadDbC( path ):
  78. buf = bz2.decompress( open( path, 'rb' ).read() ).decode('utf-8')
  79. return dict([ ((a,b,c,d),int(i)) for (a,b,c,d,i) in [ row.split('\t') for row in buf.split('\n') ]])
  80.  
  81. # uncompressed/human readable versions
  82. def saveDbU( db, path ): open( path, 'wb' ).write( db2str( db ) ) # :: Map Morpheme Int -> Path -> IO ()
  83. def loadDbU( path ): # :: Path -> IO Map Morpheme Int
  84. buf = open( path, 'rb' ).read().decode('utf-8')
  85. return dict([ ((a,b,c,d),int(i)) for (a,b,c,d,i) in [ row.split('\t') for row in buf.split('\n') ]])
  86.  
  87. loadDb = loadDbU
  88. saveDb = saveDbU
  89.  
  90. # :: [Morpheme] -> Str
  91. def ms2str( ms ): return u'\n'.join( [ u'\t'.join(m) for m in ms ] ).encode('utf-8')
  92. # :: Map Morpheme Int -> Str
  93. def db2str( db ): return u'\n'.join( [ u'\t'.join(m) + u'\t%d' % i for (m, i) in db.iteritems() ] ).encode('utf-8')
  94.  
  95. def diffDb( da, db ):
  96. def f( xs, bs=[u'記号',u'助詞'] ): return [ x for x in xs if not x[1] in bs ]
  97.  
  98. sa, sb = set( f(da.keys()) ), set( f(db.keys()) )
  99. i = sa.intersection( sb )
  100. AmB = sa.difference( sb )
  101. BmA = sb.difference( sa )
  102. sd = sa.symmetric_difference( sb )
  103. return (sa,sb,i,AmB,BmA,sd)
  104.  
  105. def countByType( ms ):
  106. d = {}
  107. for m in ms:
  108. try: d[ m[1] ] += 1
  109. except KeyError: d[ m[1] ] = 1
  110. return d
  111.  
  112. def analyze( ms ):
  113. d = {}
  114. d['posBreakdown'] = countByType( ms )
  115. d['count'] = len( ms )
  116. return d
  117. def analyze2str( ms ):
  118. d = analyze( ms )
  119. posStr = u'\n'.join( [ '%d\t%d%%\t%s' % ( v, 100.*v/d['count'], k ) for k,v in d['posBreakdown'].iteritems() ] )
  120. return '''Total morphemes: %d
  121. By part of spech:
  122. %s
  123. ''' % ( d['count'], posStr )
  124.  
  125. def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  126. inp = unicode( open( path, 'r' ).read(), 'utf-8' )
  127. return getMorphemes1( inp, ws, bs)
  128.  
  129. def file2db( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  130. ms = file2ms( path, ws, bs )
  131. return ms2db( ms )
  132.  
  133. def mergeDbs( a, b ): # :: Map Morpheme Int -> Map Morpheme Int -> Map Morpheme Int
  134. D = {}
  135. for (m,i) in a.iteritems():
  136. try: D[m] += i
  137. except KeyError: D[m] = 1
  138. for (m,i) in b.iteritems():
  139. try: D[m] += i
  140. except KeyError: D[m] = 1
  141. return D
  142.  
  143. def mergeFiles( aPath, bPath, destPath ):
  144. a, b = loadDb( aPath ), loadDb( bPath )
  145. c = mergeDbs( a, b )
  146. saveDb( c, destPath )
  147.  
  148. ################################################################################
  149. ## Standalone program
  150. ################################################################################
  151.  
  152. def test():
  153. def f( xs ):
  154. N = len(xs)
  155. return u'\n'.join( [ '%s: %d/%d = %d%%' % (t,n,N,100*n/N) for t,n in countByType( xs ).items() ] ).encode('utf-8')
  156. k = loadDb('known.morphdb')
  157. fsn = loadDb('fsnE01.morphdb')
  158. sa,sb,i,AmB,BmA,sd = diffDb( k, fsn )
  159. open('inter.txt','wb').write( ms2str(list( i ) ) + f(i) )
  160. open('B-A.txt','wb').write( ms2str(list( BmA ) ) + f(BmA) )
  161.  
  162. def test2():
  163. k = loadDb('dbs/known.db')
  164. ks = loadDb('dbs/koreSentences.db')
  165. kw = loadDb('dbs/koreWords.db')
  166. kall = mergeDbs( ks, kw )
  167. sa,sb,i,AmB,BmA,sd = diffDb( k, kall )
  168. print '# missing from either:', len(sd)
  169. print 'Same morphemes?', set(k.keys()) == set(kall.keys())
  170. print 'Same # occurances?', sum(k.values()) == sum(kall.values())
  171. saveDb( kall, 'dbs/koreAll.db' )
  172.  
  173. def main(): # :: IO ()
  174. if len( sys.argv ) != 3:
  175. print 'Usage: %s srcFile destFile' % sys.argv[0]
  176. return
  177. ms = file2ms( sys.argv[1] )
  178. #open( sys.argv[2]+'.morphemes', 'w' ).write( ms2str( ms ) ) # save .morphemes
  179. saveDb( ms2db( ms ), sys.argv[2]+'.db' ) # save .db
  180.  
  181. if __name__ == '__main__': main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement