This week only. Pastebin PRO Accounts Christmas Special! Don't miss out! Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Jun 15th, 2011  |  syntax: None  |  size: 6.63 KB  |  views: 33  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. # -*- coding: utf-8 -*-
  2. import os, subprocess, sys, bz2
  3. try:
  4.     import MeCab
  5.     hasmecab = True
  6. except:
  7.     hasmecab = False
  8.  
  9. #TODO: update db format to know about sources
  10.  
  11. ################################################################################
  12. ## Lexical analysis
  13. ################################################################################
  14.  
  15. # Creates an instance of mecab process
  16. def mecab( fixPath=r'C:\Program Files\Anki\mecab' ): # :: Maybe Path -> IO MecabProc
  17.     if hasmecab:
  18.         return None
  19.     else:
  20.         try: from japanese.reading import si
  21.         except: si = None
  22.         if fixPath:
  23.             os.environ['PATH'] += ';%s\\bin' % fixPath
  24.             os.environ['MECABRC'] = '%s\\etc\\mecabrc' % fixPath
  25.         mecabCmd = ['mecab', '--node-format=%m\t%f[0]\t%f[1]\t%f[8]\r', '--eos-format=\n', '--unk-format=%m\tUnknown\tUnknown\tUnknown\r']
  26.         return subprocess.Popen( mecabCmd, bufsize=-1, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, startupinfo=si )
  27.  
  28. # Used to escape all strings interacting with mecab (useful for hooking)
  29. def escape( s ): return s # Str -> Str
  30.  
  31. # Send mecab 1 input and receive 1 output
  32. def interact( p, expr ): # MecabProc -> Str -> IO Str
  33.     expr = escape( expr ).encode( 'utf-8', 'ignore' )
  34.     if p:
  35.         p.stdin.write( expr + '\n' )
  36.         p.stdin.flush()
  37.         return u'\n'.join( [ unicode( p.stdout.readline().rstrip( '\r\n' ), 'utf-8' ) for l in expr.split('\n') ] )
  38.     else:
  39.         o = []
  40.         m = MeCab.Tagger()
  41.         n = m.parseToNode(expr)
  42.         n = n.next
  43.         while n:
  44.             n2 = n.feature.split(',')
  45.             try:
  46.                 o.append("%s\t%s\t%s\t%s\r" % (n.surface, n2[0], n2[1], n2[8]))
  47.             except:
  48.                 o.append("%s\tUnknown\tUnknown\tUnknown\r" % n.surface)
  49.             n = n.next
  50.  
  51.         return u'\r'.join(unicode(l, 'utf-8') for l in o)
  52.  
  53.  
  54. # MecabProc -> Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
  55. def getMorphemes( p, e, ws=None, bs=None ):
  56.     ms = [ tuple( m.split('\t') ) for m in interact( p, e ).split('\r') ] # morphemes
  57.     ms = [ m for m in ms if len( m ) == 4 ]
  58.     if ws: ms = [ m for m in ms if m[1] in ws ]
  59.     if bs: ms = [ m for m in ms if not m[1] in bs ]
  60.     return ms
  61.  
  62. # Str -> Maybe PosWhiteList -> Maybe PosBlackList -> IO [Morpheme]
  63. def getMorphemes1( e, ws=None, bs=None ): return getMorphemes( mecab(None), e, ws, bs )
  64.  
  65. ################################################################################
  66. ## Morpheme db manipulation
  67. ################################################################################
  68.  
  69. def ms2db( ms, srcName='unknown' ): # :: [Morpheme] -> Maybe SrcName -> Map Morpheme Int
  70.     d = {}
  71.     for m in ms:
  72.         if m in d: d[m] += 1
  73.         else: d[m] = 1
  74.     return d
  75.  
  76. def saveDbC( db, path ): open( path, 'wb' ).write( bz2.compress( db2str( db ) ) ) # :: Map Morpheme Int -> Path -> IO ()
  77. def loadDbC( path ):
  78.     buf = bz2.decompress( open( path, 'rb' ).read() ).decode('utf-8')
  79.     return dict([ ((a,b,c,d),int(i)) for (a,b,c,d,i) in [ row.split('\t') for row in buf.split('\n') ]])
  80.  
  81. # uncompressed/human readable versions
  82. def saveDbU( db, path ): open( path, 'wb' ).write( db2str( db ) ) # :: Map Morpheme Int -> Path -> IO ()
  83. def loadDbU( path ): # :: Path -> IO Map Morpheme Int
  84.     buf = open( path, 'rb' ).read().decode('utf-8')
  85.     return dict([ ((a,b,c,d),int(i)) for (a,b,c,d,i) in [ row.split('\t') for row in buf.split('\n') ]])
  86.  
  87. loadDb = loadDbU
  88. saveDb = saveDbU
  89.  
  90. # :: [Morpheme] -> Str
  91. def ms2str( ms ): return u'\n'.join( [ u'\t'.join(m) for m in ms ] ).encode('utf-8')
  92. # :: Map Morpheme Int -> Str
  93. def db2str( db ): return u'\n'.join( [ u'\t'.join(m) + u'\t%d' % i for (m, i) in db.iteritems() ] ).encode('utf-8')
  94.  
  95. def diffDb( da, db ):
  96.     def f( xs, bs=[u'記号',u'助詞'] ): return [ x for x in xs if not x[1] in bs ]
  97.  
  98.     sa, sb = set( f(da.keys()) ), set( f(db.keys()) )
  99.     i = sa.intersection( sb )
  100.     AmB = sa.difference( sb )
  101.     BmA = sb.difference( sa )
  102.     sd = sa.symmetric_difference( sb )
  103.     return (sa,sb,i,AmB,BmA,sd)
  104.  
  105. def countByType( ms ):
  106.     d = {}
  107.     for m in ms:
  108.         try: d[ m[1] ] += 1
  109.         except KeyError: d[ m[1] ] = 1
  110.     return d
  111.  
  112. def analyze( ms ):
  113.     d = {}
  114.     d['posBreakdown'] = countByType( ms )
  115.     d['count'] = len( ms )
  116.     return d
  117. def analyze2str( ms ):
  118.     d = analyze( ms )
  119.     posStr = u'\n'.join( [ '%d\t%d%%\t%s' % ( v, 100.*v/d['count'], k ) for k,v in d['posBreakdown'].iteritems() ] )
  120.     return '''Total morphemes: %d
  121. By part of spech:
  122. %s
  123. ''' % ( d['count'], posStr )
  124.  
  125. def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  126.     inp = unicode( open( path, 'r' ).read(), 'utf-8' )
  127.     return getMorphemes1( inp, ws, bs)
  128.  
  129. def file2db( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  130.     ms = file2ms( path, ws, bs )
  131.     return ms2db( ms )
  132.  
  133. def mergeDbs( a, b ): # :: Map Morpheme Int -> Map Morpheme Int -> Map Morpheme Int
  134.    D = {}
  135.    for (m,i) in a.iteritems():
  136.       try: D[m] += i
  137.       except KeyError: D[m] = 1
  138.    for (m,i) in b.iteritems():
  139.       try: D[m] += i
  140.       except KeyError: D[m] = 1
  141.    return D
  142.  
  143. def mergeFiles( aPath, bPath, destPath ):
  144.    a, b = loadDb( aPath ), loadDb( bPath )
  145.    c = mergeDbs( a, b )
  146.    saveDb( c, destPath )
  147.  
  148. ################################################################################
  149. ## Standalone program
  150. ################################################################################
  151.  
  152. def test():
  153.     def f( xs ):
  154.         N = len(xs)
  155.         return u'\n'.join( [ '%s: %d/%d = %d%%' % (t,n,N,100*n/N) for t,n in countByType( xs ).items() ] ).encode('utf-8')
  156.     k = loadDb('known.morphdb')
  157.     fsn = loadDb('fsnE01.morphdb')
  158.     sa,sb,i,AmB,BmA,sd = diffDb( k, fsn )
  159.     open('inter.txt','wb').write( ms2str(list( i ) ) + f(i) )
  160.     open('B-A.txt','wb').write( ms2str(list( BmA ) ) + f(BmA) )
  161.  
  162. def test2():
  163.    k = loadDb('dbs/known.db')
  164.    ks = loadDb('dbs/koreSentences.db')
  165.    kw = loadDb('dbs/koreWords.db')
  166.    kall = mergeDbs( ks, kw )
  167.    sa,sb,i,AmB,BmA,sd = diffDb( k, kall )
  168.    print '# missing from either:', len(sd)
  169.    print 'Same morphemes?', set(k.keys()) == set(kall.keys())
  170.    print 'Same # occurances?', sum(k.values()) == sum(kall.values())
  171.    saveDb( kall, 'dbs/koreAll.db' )
  172.  
  173. def main(): # :: IO ()
  174.     if len( sys.argv ) != 3:
  175.         print 'Usage: %s srcFile destFile' % sys.argv[0]
  176.         return
  177.     ms = file2ms( sys.argv[1] )
  178.     #open( sys.argv[2]+'.morphemes', 'w' ).write( ms2str( ms ) ) # save .morphemes
  179.     saveDb( ms2db( ms ), sys.argv[2]+'.db' ) # save .db
  180.  
  181. if __name__ == '__main__': main()
clone this paste RAW Paste Data