Advertisement
Guest User

morphemes.py

a guest
Jun 29th, 2011
157
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.51 KB | None | 0 0
  1. import codecs
  2.  
  3. def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  4.     #inp = unicode( open( path, 'r' ).read(), 'utf-8' )
  5.     #return getMorphemes1( inp, ws, bs)
  6.     f = codecs.open( path, 'r', 'utf-8' )
  7.     inp = f.readlines()
  8.     f.close()
  9.  
  10.     #return getMorphemes( mecab(None), e, ws, bs )
  11.     mcb = mecab(None)
  12.  
  13.     s = set()
  14.     for i in inp:
  15.         ms = getMorphemes( mcb, i.strip(), ws, bs)
  16.         for m in ms:
  17.             s.add(m)
  18.  
  19.     mcb.terminate()
  20.  
  21.     return list(s)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement