SHARE
TWEET

morphemes.py

a guest Jun 29th, 2011 101 Never
  1. import codecs
  2.  
  3. def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  4.     #inp = unicode( open( path, 'r' ).read(), 'utf-8' )
  5.     #return getMorphemes1( inp, ws, bs)
  6.     f = codecs.open( path, 'r', 'utf-8' )
  7.     inp = f.readlines()
  8.     f.close()
  9.  
  10.     #return getMorphemes( mecab(None), e, ws, bs )
  11.     mcb = mecab(None)
  12.  
  13.     s = set()
  14.     for i in inp:
  15.         ms = getMorphemes( mcb, i.strip(), ws, bs)
  16.         for m in ms:
  17.             s.add(m)
  18.  
  19.     mcb.terminate()
  20.  
  21.     return list(s)
RAW Paste Data
Top