Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs
def file2ms(path, ws=None, bs=[u'記号']):  # bs filters punctuation
    """Return the distinct morphemes found in the UTF-8 text file at ``path``.

    Every line of the file is stripped of surrounding whitespace and passed
    to ``getMorphemes`` together with ``ws`` and ``bs``; the results for all
    lines are merged and duplicates are dropped.

    Args:
        path: Path of the file to read; its contents are decoded as UTF-8.
        ws: Forwarded unchanged to ``getMorphemes``.
            NOTE(review): presumably a whitelist counterpart of ``bs`` --
            confirm against ``getMorphemes``.
        bs: Forwarded unchanged to ``getMorphemes``; the default filters
            punctuation. The default list object is shared between calls;
            this function never modifies it.

    Returns:
        A list of the unique morphemes, in arbitrary (set) order.
    """
    # The context manager closes the file even if reading/decoding fails.
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = f.readlines()

    mcb = mecab(None)
    try:
        unique = set()
        for line in lines:
            unique.update(getMorphemes(mcb, line.strip(), ws, bs))
    finally:
        # Always terminate the object returned by mecab(), even when
        # processing one of the lines raises.
        mcb.terminate()
    return list(unique)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement