import codecs


def file2ms(path, ws=None, bs=[u'記号']):  # bs filters punctuation
    """Return the distinct morphemes found in the UTF-8 text file at ``path``.

    The file is read line by line; each line is stripped of surrounding
    whitespace and handed to ``getMorphemes`` together with a single analyzer
    instance created via ``mecab(None)``.  Everything ``getMorphemes`` yields
    is collected into a set, so duplicates across lines are dropped.

    Args:
        path: Path of the text file to analyze; it is decoded as UTF-8.
        ws: Passed through unchanged to ``getMorphemes`` (presumably a
            whitelist; confirm against ``getMorphemes``).
        bs: Passed through unchanged to ``getMorphemes`` (per the original
            note it filters out punctuation).  The default list object is
            shared between calls, so it must not be mutated by the callee.

    Returns:
        A list of the unique items produced by ``getMorphemes``.  The order
        is arbitrary because the items pass through a set.
    """
    # Read everything first so the file handle is released before the
    # analyzer is started; ``with`` closes it even if decoding fails.
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = f.readlines()

    mcb = mecab(None)
    seen = set()
    try:
        for line in lines:
            seen.update(getMorphemes(mcb, line.strip(), ws, bs))
    finally:
        # Always release the analyzer, even when a line fails to parse.
        mcb.terminate()
    return list(seen)