Want more features on Pastebin? Sign Up, it's FREE!
Guest

morphemes.py

By: a guest on Jun 29th, 2011  |  syntax: Python  |  size: 0.51 KB  |  views: 95  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import codecs
  2.  
  3. def file2ms( path, ws=None, bs=[u'記号'] ): # bs filters punctuation
  4.     #inp = unicode( open( path, 'r' ).read(), 'utf-8' )
  5.     #return getMorphemes1( inp, ws, bs)
  6.     f = codecs.open( path, 'r', 'utf-8' )
  7.     inp = f.readlines()
  8.     f.close()
  9.  
  10.     #return getMorphemes( mecab(None), e, ws, bs )
  11.     mcb = mecab(None)
  12.  
  13.     s = set()
  14.     for i in inp:
  15.         ms = getMorphemes( mcb, i.strip(), ws, bs)
  16.         for m in ms:
  17.             s.add(m)
  18.  
  19.     mcb.terminate()
  20.  
  21.     return list(s)
clone this paste RAW Paste Data