Advertisement
Guest User

Untitled

a guest
Feb 21st, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.80 KB | None | 0 0
  1. import MeCab
  2.  
  # Shared MeCab tagger; the -d option points MeCab at the NEologd
  # dictionary directory installed on this machine.
  tagger = MeCab.Tagger('-d /usr/lib64/mecab/dic/mecab-ipadic-neologd/')
  # Parts of speech to drop from the parse result.
  ignore_parts_of_speech = ('記号', 'BOS/EOS')
  # Parts of speech to keep from the parse result.
  adopt_parts_of_speech = ('名詞',)
  6.  
  def get_nouns(line):
      """Return the surfaces of the adopted tokens found in *line*.

      The text is parsed with the module-level ``tagger``. A token is kept
      when the first comma-separated field of its feature string is listed
      in ``adopt_parts_of_speech`` and not in ``ignore_parts_of_speech``.

      Args:
          line: Text handed to ``tagger.parseToNode``.

      Returns:
          A list with ``node.surface.encode('utf-8')`` for every kept token,
          in parse order. The list is empty when nothing matches.
      """
      token_list = []
      node = tagger.parseToNode(line)
      while node:
          try:
              # Split the feature string once and reuse the leading field
              # for both membership checks.
              part_of_speech = node.feature.split(',')[0]
              if (part_of_speech in adopt_parts_of_speech
                      and part_of_speech not in ignore_parts_of_speech):
                  token_list.append(node.surface.encode('utf-8'))
          except Exception as e:
              # Best effort: report the failing node and keep walking.
              print("Error:", e)
          finally:
              # Always advance, even after an error, so the loop terminates.
              node = node.next

      return token_list
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement