Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import MeCab

# Shared tagger used by get_nouns(); MeCab's -d option selects the dictionary
# directory, here the mecab-ipadic-neologd dictionary.
tagger = MeCab.Tagger('-d /usr/lib64/mecab/dic/mecab-ipadic-neologd/')

# Parts of speech to ignore in the parse result: 記号 (symbols) and the
# BOS/EOS sentence-boundary nodes.
# NOTE: these literals were mojibake ('č¨ĺˇ') in the pasted source, which could
# never equal the first field of a feature string; restored to proper UTF-8.
ignore_parts_of_speech = ('記号', 'BOS/EOS')

# Parts of speech to adopt from the parse result: 名詞 (nouns).
# NOTE: restored from the mojibake literal 'ĺčŠ' for the same reason.
adopt_parts_of_speech = ('名詞',)
def get_nouns(line):
    """Collect the surface forms of the adopted tokens found in *line*.

    The text is parsed with the module-level ``tagger`` and the resulting
    node chain is walked from start to end. A node is kept when the first
    comma-separated field of its ``feature`` string is listed in
    ``adopt_parts_of_speech`` and is not listed in
    ``ignore_parts_of_speech``.

    Args:
        line: Text handed to ``tagger.parseToNode``.

    Returns:
        A list holding ``node.surface.encode('utf-8')`` for every kept node,
        in parse order.

    Any ``Exception`` raised while handling a single node is printed with an
    ``Error:`` prefix and that node is skipped; the walk then continues.
    """
    surfaces = []
    cursor = tagger.parseToNode(line)
    while cursor:
        try:
            # Only the leading field of the feature string is compared
            # against the adopt/ignore tuples.
            leading_field = cursor.feature.split(',')[0]
            if (leading_field not in adopt_parts_of_speech
                    or leading_field in ignore_parts_of_speech):
                continue
            surfaces.append(cursor.surface.encode('utf-8'))
        except Exception as e:
            print("Error:", e)
        finally:
            # Runs on every path (kept, skipped, or failed) so the loop
            # always advances to the next node.
            cursor = cursor.next
    return surfaces
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement