Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- from sudachipy import tokenizer
- from sudachipy import dictionary
- from sudachipy import config
def get_token(path):
    """Extract content words (nouns, verbs, adjectives) from a Japanese text file.

    The file at *path* is read in full, tokenized with SudachiPy in split
    mode C (longest units), and each surface form is normalized.  Only
    tokens whose normalized form is tagged as a noun (名詞), verb (動詞),
    or adjective (形容詞) are kept, in dictionary form.

    Args:
        path: Path to a UTF-8 text file to tokenize.

    Returns:
        list[str]: Dictionary forms of the content words, in document order
        (duplicates preserved).
    """
    with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
        settings = json.load(f)
    tokenizer_obj = dictionary.Dictionary(settings).create()

    # Context manager guarantees the handle is closed even on error;
    # explicit encoding avoids platform-dependent defaults on Japanese text.
    with open(path, "r", encoding="utf-8") as test_data:
        source = test_data.read()

    mode = tokenizer.Tokenizer.SplitMode.C
    surfaces = [m.surface() for m in tokenizer_obj.tokenize(mode, source)]

    content_pos = {"名詞", "動詞", "形容詞"}  # noun, verb, adjective
    word_list = []
    for mrph in surfaces:
        if mrph == "":
            continue
        # Normalize the surface form, then re-tokenize the normalized word
        # ONCE and reuse that morpheme for both POS check and dictionary form
        # (the original tokenized it twice).
        norm_word = tokenizer_obj.tokenize(mode, mrph)[0].normalized_form()
        norm_morpheme = tokenizer_obj.tokenize(mode, norm_word)[0]
        if norm_morpheme.part_of_speech()[0] in content_pos:
            word_list.append(norm_morpheme.dictionary_form())
    return word_list
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement