Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- from sudachipy import tokenizer
- from sudachipy import dictionary
- from sudachipy import config
def get_token(path):
    """Extract content words (nouns, verbs, adjectives) from a Japanese text file.

    The file at *path* is read in full, tokenized with SudachiPy in split
    mode C (longest units), and each surface form is normalized.  Only
    tokens whose normalized form is tagged as a noun (名詞), verb (動詞),
    or adjective (形容詞) are kept, in dictionary form.

    Args:
        path: Path to a UTF-8 text file to tokenize.

    Returns:
        list[str]: Dictionary forms of the content words, in document order
        (duplicates preserved).
    """
    with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
        settings = json.load(f)
    tokenizer_obj = dictionary.Dictionary(settings).create()

    # Context manager guarantees the handle is closed even on error;
    # explicit encoding avoids platform-dependent defaults on Japanese text.
    with open(path, "r", encoding="utf-8") as test_data:
        source = test_data.read()

    mode = tokenizer.Tokenizer.SplitMode.C
    surfaces = [m.surface() for m in tokenizer_obj.tokenize(mode, source)]

    content_pos = {"名詞", "動詞", "形容詞"}  # noun, verb, adjective
    word_list = []
    for mrph in surfaces:
        if mrph == "":
            continue
        # Normalize the surface form, then re-tokenize the normalized word
        # ONCE and reuse that morpheme for both POS check and dictionary form
        # (the original tokenized it twice).
        norm_word = tokenizer_obj.tokenize(mode, mrph)[0].normalized_form()
        norm_morpheme = tokenizer_obj.tokenize(mode, norm_word)[0]
        if norm_morpheme.part_of_speech()[0] in content_pos:
            word_list.append(norm_morpheme.dictionary_form())
    return word_list
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement