Advertisement
Guest User

Untitled

a guest
Mar 20th, 2019
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.90 KB | None | 0 0
  1. import json
  2.  
  3. from sudachipy import tokenizer
  4. from sudachipy import dictionary
  5. from sudachipy import config
  6.  
  7. def get_token(path):
  8. with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
  9. settings = json.load(f)
  10. tokenizer_obj = dictionary.Dictionary(settings).create()
  11.  
  12. test_data = open(path, "r")
  13.  
  14. source = test_data.read()
  15.  
  16. test_data.close()
  17.  
  18. mode = tokenizer.Tokenizer.SplitMode.C
  19. result = [m.surface() for m in tokenizer_obj.tokenize(mode,source)]
  20.  
  21. word_list = []
  22. for mrph in result:
  23. if not (mrph == ""):
  24. norm_word = tokenizer_obj.tokenize(mode,mrph)[0].normalized_form()
  25. hinsi = tokenizer_obj.tokenize(mode,norm_word)[0].part_of_speech()[0]
  26.  
  27. if hinsi in ["名詞", "動詞", "形容詞"]:
  28. word = tokenizer_obj.tokenize(mode,norm_word)[0].dictionary_form()
  29. word_list.append(word)
  30.  
  31. return word_list
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement