Advertisement
Guest User

Untitled

a guest
Dec 16th, 2019
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.16 KB | None | 0 0
  1. # https://github.com/oreilly-japan/deep-learning-from-scratch-2/blob/master/dataset/ptb.py
  2. 62 words = open(file_path).read().replace('\n', '<eos>').strip().split()
  3. 63
  4. 64 for i, word in enumerate(words):
  5. 65 if word not in word_to_id:
  6. 66 tmp_id = len(word_to_id)
  7. 67 word_to_id[word] = tmp_id
  8. 68 id_to_word[tmp_id] = word
  9.  
  10. # file_pathは1行1俳句,1文字毎に半角スペースで区切ったtxt形式
  11. ## ア ブ ラ ゼ ミ 狙 う B 2 9 の 鉛 筆
  12. ## 下 丸 子 三 の 三 十 花 の 天
  13. ## 以下略
  14. # 68行目の後にprint(word_to_id)で
  15. {'<eos>ア': 0, 'ブ': 1, ... '鉛': 11, '筆<eos>下': 12, ...}
  16. # となってしまうので62行目を以下のように編集
  # Build `words`: one token per character of the corpus at `file_path`
  # (one haiku per line, characters separated by single spaces).
  with open(file_path, 'r', encoding='utf-8') as f:
      words = []
      # Iterate the file handle directly; this yields the same lines as
      # readlines() without holding the whole file in a temporary list.
      for haiku in f:
          for char in haiku:
              if char == ' ':
                  # Spaces only separate characters; they are not tokens.
                  continue
              if char == '\n':
                  # A line break ends a haiku, so record it as '<eos>'.
                  char = '<eos>'
              words.append(char)
  27. # すると68行目の後にprint(word_to_id)で以下.
  28. {'<eos>': 0, 'ア': 1, 'ブ': 2, 'ラ': 3, 'ゼ': 4, 'ミ': 5, ...}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement