Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # https://github.com/oreilly-japan/deep-learning-from-scratch-2/blob/master/dataset/ptb.py
- 62 words = open(file_path).read().replace('\n', '<eos>').strip().split()
- 63
- 64 for i, word in enumerate(words):
- 65 if word not in word_to_id:
- 66 tmp_id = len(word_to_id)
- 67 word_to_id[word] = tmp_id
- 68 id_to_word[tmp_id] = word
- # file_pathは1行1俳句,1文字毎に半角スペースのtxt形式
- ## ア ブ ラ ゼ ミ 狙 う B 2 9 の 鉛 筆
- ## 下 丸 子 三 の 三 十 花 の 天
- ## 以下略
- # 68行目の後にprint(word_to_id)で
- {'<eos>ア': 0, 'ブ': 1, ... '鉛': 11, '筆<eos>下': 12, ...}
- # となってしまうので62行目を以下のように編集
# Build the token list one character at a time instead of splitting on
# whitespace. The input is expected to hold one haiku per line with a
# half-width space between characters, so a newline is not surrounded by
# spaces and a whitespace split would glue '<eos>' onto its neighbours.
with open(file_path, 'r', encoding='utf-8') as f:
    words = []
    # Iterate the file object directly; each item is one line (one haiku)
    # including its trailing newline, if present.
    for haiku in f:
        for char in haiku:
            if char == ' ':
                # Half-width spaces only separate characters; not tokens.
                continue
            if char == '\n':
                # A line break becomes its own standalone '<eos>' token.
                char = '<eos>'
            words.append(char)
- # すると68行目の後にprint(word_to_id)で以下.
- {'<eos>': 0, 'ア': 1, 'ブ': 2, 'ラ': 3, 'ゼ': 4, 'ミ': 5, ...}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement