Advertisement
Guest User

Untitled

a guest
Dec 5th, 2016
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.38 KB | None | 0 0
  1. # en2ja_data_download.shで落としてきたデータから英日翻訳の訓練データを作成するスクリプト
  2.  
  3. import random
  4.  
  5. # 和文と英文を取り出す
  6. en_sentences, ja_sentences = {}, {}
  7. lines = open('sentences.csv').read().split('\n')
  8. for i in range(len(lines)):
  9. if lines[i] == '':
  10. continue
  11. no, lang, sentence = lines[i].split('\t')
  12. if lang == 'eng':
  13. en_sentences[no] = sentence
  14. elif lang == 'jpn':
  15. ja_sentences[no] = sentence
  16.  
  17. import MeCab
  18. mecab = MeCab.Tagger("-Owakati")
  19.  
  20. # 対訳データを作る
  21. lines = open('jpn_indices.csv').read().split('\n')
  22.  
  23. # 今回はデータサイズを制限する
  24. random.shuffle(lines)
  25. data_size = 2500
  26.  
  27. with open('en.txt', 'w') as en_file, open('ja.txt', 'w') as ja_file:
  28. for i in range(len(lines)):
  29. if lines[i] == '':
  30. continue
  31. ja_no, en_no, _ = lines[i].split('\t')
  32. if en_no == -1:
  33. continue
  34. if en_no not in en_sentences:
  35. continue
  36. if ja_no not in ja_sentences:
  37. continue
  38.  
  39. # 英文は小文字に
  40. en_file.write(en_sentences[en_no].lower())
  41. en_file.write('\n')
  42.  
  43. # 和文は分かち書きにする
  44. ja_file.write(mecab.parse(ja_sentences[ja_no]))
  45.  
  46. # 必要なデータサイズに達したので終了
  47. data_size -= 1
  48. if data_size <= 0:
  49. break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement