Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import random
- from janome.tokenizer import Tokenizer
- # Janomeを使用してテキストデータを単語に分割する
# Characters removed before tokenizing: line feed, carriage return, Japanese
# corner brackets, and parentheses. The original code listed the parenthesis
# pair twice, presumably for the half-width and full-width forms, so both
# forms are stripped here.
_WAKATI_STRIP_TABLE = str.maketrans('', '', '\n\r「」()（）')


def wakati(text):
    """Split text into words with Janome after stripping noise characters.

    Line breaks (``\\n`` and ``\\r``), corner brackets and parentheses are
    deleted from ``text`` in a single pass, then the cleaned text is passed
    to a freshly constructed Janome ``Tokenizer`` in wakati mode.

    Args:
        text: The raw text to tokenize.

    Returns:
        Whatever ``Tokenizer.tokenize(..., wakati=True)`` returns: an
        iterable of word strings. Depending on the installed Janome version
        this may be a generator rather than a list, so callers that need to
        iterate more than once should materialize it with ``list()``.
    """
    cleaned = text.translate(_WAKATI_STRIP_TABLE)
    tokenizer = Tokenizer()
    return tokenizer.tokenize(cleaned, wakati=True)
# The default number of sentences is 20.
def generate_text(num_sentence=20, filename="sample.txt",
                  out_filename="takuya.txt"):
    """Generate sentences from a text file with a second-order Markov chain.

    The source file is read and tokenized with ``wakati``. A table mapping
    each pair of consecutive words to the list of words that followed that
    pair is built, and then walked at random until ``num_sentence`` full
    stops ('。') have been emitted. A newline is inserted after each full
    stop. The result is appended to ``out_filename`` and printed.

    Args:
        num_sentence: Number of sentences (full stops) to generate.
        filename: Path of the UTF-8 source text used to build the table.
        out_filename: Path of the UTF-8 file the result is appended to.

    Raises:
        ValueError: If the source has too few words to build any transition,
            or if sentences were requested but no '。' ever appears as a
            successor, in which case the walk could never finish.
    """
    # Use a context manager so the source file is closed deterministically.
    with open(filename, "r", encoding="utf-8") as src_file:
        src = src_file.read()

    # Materialize the tokens: wakati() may hand back a one-shot generator,
    # and the token count is needed below.
    words = list(wakati(src))

    # Build the Markov table: (w1, w2) -> words observed right after them.
    markov = {}
    w1 = ""
    w2 = ""
    for word in words:
        if w1 and w2:
            markov.setdefault((w1, w2), []).append(word)
        w1, w2 = w2, word

    if not markov:
        raise ValueError(
            "source text is too short to build a Markov table: " + filename)
    if num_sentence > 0 and not any(
            '。' in followers for followers in markov.values()):
        raise ValueError(
            "source text contains no sentence-ending '。': " + filename)

    # Generate the text.
    states = list(markov.keys())
    pieces = []          # collected tokens; joined once at the end
    count_kuten = 0      # number of full stops emitted so far
    since_kuten = 0      # tokens emitted since the last full stop
    w1, w2 = random.choice(states)
    while count_kuten < num_sentence:
        followers = markov.get((w1, w2))
        # The final word pair of the corpus may have no recorded successor
        # (dead end), and a walk can also get trapped in a cycle that never
        # reaches '。'. A "sentence" longer than the whole corpus is treated
        # as such a cycle. In both cases restart from a random state instead
        # of raising KeyError or looping forever.
        if followers is None or since_kuten > len(words):
            w1, w2 = random.choice(states)
            since_kuten = 0
            continue
        tmp = random.choice(followers)
        pieces.append(tmp)
        since_kuten += 1
        if tmp == '。':
            count_kuten += 1
            since_kuten = 0
            pieces.append('\n')  # one sentence per line
        w1, w2 = w2, tmp

    sentence = "".join(pieces)
    # Append (mode 'a') so earlier output in the file is preserved.
    with open(out_filename, 'a', encoding='utf_8') as f:
        f.write(sentence)
    print(sentence)
# Script entry point: generate text only when this file is executed directly,
# so that importing it from another module does not run the generator.
if __name__ == "__main__":
    generate_text()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement