# Untitled

# -*- coding: utf-8 -*-

import random
from janome.tokenizer import Tokenizer

# Split text data into words using Janome.
def wakati(text):
    """Strip line breaks and brackets from *text*, then tokenize it.

    The following characters are deleted before tokenizing: newline,
    carriage return, the corner brackets 「 and 」, and both half-width
    and full-width parentheses.

    Returns the result of ``Tokenizer.tokenize(..., wakati=True)``
    unchanged (the caller iterates over it and compares items to strings).
    """
    # A single translation table deletes every unwanted character in one pass.
    unwanted = str.maketrans('', '', '\n\r「」()（）')
    cleaned = text.translate(unwanted)
    tokenizer = Tokenizer()
    return tokenizer.tokenize(cleaned, wakati=True)

def _build_markov_table(words):
    """Map every consecutive word pair to the list of words that followed it."""
    table = {}
    w1 = w2 = ""
    for word in words:
        # The first two words only prime the (w1, w2) window.
        if w1 and w2:
            table.setdefault((w1, w2), []).append(word)
        w1, w2 = w2, word
    return table


def _generate_sentences(markov, num_sentence):
    """Random-walk *markov* until *num_sentence* sentences ('。') are emitted.

    Returns the generated text with a newline after every '。'.

    Raises:
        ValueError: if the table is empty, or no pair is ever followed by
            '。' (the walk could then never finish a sentence).
    """
    if num_sentence <= 0:
        return ""
    if not markov:
        raise ValueError("corpus is too short to build a Markov table")
    if not any('。' in followers for followers in markov.values()):
        raise ValueError("corpus contains no sentence ending '。'")

    # NOTE: a closed cycle of pairs that never leads to '。' would still
    # loop forever; that case is not guarded against here.
    starts = list(markov)
    pieces = []  # collected tokens; joined once at the end
    count_kuten = 0  # number of '。' emitted so far
    w1, w2 = random.choice(starts)
    while count_kuten < num_sentence:
        followers = markov.get((w1, w2))
        if followers is None:
            # Dead end: this pair only occurs at the very end of the corpus,
            # so it has no successors. Restart from a random pair instead of
            # raising KeyError.
            w1, w2 = random.choice(starts)
            continue
        tmp = random.choice(followers)
        pieces.append(tmp)
        if tmp == '。':
            count_kuten += 1
            pieces.append('\n')  # one sentence per line
        w1, w2 = w2, tmp
    return "".join(pieces)


def generate_text(num_sentence=20, filename="sample.txt", output="takuya.txt"):
    """Generate text from a corpus with a second-order Markov chain.

    Reads *filename* (UTF-8), tokenizes it with ``wakati``, builds a table
    of word-pair successors, generates *num_sentence* sentences, appends
    them to *output* (UTF-8) and prints them.

    Args:
        num_sentence: number of sentences to generate (default 20).
        filename: path of the corpus to read.
        output: path of the file the generated text is appended to.

    Raises:
        ValueError: if the corpus is too short or contains no '。'.
    """
    # Context manager so the corpus file is closed after reading.
    with open(filename, "r", encoding="utf-8") as f:
        src = f.read()

    # Build the Markov chain table.
    markov = _build_markov_table(wakati(src))

    # Generate the text.
    sentence = _generate_sentences(markov, num_sentence)

    # Append the result to the output file.
    with open(output, 'a', encoding='utf-8') as f:
        f.write(sentence)

    print(sentence)

# Run the generator only when this file is executed as a script,
# so that importing it from another module does not trigger it.
if __name__ == "__main__":
    generate_text()