#!/usr/bin/python3
import re

import torch
from torch import nn, optim

def text_cleaner(text):
    # lower-case the text
    newString = text.lower()
    # drop possessive 's
    newString = re.sub(r"'s\b", "", newString)
    # replace everything that is not a letter with a space
    newString = re.sub("[^a-zA-Z]", " ", newString)
    # keep only words longer than 3 characters
    long_words = []
    for i in newString.split():
        if len(i) > 3:
            long_words.append(i)
    return (" ".join(long_words)).strip()
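
# A quick illustration of text_cleaner (the input string below is made up for
# demonstration): possessive 's is dropped, non-letters become spaces, and
# words of 3 characters or fewer are discarded.
assert text_cleaner("Hamlet's question: To be, or not to be!") == "hamlet question"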

# preprocess the text
with open('100-0.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.read()
data_new = text_cleaner(data)

# predict the 33rd character from the preceding 32
history_length = 32
# cap codes at plain ASCII, in case of odd characters
nb_of_char_codes = 128
embedding_size = 10
# start with a history made up entirely of newline characters
history_encoded = [ord('\n')] * history_length
device = torch.device('cpu')
hidden_size = 100
criterion = nn.NLLLoss()

def char_source():
    # yield the code point of every ASCII character in the corpus
    # (data_new is a single string, so iterate over it directly)
    for char in data_new:
        if ord(char) < nb_of_char_codes:
            yield ord(char)

class NGramLanguageModel(nn.Module):
    def __init__(self, nb_of_char_codes, history_length, embedding_size, hidden_size):
        super(NGramLanguageModel, self).__init__()
        # one embedding vector per character code
        self.embeddings = nn.Embedding(nb_of_char_codes, embedding_size).to(device)
        self.model = nn.Sequential(
            nn.Linear(history_length * embedding_size, hidden_size),
            # non-linearity, so the two linear layers do not collapse into one affine map
            nn.ReLU(),
            nn.Linear(hidden_size, nb_of_char_codes),
            nn.LogSoftmax(dim=0)
        ).to(device)

    def forward(self, inputs):
        # embed each character code and flatten into a single input vector
        embedded_inputs = self.embeddings(inputs)
        return self.model(embedded_inputs.view(-1))

    def generate(self, to_be_con, n):
        # left-pad the prompt with spaces and keep the last history_length characters
        t = (" " * history_length + to_be_con)[-history_length:]
        history = [ord(c) for c in t]
        with torch.no_grad():
            for _ in range(n):
                x = torch.tensor(history, dtype=torch.long, device=device)
                # exponentiate the log-probabilities and sample the next character
                y = torch.exp(self(x))
                c = torch.multinomial(y, 1)[0].item()
                t += chr(c)
                history.pop(0)
                history.append(c)
        return t

model = NGramLanguageModel(nb_of_char_codes, history_length, embedding_size, hidden_size)
optimizer = optim.Adam(model.parameters())
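
# Minimal sanity check: a single forward pass over the initial history should
# return one log-probability per character code.
assert model(torch.tensor(history_encoded, dtype=torch.long,
                          device=device)).shape == (nb_of_char_codes,)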

counter = 0
step = 1000
losses = []

# train online, one character at a time
for c in char_source():
    model.zero_grad()
    x = torch.tensor(history_encoded, dtype=torch.long, device=device)
    y = model(x)
    loss = criterion(y.view(1, -1),
                     torch.tensor([c], dtype=torch.long, device=device))
    # losses.append(loss.item())
    # if len(losses) > step:
    #     losses.pop()
    # every `step` characters, print a sample continuation
    if counter == step:
        counter = 0
        print(model.generate("be or not to ", 100))
        print('----------------\n')
    counter += 1
    loss.backward()
    optimizer.step()
    # slide the history window forward by one character
    history_encoded.pop(0)
    history_encoded.append(c)
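
# How to run (a sketch; the corpus filename above assumes a plain-text file,
# such as a Project Gutenberg dump, saved as 100-0.txt next to this script):
#
#     python3 this_script.py
#
# Every 1000 training characters the script prints a sampled 100-character
# continuation of the prompt "be or not to ".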