#!/usr/bin/python3
import re

import torch
from torch import nn, optim

def text_cleaner(text):
    # lower-case the text
    newString = text.lower()
    # drop possessive 's
    newString = re.sub(r"'s\b", "", newString)
    # replace everything that is not a letter with a space
    newString = re.sub("[^a-zA-Z]", " ", newString)
    # keep only words longer than 3 characters
    long_words = []
    for i in newString.split():
        if len(i) > 3:
            long_words.append(i)
    return (" ".join(long_words)).strip()
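
# A quick illustration of text_cleaner (the input string below is made up for
# demonstration): possessive 's is dropped, non-letters become spaces, and
# words of 3 characters or fewer are discarded.
assert text_cleaner("Hamlet's question: To be, or not to be!") == "hamlet question"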

# preprocess the text
with open('100-0.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.read()
data_new = text_cleaner(data)

# predict the 33rd character from the preceding 32
history_length = 32
# cap codes at plain ASCII, in case of odd characters
nb_of_char_codes = 128
embedding_size = 10
# start with a history made up entirely of newline characters
history_encoded = [ord('\n')] * history_length
device = torch.device('cpu')
hidden_size = 100
criterion = nn.NLLLoss()

def char_source():
    # yield the code point of every ASCII character in the corpus
    # (data_new is a single string, so iterate over it directly)
    for char in data_new:
        if ord(char) < nb_of_char_codes:
            yield ord(char)

class NGramLanguageModel(nn.Module):
    def __init__(self, nb_of_char_codes, history_length, embedding_size, hidden_size):
        super(NGramLanguageModel, self).__init__()
        # one embedding vector per character code
        self.embeddings = nn.Embedding(nb_of_char_codes, embedding_size).to(device)
        self.model = nn.Sequential(
            nn.Linear(history_length * embedding_size, hidden_size),
            # non-linearity, so the two linear layers do not collapse into one affine map
            nn.ReLU(),
            nn.Linear(hidden_size, nb_of_char_codes),
            nn.LogSoftmax(dim=0)
        ).to(device)

    def forward(self, inputs):
        # embed each character code and flatten into a single input vector
        embedded_inputs = self.embeddings(inputs)
        return self.model(embedded_inputs.view(-1))

    def generate(self, to_be_con, n):
        # left-pad the prompt with spaces and keep the last history_length characters
        t = (" " * history_length + to_be_con)[-history_length:]
        history = [ord(c) for c in t]
        with torch.no_grad():
            for _ in range(n):
                x = torch.tensor(history, dtype=torch.long, device=device)
                # exponentiate the log-probabilities and sample the next character
                y = torch.exp(self(x))
                c = torch.multinomial(y, 1)[0].item()
                t += chr(c)
                history.pop(0)
                history.append(c)
        return t

model = NGramLanguageModel(nb_of_char_codes, history_length, embedding_size, hidden_size)
optimizer = optim.Adam(model.parameters())
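
# Minimal sanity check: a single forward pass over the initial history should
# return one log-probability per character code.
assert model(torch.tensor(history_encoded, dtype=torch.long,
                          device=device)).shape == (nb_of_char_codes,)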

counter = 0
step = 1000
losses = []

# train online, one character at a time
for c in char_source():
    model.zero_grad()
    x = torch.tensor(history_encoded, dtype=torch.long, device=device)
    y = model(x)
    loss = criterion(y.view(1, -1),
                     torch.tensor([c], dtype=torch.long, device=device))
    # losses.append(loss.item())
    # if len(losses) > step:
    #     losses.pop()
    # every `step` characters, print a sample continuation
    if counter == step:
        counter = 0
        print(model.generate("be or not to ", 100))
        print('----------------\n')
    counter += 1
    loss.backward()
    optimizer.step()
    # slide the history window forward by one character
    history_encoded.pop(0)
    history_encoded.append(c)
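
# How to run (a sketch; the corpus filename above assumes a plain-text file,
# such as a Project Gutenberg dump, saved as 100-0.txt next to this script):
#
#     python3 this_script.py
#
# Every 1000 training characters the script prints a sampled 100-character
# continuation of the prompt "be or not to ".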