#!/usr/bin/python3

import sys
import torch
from torch import nn, optim
import re


def text_cleaner(text):
    # lowercase the text
    newString = text.lower()
    # drop possessive "'s" endings
    newString = re.sub(r"'s\b", "", newString)
    # keep letters only: punctuation and digits become spaces
    newString = re.sub("[^a-zA-Z]", " ", newString)
    long_words = []
    # keep only words longer than 3 characters
    for i in newString.split():
        if len(i) > 3:
            long_words.append(i)
    return (" ".join(long_words)).strip()
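# For example, the cleaning rules above give:
#   text_cleaner("The dragon's lair!")  ->  "dragon lair"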


# preprocess the text
with open('100-0.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.read()

data_new = text_cleaner(data)

# predict the next (33rd) character from the preceding 32
history_length = 32

# in case of odd characters outside ASCII
nb_of_char_codes = 128

embedding_size = 10

# start with a history consisting entirely of newline characters
history_encoded = [ord('\n')] * history_length

device = torch.device('cpu')

hidden_size = 100

criterion = nn.NLLLoss()
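# note: nn.NLLLoss expects log-probabilities as input, which is why the
# network defined below ends in nn.LogSoftmax rather than nn.Softmax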


# stream the cleaned corpus one character code at a time
def char_source():
    # data_new is a single string, so iterate over its characters directly
    for char in data_new:
        if ord(char) < nb_of_char_codes:
            yield ord(char)


class NGramLangaugeModel(nn.Module):
    def __init__(self, nb_of_char_codes, history_length, embedding_size, hidden_size):
        super(NGramLangaugeModel, self).__init__()

        # one embedding_size-dimensional vector per character code
        self.embeddings = nn.Embedding(nb_of_char_codes, embedding_size).to(device)

        self.model = nn.Sequential(
            nn.Linear(history_length * embedding_size, hidden_size),
            nn.Linear(hidden_size, nb_of_char_codes),
            nn.LogSoftmax(dim=-1)
        ).to(device)

    def forward(self, inputs):
        # embed the history and flatten it into one feature vector
        embedded_inputs = self.embeddings(inputs)
        return self.model(embedded_inputs.view(-1))

    def generate(self, to_be_con, n):
        # left-pad the seed with spaces and keep only the last history_length characters
        t = (" " * history_length + to_be_con)[-history_length:]
        history = [ord(c) for c in t]

        with torch.no_grad():
            for _ in range(n):
                x = torch.tensor(history, dtype=torch.long, device=device)
                # log-probabilities -> probabilities
                y = torch.exp(self(x))

                # sample the next character code and slide the history window
                c = torch.multinomial(y, 1)[0].item()
                t += chr(c)

                history.pop(0)
                history.append(c)
        return t

    def initHidden(self):
        # not used by this feed-forward model
        return torch.zeros(1, hidden_size)


model = NGramLangaugeModel(nb_of_char_codes, history_length, embedding_size, hidden_size)

optimizer = optim.Adam(model.parameters())
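
# A minimal sanity-check sketch (optional): the forward pass maps a history of
# history_length character codes to nb_of_char_codes log-probabilities.
# x_check = torch.tensor([ord('a')] * history_length, dtype=torch.long, device=device)
# assert model(x_check).shape == (nb_of_char_codes,)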

counter = 0
step = 1000
losses = []

# online training: one update per character of the corpus
for c in char_source():
    model.zero_grad()
    # the current history is the input, the next character c is the target
    x = torch.tensor(history_encoded, dtype=torch.long, device=device)
    y = model(x)

    loss = criterion(y.view(1, -1),
                     torch.tensor([c], dtype=torch.long, device=device))

    # losses.append(loss.item())
    # if len(losses) > step:
    #     losses.pop()

    # every `step` characters, print a sample continuation
    if counter == step:
        counter = 0
        print(model.generate("be or not to ", 100))
        print('----------------\n')

    counter += 1

    loss.backward()
    optimizer.step()

    # slide the history window forward by one character
    history_encoded.pop(0)
    history_encoded.append(c)
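
# Once training has run for a while, one might sample a longer continuation and
# save the weights, e.g. (a minimal sketch; the seed text and the file name
# 'char_ngram_lm.pt' are arbitrary choices):
# print(model.generate("to be or not to be ", 500))
# torch.save(model.state_dict(), 'char_ngram_lm.pt')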