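"""Review-score regression in PyTorch: a word-embedding ConvNet (CPU path) or an
LSTMClassifier (GPU path) is trained with MSE loss on pickled, pre-vectorized
reviews and compared against a predict-the-mean baseline. Expects
vectorizedReviews.pkl; see the sketch after start() for the assumed layout."""
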
import os
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter

torch.manual_seed(2)
# CUDA_LAUNCH_BLOCKING only takes effect as an environment variable;
# a bare Python assignment does nothing.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.backends.cudnn.enabled = False

class ConvNet(nn.Module):

    def __init__(self):
        super(ConvNet, self).__init__()

        V = 50000             # vocabulary size
        D = 300               # embedding dimension
        C = 1                 # output dimension (one regression score)
        Ci = 1                # input channels
        Co = 100              # feature maps per kernel size
        Ks = [3, 4, 5, 5, 5]  # convolution window heights, in words

        self.embed = nn.Embedding(V, D)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(len(Ks) * Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...] * len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...] * len(Ks)

        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x, self.conv13)  # (N, Co)
        x2 = self.conv_and_pool(x, self.conv14)  # (N, Co)
        x3 = self.conv_and_pool(x, self.conv15)  # (N, Co)
        x = torch.cat((x1, x2, x3), 1)  # (N, len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit

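# A minimal shape check for ConvNet: a sketch under the assumption that inputs
# are batches of token-id sequences at least 5 tokens long (the largest kernel
# height). The helper name and sizes are hypothetical, not part of training.
def _convnet_shape_check():
    model = ConvNet()
    dummy = torch.randint(0, 50000, (2, 40))  # (N, W): 2 reviews, 40 token ids
    print(model(dummy).shape)                 # expected: torch.Size([2, 1])

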
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, use_gpu, batch_size, dropout=0.3, num_layers=5):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, dropout=dropout, num_layers=num_layers)
        # forward() flattens the hidden state of every layer into one feature
        # vector, so the linear head needs num_layers * hidden_dim inputs
        # (the original nn.Linear(hidden_dim, 1) only worked for num_layers=1)
        self.hidden2label = nn.Linear(num_layers * hidden_dim, 1)
        self.num_layers = num_layers
        self.hidden = self.init_hidden()
        self.normalization = nn.BatchNorm1d(embedding_dim)

    def init_hidden(self):
        # first is the hidden state h, second is the cell state c
        h = torch.zeros(self.num_layers, self.batch_size, self.hidden_dim, dtype=torch.float32)
        c = torch.zeros(self.num_layers, self.batch_size, self.hidden_dim, dtype=torch.float32)
        if self.use_gpu:
            # the two branches used to be identical; move to GPU when requested
            return (h.cuda(), c.cuda())
        else:
            return (h, c)

    def forward(self, review):
        # x = self.embeddings(review).view(review.shape[1], self.batch_size, -1)
        review = self.normalization(review)
        # each review enters as one pre-vectorized step: sequence length 1
        lstm_out, self.hidden = self.lstm(review.view(1, self.batch_size, -1), self.hidden)
        # y = self.hidden2label(lstm_out[-1])
        # flatten the per-layer hidden states into one feature vector per example
        hidden_flat = self.hidden[0].transpose(0, 1).contiguous().view(self.batch_size, -1)
        out = self.hidden2label(hidden_flat)
        return out
        # return y

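# A minimal shape check for LSTMClassifier: a sketch assuming each review
# arrives as a single float vector of embedding_dim features, since forward()
# reshapes its input to sequence length 1. Helper name and sizes hypothetical.
def _lstm_shape_check():
    model = LSTMClassifier(embedding_dim=500, hidden_dim=4, vocab_size=1000,
                           use_gpu=False, batch_size=20, num_layers=1, dropout=0)
    dummy = torch.randn(20, 500)  # (batch_size, embedding_dim)
    print(model(dummy).shape)     # expected: torch.Size([20, 1])

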
def computeAccuracy(scores, Y, silent=False):
    # despite the name, this returns (mean squared error, mean relative error)
    totalError = 0
    percentageError = 0
    for i in range(len(scores)):
        # guess = torch.argmax(scores[i])
        if not silent:
            print("Guess: ", scores[i].item())
            print("True value: ", Y[i].item())
        # if (Y[i] == guess):
        #     correctGuesses += 1
        totalError += (Y[i] - scores[i]) ** 2
        percentageError += abs(Y[i] - scores[i]) / max(Y[i], 0.1)
    accuracy = totalError / len(scores)
    # print("correctGuesses", correctGuesses)
    return accuracy, percentageError / len(scores)

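# A tiny worked example of computeAccuracy with hypothetical values, only to
# show the two returned quantities:
def _compute_accuracy_example():
    scores = torch.tensor([[2.0], [4.0]])
    Y = torch.tensor([[1.0], [5.0]])
    mse, rel = computeAccuracy(scores, Y, silent=True)
    # mse = ((1-2)^2 + (5-4)^2) / 2 = 1.0
    # rel = (|1-2|/1 + |5-4|/5) / 2 = 0.6
    print(mse.item(), rel.item())

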
def start(batch_size=100, hidden_dim=100, n_epochs=10,
          num_layers=1, lr=0.01, weight_decay=0.95, momentum=0.9,
          dropout=0, trainingExamples=3000, testExamples=400):
    data, vocabLen = pickle.load(open("vectorizedReviews.pkl", "rb"))
    embedding_dim = 500

    # Create your dictionary that maps vocab words to integers here

    training_data, test_data = train_test_split(data, test_size=0.2)
    # print(training_data)
    # tmp = []
    # for i in range(len(X)):
    #     tmp.append([X[i], Y[i]])
    # print(tmp)
    # print(X)
    # THIS WAS HOW WE DID IT BEFORE, pre-GitHub:
    # vectorizer = CountVectorizer(analyzer="word",
    #                              tokenizer=None,
    #                              preprocessor=None,
    #                              stop_words="english",
    #                              max_features=1000)

    # every review is assumed to be padded to the same length as the first one
    Xtrain = Variable(torch.zeros(trainingExamples, training_data[0]["content"].shape[0], dtype=torch.long))
    # Xtrain = Variable(10 * torch.randn(trainingExamples, 1, dtype=torch.long))
    Xtest = Variable(torch.zeros(testExamples, test_data[0]["content"].shape[0], dtype=torch.long))
    # Xtest = Variable(10 * torch.randn(testExamples, 1, dtype=torch.long))
    Ytrain = Variable(torch.zeros(trainingExamples, dtype=torch.float32))
    # Ytrain = Variable(Xtrain * Xtrain)
    Ytest = Variable(torch.zeros(testExamples, dtype=torch.float32))
    # Ytest = Variable(Xtest * Xtest)

    for i in range(trainingExamples):
        Xtrain[i] = torch.from_numpy(training_data[i]["content"])
        Ytrain[i] = training_data[i]["score"]
    for i in range(testExamples):
        Xtest[i] = torch.from_numpy(test_data[i]["content"])
        Ytest[i] = test_data[i]["score"]

    # Xtrain = vectorizer.fit_transform(textDataTrain).toarray()

    if torch.cuda.is_available():
        lstm = LSTMClassifier(embedding_dim, hidden_dim, vocabLen + 5, True, batch_size, num_layers=num_layers, dropout=dropout)
    else:
        # lstm = LSTMClassifier(embedding_dim, hidden_dim, vocabLen + 5, False, batch_size, num_layers=num_layers, dropout=dropout)
        lstm = ConvNet()

    lossFunction = nn.MSELoss()
    optimizer = torch.optim.Adam(lstm.parameters(), lr=lr, weight_decay=weight_decay)
    # trainingData = torch.zeros(round(X.shape[0]/batch_size), batch_size, X.shape[1])
    # Xtrain = Xtrain.view(Xtrain.shape[0], 1, Xtrain.shape[1])
    # Xtest = Xtest.view(Xtest.shape[0], 1, Xtest.shape[1])
    lstm.train(True)
    for epoch in range(n_epochs):
        print(epoch)
        lossSum = 0
        testXBatch = Variable(Xtest[0:batch_size])
        testYBatch = Variable(Ytest[0:batch_size])
        for i in range(round(len(Xtrain) / batch_size) - 1):
            i_start = i * batch_size
            i_end = (i + 1) * batch_size
            Xbatch = Variable(Xtrain[i_start:i_end, :])
            Ybatch = Variable(Ytrain[i_start:i_end])
            # lstm.hidden = lstm.init_hidden()  # re-enable for the LSTM path so
            # backward() does not reach into the previous batch's freed graph
            scores = lstm(Xbatch)
            loss = lossFunction(scores, Ybatch.view(len(Ybatch), 1))
            # track test loss as a plain float; accumulating the graph-attached
            # tensor here would keep every iteration's graph alive
            with torch.no_grad():
                testLoss = lossFunction(lstm(testXBatch), testYBatch.view(len(testYBatch), 1))
            lossSum += testLoss.item()
            lstm.zero_grad()
            loss.backward()
            optimizer.step()
        print(lossSum)

    # lstm.train(False)

    lstm.eval()
    accuracy = []
    baseLineAccuracy = []

    percentageError = []
    baseLinePercentageError = []
    meanScore = torch.Tensor.mean(Ytrain).item()

    with torch.no_grad():
        for i in range(round(len(Xtest) / batch_size) - 1):
            print(i)
            i_start = i * batch_size
            i_end = (i + 1) * batch_size
            Xbatch = Xtest[i_start:i_end, :]
            Ybatch = Ytest[i_start:i_end]
            scores = lstm(Xbatch)
            # the baseline always predicts the mean training score
            baseLineScores = meanScore * torch.ones(batch_size, 1)

            result = computeAccuracy(scores, Ybatch.view(len(Ybatch), 1))
            baseLineResult = computeAccuracy(baseLineScores, Ybatch.view(len(Ybatch), 1), silent=True)

            accuracy.append(result[0])
            baseLineAccuracy.append(baseLineResult[0])

            percentageError.append(result[1])
            baseLinePercentageError.append(baseLineResult[1])

    totalAccuracy = sum(accuracy) / len(accuracy)
    baseLineTotalAccuracy = sum(baseLineAccuracy) / len(baseLineAccuracy)

    print(len(percentageError))
    totalPercentageAccuracy = sum(percentageError) / len(percentageError)
    baseLineTotalPercentageAccuracy = sum(baseLinePercentageError) / len(baseLinePercentageError)
    # print(accuracy)
    print("totalAccuracy:", totalAccuracy.item())
    print("BaselineAccuracy:", baseLineTotalAccuracy.item())

    print("average percentage error:", totalPercentageAccuracy.item())
    print("base line percentage error:", baseLineTotalPercentageAccuracy.item())
    return totalAccuracy.item()

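# start() indexes the pickle as a (data, vocabLen) tuple where each element of
# data is a dict holding a fixed-length numpy array of token ids under
# "content" and a numeric target under "score". A minimal sketch for writing a
# compatible dummy file under that assumption (helper name and sizes are
# hypothetical):
def _write_dummy_reviews(path="vectorizedReviews.pkl", n=200, seq_len=50, vocab=1000):
    import numpy as np
    data = [{"content": np.random.randint(0, vocab, seq_len),
             "score": float(np.random.randint(1, 11))}
            for _ in range(n)]
    pickle.dump((data, vocab), open(path, "wb"))

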
# def start(batch_size=100, hidden_dim=120, n_epochs=10, num_layers=1, lr=0.01, weight_decay=0.09, momentum=0.9, dropout=0, testExamples, trainingExamples):

def main():
    trainingExamples = 80
    testExamples = 40
    batch_size = 20
    hidden_dim = 4
    n_epochs = 10
    num_layers = 1
    lr = 0.0005
    momentum = 0.95
    weight_decay = 0
    file = open("cross-search.txt", "a")
    # nested grid-search loops; the hyperparameter updates are currently
    # commented out, so every iteration runs the same configuration
    for _ in range(9, 0, -1):
        # momentum *= 0.1
        for _ in range(0, 10, 1):
            # weight_decay *= 0.1
            for _ in range(1, 10, 1):
                # lr *= 0.001
                accuracy = start(trainingExamples=trainingExamples, testExamples=testExamples,
                                 batch_size=batch_size, hidden_dim=hidden_dim,
                                 n_epochs=n_epochs, num_layers=num_layers, lr=lr,
                                 weight_decay=weight_decay, momentum=momentum)
                print("________________________________")
                print("hidden_dim:", hidden_dim)
                print("num_layers:", num_layers)
                print("trainingExamples:", trainingExamples)
                print("testExamples:", testExamples)
                print("batch_size:", batch_size)
                print("n_epochs:", n_epochs)
                print("lr:", lr)
                print("weight_decay:", weight_decay)
                print("momentum:", momentum)
                print("absolute error:", accuracy)

                file.write("________________________________" + "\n")
                file.write("hidden_dim: " + str(hidden_dim) + "\n")
                file.write("num_layers: " + str(num_layers) + "\n")
                file.write("trainingExamples: " + str(trainingExamples) + "\n")
                file.write("testExamples: " + str(testExamples) + "\n")
                file.write("batch_size: " + str(batch_size) + "\n")
                file.write("n_epochs: " + str(n_epochs) + "\n")
                file.write("lr: " + str(lr) + "\n")
                file.write("weight_decay: " + str(weight_decay) + "\n")
                file.write("momentum: " + str(momentum) + "\n")
                file.write("absolute error: " + str(accuracy) + "\n")
    file.close()


if __name__ == "__main__":
    main()