#CS84: word2vec project


from __future__ import print_function
import os
import re
import numpy as np
np.random.seed(1337)  # seed the generator so runs are reproducible

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

def loadGloveEmbeddings():
    # Load GloVe, a pre-trained mapping from words to vectors.
    # Returns a dictionary keyed by word, with each value a 100-dim numpy vector.
    print('Indexing word vectors.')
    BASE_DIR = '/home/student/newsgroup'   # where the GloVe files live
    GLOVE_DIR = BASE_DIR + '/glove.6B/'    # directory holding the GloVe text file
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]                                  # the word is the first token
        coefs = np.asarray(values[1:], dtype='float32')   # the rest is its vector
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index


embeddings_index = loadGloveEmbeddings()  # load GloVe into memory

print('Found %s word vectors.' % len(embeddings_index))
# GloVe is loaded: embeddings_index is a map, e.g. 'cat' => array of 100 floats

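# Quick sanity check (not part of the original paste): look up one word's vector.
# This assumes 'cat' appears in the glove.6B vocabulary, which it does in the
# standard download; the check is skipped otherwise.
if 'cat' in embeddings_index:
    print('Example: "cat" maps to a vector of length %d' % len(embeddings_index['cat']))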
def loadbooks():
    filename = "bothcanonfanon.txt"
    books = []
    with open(filename) as f:
        for line in f:  # split each line on the pipe character
            books.append([n for n in line.strip().split('|')])
    booktexts = []    # the raw text of each book
    bookisfanon = []  # 1 if the book is fanon, 0 if canon
    for book in books:
        canonfanon, ident, text = book[0], book[1], book[2]
        # field 0: canon/fanon label, field 1: identifier, field 2: text
        text = re.sub(r'[^a-zA-Z ]+', '', text)  # drop everything but letters and spaces
        text = text.lower()
        booktexts.append(text)
        bookisfanon.append(1 if canonfanon == 'fanon' else 0)
    # convert the labels to one-hot encoding
    y_isfanon = to_categorical(bookisfanon)
    return (booktexts, y_isfanon)

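# Assumed format of bothcanonfanon.txt (the file itself is not shown in the paste):
# one book per line, pipe-separated as label|identifier|text, e.g.
#   canon|book001|some cleaned book text here
#   fanon|story042|some fan-written text here
# The identifiers and texts above are illustrative placeholders only.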
(booktexts, y_isfanon) = loadbooks()


# raw_input is Python 2; on Python 3 this would be input()
test_text = raw_input("Input test text: ")
test_text = [test_text]

corpi = [booktexts, test_text]
def create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, word_index):
    print('Preparing embedding matrix.')
    # Build a matrix with one row per vocabulary index; row 0 stays all zeros
    # and is used for padding.
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index stay all-zeros
            embedding_matrix[i] = embedding_vector
    return (nb_words, embedding_matrix)

MAX_SEQUENCE_LENGTH = 1000

def create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, train):
    MAX_NB_WORDS = 5000   # cap on the vocabulary size
    EMBEDDING_DIM = 100   # must match the GloVe file (glove.6B.100d)
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train)
    (nb_words, embedding_matrix) = create_embedding_matrix(EMBEDDING_DIM, MAX_NB_WORDS, tokenizer.word_index)
    # load the pre-trained word embeddings into an Embedding layer;
    # trainable=False keeps the embeddings fixed during training
    embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return (tokenizer, embedding_layer)

(tokenizer, embedding_layer) = create_tokenizer_and_embedding(MAX_SEQUENCE_LENGTH, corpi[0])

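# Optional sanity check (not in the original script): how much of the tokenizer's
# vocabulary is covered by the GloVe vectors loaded above.
covered = sum(1 for w in tokenizer.word_index if w in embeddings_index)
print('GloVe covers %d of %d tokenizer words' % (covered, len(tokenizer.word_index)))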

def create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi):
    # turn each corpus (a list of strings) into padded lists of word indices
    padded_sequences = []
    for corpus in corpi:
        corpus_sequence = tokenizer.texts_to_sequences(corpus)
        padded_sequences.append(pad_sequences(corpus_sequence, maxlen=MAX_SEQUENCE_LENGTH))
    return padded_sequences

padded_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, corpi)

# Each padded sequence now holds the index of each word instead of the word itself;
# every sequence is padded or truncated to MAX_SEQUENCE_LENGTH.

# the tokenizer's word_index maps each unique token to an integer id
#word_index = tokenizer.word_index
#print('Found %s unique tokens.' % len(word_index))

data = padded_sequences[0]  # the books, not the user input

VALIDATION_SPLIT = 0.3  # 70% of the books for training, 30% held out for validation
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y_isfanon[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# split the data and labels into training and validation sets
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
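# Optional (not in the original): confirm the split sizes.
print('Training on %d samples, validating on %d' % (x_train.shape[0], x_val.shape[0]))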

x_test = padded_sequences[1]  # the user-supplied test text

print('Training model.')

# train a 1D convnet with max pooling between convolutional blocks
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(32, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Dropout(0.2)(x)

x = Conv1D(32, 5, activation='tanh')(x)
x = MaxPooling1D(3)(x)
x = Dropout(0.2)(x)

x = Conv1D(32, 5, activation='tanh')(x)
x = Dropout(0.2)(x)

x = Conv1D(32, 5, activation='tanh')(x)
x = MaxPooling1D(3)(x)
x = Dropout(0.2)(x)

x = Conv1D(32, 5, activation='tanh')(x)
x = Dropout(0.2)(x)

x = Conv1D(32, 5, activation='tanh')(x)
x = MaxPooling1D(3)(x)
x = Flatten()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='softmax')(x)
preds = Dense(len(labels[0]), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='mean_squared_error',
              optimizer='adamax',  # optimizer that minimizes the loss during training
              metrics=['acc'])

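# Optional (not part of the original paste): print the layer-by-layer output shapes.
model.summary()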

# train the model, validating on the held-out 30% after each epoch
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=300, batch_size=256)

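# Optional (not in the original): report final validation performance explicitly.
# model.evaluate returns [loss, accuracy] because the model was compiled with metrics=['acc'].
val_loss, val_acc = model.evaluate(x_val, y_val, batch_size=256, verbose=0)
print('Validation loss: %.4f, accuracy: %.4f' % (val_loss, val_acc))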

def predictText(textstr):
    # clean the input the same way the training texts were cleaned
    textstr = re.sub(r'[^a-zA-Z ]', '', textstr)
    textstr = textstr.lower()
    testcorpus = [[textstr]]  # one corpus containing one text
    test_sequences = create_sequences(MAX_SEQUENCE_LENGTH, tokenizer, testcorpus)
    return model.predict(test_sequences[0], batch_size=256, verbose=0)
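# Example use (a sketch, not in the original paste, which defines predictText but never
# calls it): classify the text typed at the prompt above. Column 0 is P(canon) and
# column 1 is P(fanon), following the 0/1 labels assigned in loadbooks().
prediction = predictText(test_text[0])
print('P(canon), P(fanon):', prediction[0])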