sentiment-code

from __future__ import print_function

import os

import numpy
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical


def trainModule():
    """
    Trains a model and stores it in the program root directory.
    The stored file is named "trained_model.h5".
    :return:
    """

    MAX_NB_WORDS = 20000
    MAX_SEQUENCE_LENGTH = 1000
    BASE_DIR = "/home/frost/PycharmProjects/imdb-es"
    GLOVE_DIR = BASE_DIR + "/glove.6B/"
    EMBEDDING_DIM = 100
    VALIDATION_SPLIT = 0.19
    seed = 7
    numpy.random.seed(seed)

    if not os.path.isfile("trained_model.h5"):
        print("No model saved ... start learning")

        ###Info - build an index that maps each word in the GloVe file to its embedding vector
        embedding_index = {}
        f = open(os.path.join(GLOVE_DIR, "glove.6B.100d.txt"))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = numpy.asarray(values[1:], dtype="float32")
            embedding_index[word] = coefs
        f.close()

        print("Found %s word vectors." % len(embedding_index))

        ###Info - load and prepare the training data set
        print("Load complete dataset...")
        texts_train, labels = getFullTwitterDataSet()  # project-specific helper, defined elsewhere
        ###Info - convert the labels from strings to ints
        labels = list(map(int, labels))
        print("Start processing dataset...")
        ###Info - convert the texts from unicode to ascii (tokenizer.fit_on_texts throws an error otherwise)
        texts_train = [s.encode("ascii") for s in texts_train]

        tokenizer_train = Tokenizer(nb_words=MAX_NB_WORDS)
        tokenizer_train.fit_on_texts(texts_train)
        sequences_train = tokenizer_train.texts_to_sequences(texts_train)

        word_index_train = tokenizer_train.word_index
        print("Found %s unique tokens. (train data set)" % len(word_index_train))

        data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
        labels = to_categorical(numpy.asarray(labels))
        print("Train: Shape of x_train tensor:", data.shape)  # e.g. (11842, 1000)
        print("Train: Shape of y_train tensor:", labels.shape)
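
        ###Info - Note (assumption): to_categorical() above one-hot encodes the integer labels,
        ###Info - so with the three sentiment classes encoded as 0, 1 and 2 a label of 1 becomes
        ###Info - [0., 1., 0.], matching the 3-unit softmax output layer built further below.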

        ###Info - shuffle, then split the data into training / validation sets
        indices = numpy.arange(data.shape[0])
        numpy.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

        print("Dataset split into training set and validation set")
        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]
        print("Train: Shape of x_train: ", x_train.shape)
        print("Train: Shape of y_train: ", y_train.shape)
        print("Validate: Shape of x_val: ", x_val.shape)
        print("Validate: Shape of y_val: ", y_val.shape)

        ###Info - build the embedding matrix: row i holds the GloVe vector of the word with
        ###Info - index i; words without a GloVe vector keep an all-zeros row
        nb_words = min(MAX_NB_WORDS, len(word_index_train))
        embedding_matrix = numpy.zeros((nb_words + 1, EMBEDDING_DIM))
        for word, i in word_index_train.items():
            if i > MAX_NB_WORDS:
                continue
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        print("Prepare embedding layer...")
        embedding_layer = Embedding(nb_words + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)

        print("Prepare sequence input...")
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
        embedding_sequence = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation="relu")(embedding_sequence)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation="relu")(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation="relu")(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(128, activation="relu")(x)
        preds = Dense(3, activation="softmax")(x)
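
        ###Info - Sketch of the sequence length through the stack, worked out for
        ###Info - MAX_SEQUENCE_LENGTH = 1000 and the default "valid" padding:
        ###Info -   1000 -Conv1D(5)-> 996 -MaxPooling1D(5)-> 199 -Conv1D(5)-> 195
        ###Info -   -MaxPooling1D(5)-> 39 -Conv1D(5)-> 35 -MaxPooling1D(35)-> 1
        ###Info - so the final pooling layer collapses the sequence to a single step,
        ###Info - i.e. it acts as global max pooling.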

        ###Info - loss alternatives: sparse_categorical_crossentropy (integer labels)
        ###Info - or categorical_crossentropy (one-hot encoded labels, used here)
        model = Model(sequence_input, preds)
        model.compile(loss="categorical_crossentropy",
                      optimizer="rmsprop",
                      metrics=["acc"])

        ###Info - Happy learning!
        model_fit = model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=128)
        scores = model.evaluate(x_train, y_train)  # note: this evaluates on the training data, not the validation split
        print("Score: %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        model.save("trained_model.h5")

    else:
        print("A saved model already exists, skipping training")

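
###Info - Sketch (assumption): the .npy files loaded in predict_sentiment() below are not
###Info - created anywhere in this paste; a helper along the following lines could produce
###Info - them by mirroring the tokenize/pad pipeline from trainModule(). The name
###Info - prepareSequences and its arguments are hypothetical. For meaningful predictions
###Info - the tokenizer fitted in trainModule() should be reused here, so that the word
###Info - indices match the trained embedding.
def prepareSequences(texts, out_name, tokenizer, max_sequence_length=1000):
    """Turn raw texts into padded index sequences and save them as <out_name>.npy."""
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=max_sequence_length)
    numpy.save(out_name, data)  # numpy appends the ".npy" suffix, matching the file names below
    return data
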
def predict_sentiment():
    MAX_SEQUENCE_LENGTH = 1000
    # the two input files were created beforehand via:
    # numpy.save("train-np-data.dat", data_train)
    # numpy.save("twitter-np-data.dat", data_predict)

    data_train = numpy.load("train-np-data.dat.npy")
    data_predict = numpy.load("twitter-np-data.dat.npy")

    print("Train: Shape of data_train tensor:", data_train.shape)
    print("Train: Shape[0] of data_train tensor:", data_train.shape[0])
    print("Train: Shape[1] of data_train tensor:", data_train.shape[1])
    print("Predict: Shape of data_predict tensor:", data_predict.shape)
    print("Predict: Shape[0] of data_predict tensor:", data_predict.shape[0])
    print("Predict: Shape[1] of data_predict tensor:", data_predict.shape[1])

    print("####")

    data_train_indices = numpy.arange(data_train.shape[0])
    data_predict_indices = numpy.arange(data_predict.shape[0])
    print("Indices train")
    print(data_train_indices)
    print("Indices Predict data")
    print(data_predict_indices)

    print("Type Train:")
    print(type(data_train))
    print("Type Predict:")
    print(type(data_predict))

    #    ###Info - split data into training / validation sets
    #    indices = numpy.arange(data.shape[0])
    #    numpy.random.shuffle(indices)
    #    data = data[indices]
    #    labels = labels[indices]
    #    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    trained_model = load_model("trained_model.h5")
    prediction_result = trained_model.predict(data_train, batch_size=128)
    print("Prediction shape:")
    print(prediction_result.shape)

    trained_model_2 = load_model("trained_model.h5")
    prediction_result_2 = trained_model_2.predict(data_predict, batch_size=128)
    print("Prediction shape:")
    print(prediction_result_2.shape)
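

###Info - Sketch (assumption): the paste has no entry point; a minimal one could look like this.
if __name__ == "__main__":
    trainModule()        # trains and saves "trained_model.h5" unless it already exists
    predict_sentiment()  # loads the saved model and runs predictions on the prepared .npy data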