from __future__ import print_function

import os

import numpy
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical


def trainModule():
    """
    Trains a model and stores it in the program root directory.
    The stored file name is "trained_model.h5".
    :return: None
    """
    MAX_NB_WORDS = 20000
    MAX_SEQUENCE_LENGTH = 1000
    BASE_DIR = "/home/frost/PycharmProjects/imdb-es"
    GLOVE_DIR = BASE_DIR + "/glove.6B/"
    EMBEDDING_DIM = 100
    VALIDATION_SPLIT = 0.19
    seed = 7
    numpy.random.seed(seed)
    if not os.path.isfile("trained_model.h5"):
        print("No model saved ... start learning")
        # Build a word -> GloVe vector lookup from the pre-trained embeddings
        # file, e.g. "movie" -> float32 vector of length EMBEDDING_DIM.
        embedding_index = {}
        f = open(os.path.join(GLOVE_DIR, "glove.6B.100d.txt"))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = numpy.asarray(values[1:], dtype="float32")
            embedding_index[word] = coefs
        f.close()
        print("Found %s word vectors." % len(embedding_index))
        ###Info - load and prepare the training data set
        print("Load complete dataset...")
        # Assumed to return (texts, labels) as parallel lists; a hypothetical
        # sketch of this helper follows trainModule() below.
        texts_train, labels = getFullTwitterDataSet()
        ###Info - convert the labels from strings to ints
        labels = list(map(int, labels))
        print("Start processing dataset...")
        ###Info - convert texts from unicode to ascii (Python 2 workaround:
        ###       tokenizer.fit_on_texts throws an error on unicode input)
        texts_train = [s.encode("ascii") for s in texts_train]
        tokenizer_train = Tokenizer(nb_words=MAX_NB_WORDS)  # Keras 1.x name; `num_words` in Keras 2
        tokenizer_train.fit_on_texts(texts_train)
        sequences_train = tokenizer_train.texts_to_sequences(texts_train)
        word_index_train = tokenizer_train.word_index
        print("Found %s unique tokens. (train data set)" % len(word_index_train))
        data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
        labels = to_categorical(numpy.asarray(labels))
        print("Train: shape of x_train tensor: %s" % (data.shape,))  # e.g. (11842, 1000)
        print("Train: shape of y_train tensor: %s" % (labels.shape,))
        ###Info - shuffle, then split the data into training / validation sets
        indices = numpy.arange(data.shape[0])
        numpy.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
        print("Dataset split into training and validation sets")
        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]
        print("Train: shape of x_train: %s" % (x_train.shape,))
        print("Train: shape of y_train: %s" % (y_train.shape,))
        print("Validate: shape of x_val: %s" % (x_val.shape,))
        print("Validate: shape of y_val: %s" % (y_val.shape,))
        # Row i of the embedding matrix holds the GloVe vector of the word with
        # tokenizer index i; words without a pre-trained vector stay all-zero.
        nb_words = min(MAX_NB_WORDS, len(word_index_train))
        embedding_matrix = numpy.zeros((nb_words + 1, EMBEDDING_DIM))
        for word, i in word_index_train.items():
            if i > MAX_NB_WORDS:
                continue
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        print("Prepare embedding layer...")
        # The GloVe weights are frozen (trainable=False), so only the layers
        # stacked on top of the embedding are learned.
        embedding_layer = Embedding(nb_words + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
- print ("Prepare sequence input...")
- sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
- embedding_sequence = embedding_layer(sequence_input)
- x = Conv1D(128, 5, activation="relu")(embedding_sequence)
- x = MaxPooling1D(5)(x)
- x = Conv1D(128, 5, activation="relu")(x)
- x = MaxPooling1D(5)(x)
- x = Conv1D(128, 5, activation="relu")(x)
- x = MaxPooling1D(35)(x) # gloal max pooling
- x = Flatten()(x)
- x = Dense(128, activation="relu")(x)
- preds = Dense(3, activation='softmax')(x)
- #sparse_categorical_crossentropy
- #categorical_crossentropy
- model = Model(sequence_input, preds)
- model.compile(loss="categorical_crossentropy",
- optimizer="rmsprop",
- metrics=['acc'])
        ###Info - Happy learning!
        model_fit = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                              nb_epoch=10, batch_size=128)  # Keras 1.x name; `epochs` in Keras 2
        scores = model.evaluate(x_train, y_train)  # accuracy on the training set itself
        print("Score: %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        model.save("trained_model.h5")
    else:
        print("trained_model.h5 already exists - skipping training")
def predict_sentiment():
    MAX_SEQUENCE_LENGTH = 1000
    # The .npy files below are assumed to hold padded index sequences that were
    # saved earlier, e.g. via:
    # numpy.save("train-np-data.dat", data_train)
    # numpy.save("twitter-np-data.dat", data_predict)
    data_train = numpy.load("train-np-data.dat.npy")
    data_predict = numpy.load("twitter-np-data.dat.npy")
    print("Train: shape of data_train tensor: %s" % (data_train.shape,))
    print("Train: shape[0] of data_train tensor: %s" % data_train.shape[0])
    print("Train: shape[1] of data_train tensor: %s" % data_train.shape[1])
    print("Predict: shape of data_predict tensor: %s" % (data_predict.shape,))
    print("Predict: shape[0] of data_predict tensor: %s" % data_predict.shape[0])
    print("Predict: shape[1] of data_predict tensor: %s" % data_predict.shape[1])
    print("####")
    data_train_indices = numpy.arange(data_train.shape[0])
    data_predict_indices = numpy.arange(data_predict.shape[0])
    print("Indices train")
    print(data_train_indices)
    print("Indices predict data")
    print(data_predict_indices)
    print("Type train:")
    print(type(data_train))
    print("Type predict:")
    print(type(data_predict))
    trained_model = load_model("trained_model.h5")
    prediction_result = trained_model.predict(data_train, batch_size=128)
    print("Prediction shape (training data):")
    print(prediction_result.shape)
    # Reuse the already loaded model instead of loading it from disk a second time.
    prediction_result_2 = trained_model.predict(data_predict, batch_size=128)
    print("Prediction shape (twitter data):")
    print(prediction_result_2.shape)
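

# ---------------------------------------------------------------------------
# How "twitter-np-data.dat.npy" is produced is not shown in this paste; the
# commented numpy.save() calls in predict_sentiment() only hint at it. A
# minimal sketch, assuming the new texts must be indexed with a tokenizer
# fitted the same way as in trainModule() and padded to the same length.
# The function name and the main-guard driver below are illustrative, not
# part of the original code.
# ---------------------------------------------------------------------------
def prepare_prediction_data(texts, tokenizer, max_sequence_length=1000):
    """Hypothetical helper: turn raw texts into a padded index matrix."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_sequence_length)


if __name__ == "__main__":
    # Train (or skip training if trained_model.h5 exists), then predict.
    trainModule()
    predict_sentiment()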