sentiment-code

def trainModule():
    """
    Method to train a model and stores it in the program root directory.
    Stored name is "trained_model.h5"
    :return:
    """

    MAX_NB_WORDS = 20000
    MAX_SEQUENCE_LENGTH = 1000
    BASE_DIR = "/home/frost/PycharmProjects/imdb-es"
    GLOVE_DIR = BASE_DIR + "/glove.6B/"
    EMBEDDING_DIM = 100
    VALIDATION_SPLIT = 0.19
    seed = 7
    numpy.random.seed(seed)

    if not os.path.isfile("trained_model.h5"):
        print "No model saved ... start learning"

        embedding_index = {}
        f = open(os.path.join(GLOVE_DIR, "glove.6B.100d.txt"))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = numpy.asarray(values[1:], dtype="float32")
            embedding_index[word] = coefs
        f.close()

        print("Found %s word vectors." % len(embedding_index))

        ###Info - load and prepare training data set
        print "Load complete dataset..."
        texts_train, labels = getFullTwitterDataSet()
        ###Info - Convert list from string to int
        labels = map(int,labels)
        print "Start processing dataset..."
        ###Info - Convert texts from unicode to ascii (tokenizer.fit_on_texts throw an error)
        texts_train = [s.encode("ascii") for s in texts_train]

        tokenizer_train = Tokenizer(nb_words=MAX_NB_WORDS)
        tokenizer_train.fit_on_texts(texts_train)
        sequences_train = tokenizer_train.texts_to_sequences(texts_train)

        word_index_train = tokenizer_train.word_index
        print ("Found %s unique tokens. (train data set)" % len(word_index_train))

        data = pad_sequences(sequences_train, maxlen = MAX_SEQUENCE_LENGTH)
        labels = to_categorical(numpy.asarray(labels))
        print("Train: Shape of x_train tensor:", data.shape) #('Train: Shape of x_train tensor:', (11842, 1000))
        print("Train: Shape of y_train tensor:", labels.shape)

        ###Info - split data into training / validation sets
        indices = numpy.arange(data.shape[0])
        numpy.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

        print "Dataset splitted into Trainset and Validationset"
        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]
        print ("Train: Shape of x_train: ", x_train.shape)
        print ("Train: Shape of y_train: ", y_train.shape)
        print ("Validate: Shape of x_val: ", x_val.shape)
        print ("Validate: Shape of x_val: ", y_val.shape)

        nb_words = min(MAX_NB_WORDS, len(word_index_train))
        embedding_matrix = numpy.zeros((nb_words + 1, EMBEDDING_DIM))
        for word, i in word_index_train.items():
            if i > MAX_NB_WORDS:
                continue
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        print ("Prepare embedding layer...")
        embedding_layer = Embedding(nb_words + 1,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)

        print ("Prepare sequence input...")
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
        embedding_sequence = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation="relu")(embedding_sequence)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation="relu")(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation="relu")(x)
        x = MaxPooling1D(35)(x)  # gloal max pooling
        x = Flatten()(x)
        x = Dense(128, activation="relu")(x)
        preds = Dense(3, activation='softmax')(x)

        #sparse_categorical_crossentropy
        #categorical_crossentropy
        model = Model(sequence_input, preds)
        model.compile(loss="categorical_crossentropy",
                      optimizer="rmsprop",
                      metrics=['acc'])

        ###Info - Happy learning!
        model_fit = model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=128)
        scores = model.evaluate(x_train, y_train)
        print ("Score: %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        model.save("trained_model.h5")

    else:
        print ("Saved model loaded")


def predict_sentiment():
    MAX_SEQUENCE_LENGTH = 1000
    # numpy.save("train-np-data.dat", data_train)
    # numpy.save("twitter-np-data.dat", data_predict)

    data_train = numpy.load("train-np-data.dat.npy")
    data_predict = numpy.load("twitter-np-data.dat.npy")

    print("Train: Shape of data_train tensor:", data_train.shape)
    print("Train: Shape[0] of data_train tensor:", data_train.shape[0])
    print("Train: Shape[1] of data_train tensor:", data_train.shape[1])
    print("Predict: Shape of data_predict tensor:", data_predict.shape)
    print("Predict: Shape[0] of data_predict tensor:", data_predict.shape[0])
    print("Predict: Shape[1] of data_predict tensor:", data_predict.shape[1])

    print ("####")

    data_train_indices = numpy.arange(data_train.shape[0])
    data_predict_indices = numpy.arange(data_predict.shape[0])
    print ("Indices train")
    print (data_train)
    print ("Indices Predict data")
    print (data_predict)

    print ("Type Train:")
    print (data_train)
    print ("Type Predict:")
    print (data_predict)

    #    ###Info - split data into training / validation sets
    #    indices = numpy.arange(data.shape[0])
    #    numpy.random.shuffle(indices)
    #    data = data[indices]
    #    labels = labels[indices]
    #    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])


    trained_model = load_model("trained_model.h5")
    prediction_result = trained_model.predict(data_train, batch_size=128)
    print ("Prediction shape:")
    print (prediction_result.shape)

    trained_model_2 = load_model("trained_model.h5")
    prediction_result_2 = trained_model_2.predict(data_predict, batch_size=128)
    print ("Prediction shape:")
    print (prediction_result_2)