Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- import itertools
- import os
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import tensorflow as tf
- from sklearn.preprocessing import LabelBinarizer, LabelEncoder
- from sklearn.metrics import confusion_matrix
- from tensorflow import keras
- from keras.models import Sequential
- from keras.layers import Dense, Activation, Dropout
- from keras.preprocessing import text, sequence
- from keras import utils
import sys

# Python 2 only: make UTF-8 the interpreter's default string encoding so that
# implicit str/unicode conversions of non-ASCII text do not raise.
# reload(sys) is needed because Python 2 removes sys.setdefaultencoding from
# the module namespace at startup. On Python 3 neither name exists and the
# default encoding is already UTF-8, so the hack is skipped there instead of
# crashing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
# Report which TensorFlow build the script is running against.
print("You have TensorFlow version", tf.__version__)

# Load the labelled dataset; the 'post' column is the model input and the
# 'tags' column is the class label.
data = pd.read_csv("dataset_1.csv")
data.head()  # return value is discarded when this runs as a plain script
print(data['tags'].value_counts())

# Split by row position: the first 80% of rows train, the remainder tests.
train_size = int(len(data) * 0.8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

train_posts, test_posts = data['post'][:train_size], data['post'][train_size:]
train_tags, test_tags = data['tags'][:train_size], data['tags'][train_size:]
# Tokenizer vocabulary limit (num_words); also used as the model input width.
max_words = 5000

# Word-level tokenizer whose vocabulary is built from the training posts only,
# so nothing from the test split influences the features.
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)
x_train, x_test = (
    tokenize.texts_to_matrix(posts) for posts in (train_posts, test_posts)
)

# Map tag strings to integer ids (learned from the training tags), then
# one-hot encode those ids for the categorical cross-entropy loss.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)
num_classes = y_train.max() + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

# Shape report for a quick sanity check of the prepared arrays.
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# Training hyper-parameters.
batch_size = 64
epochs = 5

# Feed-forward classifier: one 512-unit ReLU hidden layer with dropout,
# followed by a softmax layer with one output per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train, passing validation_split=0.1 so Keras reports validation metrics.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

# Evaluate on the held-out test rows; score is [loss, accuracy].
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])
# Class names, indexed by the integer ids the label encoder assigned.
text_labels = encoder.classes_

# Optional sanity check (disabled): show predictions for the first 10 test
# posts next to their true labels.
# for i in range(10):
#     prediction = model.predict(np.array([x_test[i]]))
#     predicted_label = text_labels[np.argmax(prediction)]
#     print(test_posts.iloc[i][:50], "...")
#     print('Actual label:' + test_tags.iloc[i])
#     print("Predicted label: " + predicted_label + "\n")

# Classify one ad-hoc sentence with the trained model.
string = 'naplata propisanih obveza'

# The raw text must go through texts_to_matrix, wrapped in a list so it is
# treated as one document, to get the same features the model was trained on.
# sequences_to_matrix expects lists of word indices, not a string.
x_test2 = tokenize.texts_to_matrix([string])
prediction2 = model.predict(x_test2)

# prediction2 holds a single row of class probabilities; argmax picks the
# most likely class id, which is then mapped back to its tag name.
predicted_label2 = text_labels[np.argmax(prediction2)]
print("Predicted label: " + predicted_label2 + "\n")
Add Comment
Please, Sign In to add comment