Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- import keras.backend as K
- import multiprocessing
- import tensorflow as tf
- import numpy as np
- import h5py
- from gensim.models.word2vec import Word2Vec
- from gensim.utils import simple_preprocess
- from gensim.parsing.preprocessing import STOPWORDS
- from gensim.parsing.porter import PorterStemmer
- from keras.callbacks import EarlyStopping
- from keras.models import Sequential
- from keras.layers.core import Dense, Dropout, Flatten
- from keras.layers.convolutional import Conv1D
- from keras.optimizers import Adam
- from sklearn.metrics import precision_recall_fscore_support, classification_report
- from sklearn.model_selection import KFold
# Seed NumPy's RNG so runs are reproducible.
np.random.seed(1000)

use_gpu = True

# Configure a TensorFlow 1.x session: one op-parallelism thread per CPU
# core, soft device placement, and a single GPU only when requested.
n_threads = multiprocessing.cpu_count()
config = tf.ConfigProto(
    intra_op_parallelism_threads=n_threads,
    inter_op_parallelism_threads=n_threads,
    allow_soft_placement=True,
    device_count={'CPU': 1, 'GPU': 1 if use_gpu else 0},
)
session = tf.Session(config=config)
# Make Keras use this explicitly-configured session.
K.set_session(session)
dataset = '/tmp/test_dataset.txt'

corpus = []
labels = []

# Parse the dataset: each line is "<sentiment> <text>", where sentiment is
# 0 = Negative, 1 = Positive and the text may be wrapped in double quotes.
counter = 0
with open(dataset, 'r') as file_id:  # `with` guarantees the file is closed
    for line in file_id:
        counter += 1
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(int(line[0]))
        # Text: drop the label + separator and the trailing newline
        # (the newline previously masked the closing-quote check below).
        text = line[2:].rstrip('\n')
        if text.startswith('"'):
            text = text[1:]
        if text.endswith('"'):
            # BUG FIX: the original did `text[::-1]`, which REVERSES the
            # whole text instead of stripping the trailing quote.
            text = text[:-1]
        corpus.append(text.lower())

# Cap the corpus at one million samples.
corpus = corpus[:1000000]
labels = labels[:1000000]
corpus_size = len(corpus)

print('Corpus size: {}'.format(len(corpus)))
# BUG FIX: the original printed corpus[1]/corpus[2] (2nd and 3rd samples)
# while labeling them "1st" and "2nd"; indices 0 and 1 are correct.
print('1st text {}'.format(corpus[0]))
print('label {}'.format(labels[0]))
print('2nd text {}'.format(corpus[1]))
print('label {}'.format(labels[1]))
# Tokenize and stem each text, dropping stopwords and noise tokens
# (mentions, hashtags, HTML line breaks, URLs).
stemmer = PorterStemmer()
tokenized_corpus = []
# Prefixes that mark tokens to discard; str.startswith accepts a tuple.
_noise_prefixes = ('@', '#', '<br', 'http')
for text in corpus:
    # BUG FIX: the original joined the checks with `or`, which is always
    # true (a token cannot start with '@' AND '#' at once), so no token
    # was ever filtered out. All conditions must hold, i.e. `and`.
    tokens = [stemmer.stem(t) for t in simple_preprocess(text)
              if t not in STOPWORDS and not t.startswith(_noise_prefixes)]
    tokenized_corpus.append(tokens)
# Gensim Word2Vec model hyper-parameters: embedding dimensionality and
# context-window width. `size` is reused later for the input tensor and
# the Conv1D input_shape.
size = 512
window = 10

# Create Word2Vec on the tokenized corpus.
# NOTE(review): this is the gensim 3.x API — `size` and `iter` were
# renamed `vector_size` and `epochs` in gensim 4; confirm the pinned
# gensim version before upgrading.
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=size,
                    window=window,
                    negative=20,  # negative-sampling noise words per sample
                    iter=50,      # training epochs over the corpus
                    seed=1000,
                    workers=multiprocessing.cpu_count())

# Keep only the trained word vectors, then free the full model and the
# raw corpus to reduce peak memory.
X_vecs = word2vec.wv
del word2vec
del corpus
# Report average and maximum tokenized-text lengths.
lengths = [len(tokens) for tokens in tokenized_corpus]
max_length = max(lengths, default=0)
avg_length = float(sum(lengths))

print('Length tokinzed corpus: {}'.format(len(tokenized_corpus)))
print('Average text length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max text length: {}'.format(max_length))
# Text max length (number of tokens) kept per sample; longer texts are
# truncated, shorter ones stay zero-padded.
max_text_length = 15

# Generate random indexes.
# NOTE(review): drawing corpus_size values from range(corpus_size) without
# replacement is just a permutation, and wrapping it in set() discards
# that order anyway — iteration over the set covers every index exactly
# once, in arbitrary order.
indexes = set(np.random.choice(len(tokenized_corpus), corpus_size, replace=False))

# X: (samples, max_text_length, size) tensor of word vectors, zero-padded.
# Y: one-hot sentiment targets ([1, 0] = negative, [0, 1] = positive).
X = np.zeros((corpus_size, max_text_length, size), dtype=K.floatx())
Y = np.zeros((corpus_size, 2), dtype=np.int32)

for i, index in enumerate(indexes):
    for t, token in enumerate(tokenized_corpus[index]):
        if t >= max_text_length:
            # Truncate texts longer than max_text_length tokens.
            break
        if token not in X_vecs:
            # Token has no trained embedding — leave its row zeroed.
            continue
        X[i, t, :] = X_vecs[token]
    Y[i, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
def create_model():
    """Build and compile the 1-D CNN sentiment classifier.

    The network takes (max_text_length, size) word-vector sequences and
    emits a 2-way softmax over negative/positive sentiment. Returns the
    compiled Keras model.
    """
    model = Sequential()
    # First convolutional stack: four kernel-3 Conv1D layers.
    model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                     input_shape=(max_text_length, size)))
    for _ in range(3):
        model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
    model.add(Dropout(0.25))
    # Second convolutional stack: four kernel-2 Conv1D layers.
    for _ in range(4):
        model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
    model.add(Dropout(0.25))
    # Dense classification head.
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(256, activation='elu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001, decay=1e-6),
                  metrics=['accuracy'])
    print(model.summary())
    return model
print('loading model')
model = create_model()

# Restore previously-trained weights.
model.load_weights('/tmp/Model_weights_sentimet_without_sw_and_with_stem.h5')

# Keras-reported accuracy on the full (X, Y) set.
scores = model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

Y_pred = model.predict(X)

# Collapse one-hot targets / softmax outputs to class ids
# (0 = negative, 1 = positive); ties go to class 1, matching the
# original `row[0] > row[1]` comparison.
Y_label = [0 if row[0] > row[1] else 1 for row in Y]
Y_pred_label = [0 if row[0] > row[1] else 1 for row in Y_pred]

# BUG FIX: Y_label / Y_pred_label hold ints, but the original passed the
# STRING labels ['0', '1'], which match no sample and zero out every
# per-class metric.
precision, recall, fscore, support = precision_recall_fscore_support(
    Y_label, Y_pred_label, labels=[0, 1])
print(classification_report(Y_label, Y_pred_label))
print('precision', precision)
print('recall', recall)
print('fscore', fscore)
print('support', support)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement