Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from nlp_processor import Tokenizer as NepaliTokenizer, NepaliStemmer
from postagger.train_lstm_generator import POS
# Module-level resources, created once at import time.

# Word2Vec model read from a path relative to the current working directory.
# NOTE(review): not referenced by the functions visible in this file;
# presumably callers pass it to data_generator inside `vectorizer` - confirm.
nepali2vec = Word2Vec.load("nep2vec_snowball_stemmer.model")
# Shared tokenizer and stemmer instances used by data_generator.
nepali_tokenizer = NepaliTokenizer()
# `NepaliStemmer` as imported is a namespace exposing a class of the same name.
nepali_stemmer = NepaliStemmer.NepaliStemmer()
def vectorize(source, vectorizer, seq_length=100, feature_length=100):
    """Turn a token sequence into a fixed-size matrix of word vectors.

    Row ``k`` of the result holds the embedding of the ``k``-th token of
    ``source``. Tokens missing from the embedding vocabulary leave their row
    as zeros, and tokens at position ``seq_length`` or later are dropped.

    Args:
        source: Iterable of tokens.
        vectorizer: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` (for example a gensim ``Word2Vec`` model).
        seq_length: Number of rows in the returned matrix.
        feature_length: Number of columns; must match the embedding size.

    Returns:
        A ``numpy`` array of shape ``(seq_length, feature_length)``.
    """
    embeddings = vectorizer.wv
    matrix = np.zeros([seq_length, feature_length])
    for position, token in enumerate(source):
        if position >= seq_length:
            # Nothing past this point can be stored, so stop scanning.
            break
        # Membership on the keyed vectors themselves works on gensim 3.x and
        # 4.x; the former ``wv.vocab`` attribute was removed in gensim 4.
        if token in embeddings:
            matrix[position] = embeddings[token]
    return matrix
def data_generator(data, labels, vectorizer, pos_encoder, batch_size=16,
                   seq_length=100, feature_length=100):
    """Yield training batches of word vectors, encoded POS tags and labels.

    Each sentence is POS-tagged, tokenized and stemmed with the module-level
    ``nepali_tokenizer`` / ``nepali_stemmer``, then vectorized.

    Args:
        data: Iterable of raw sentences.
        labels: Iterable of labels aligned with ``data``. A label equal to
            the string ``'POS'`` becomes class 0; any other value class 1.
            Iteration stops at the shorter of ``data`` and ``labels``.
        vectorizer: ``(vectorize_fn, embedding_model)`` pair, called as
            ``vectorize_fn(stemmed_tokens, embedding_model)``.
        pos_encoder: ``(encode_fn, source)`` pair, called as
            ``encode_fn(pos_tags, source)``.
        batch_size: Number of samples per yielded batch.
        seq_length: Length the encoded POS sequences are padded/truncated to.
        feature_length: Not used by this function; kept so existing callers
            keep working.

    Yields:
        ``([x, pos], y)`` tuples, where ``x`` stacks the sentence vectors,
        ``pos`` holds the padded POS encodings and ``y`` the one-hot labels.
        The final batch may be smaller than ``batch_size``; an empty batch is
        never yielded.
    """
    pos_tagger = POS()
    pos_tagger.build_model(layers=2)
    pos_tagger.loader('postagger/pos_model_9808_9742_.h5')

    # Unpack the (callable, resource) pairs once instead of on every sample;
    # this also avoids shadowing the module-level ``vectorize`` function.
    encode_pos, pos_source = pos_encoder
    vectorize_fn, embedding_model = vectorizer

    def make_batch(xs, pos_tags, ys):
        # Pad/truncate the POS encodings at the end to a common length.
        padded = pad_sequences(pos_tags, maxlen=seq_length,
                               padding='post', truncating='post')
        return [np.array(xs), np.array(padded)], np.array(ys)

    batch_x, batch_y, batch_pos = [], [], []
    for current_x, current_y in zip(data, labels):
        # Label encoding. Compare by value: ``is`` checks object identity and
        # is not reliable for strings that come from data.
        class_index = 0 if current_y == 'POS' else 1
        y = to_categorical(class_index, num_classes=2, dtype='float32')

        # POS prediction: keep the second element of every predicted item.
        # NOTE(review): assumes predict_pos returns (token, tag) pairs - confirm.
        pos_prediction = tuple(item[1] for item in pos_tagger.predict_pos(current_x))
        pos_pred_encoded = encode_pos(pos_prediction, pos_source)

        # Vectorize the tokenized and stemmed sentence.
        tokenized_x = nepali_tokenizer.tokenize_words([current_x])
        stemmed_x = nepali_stemmer.stem_sentence(tokenized_x)
        # NOTE(review): seq_length / feature_length are not forwarded, so the
        # callable's own defaults decide the shape of each vector - confirm
        # this is intended when seq_length differs from that default.
        vector = vectorize_fn(stemmed_x, embedding_model)

        batch_x.append(vector)
        batch_y.append(y)
        batch_pos.append(pos_pred_encoded)

        if len(batch_x) == batch_size:
            yield make_batch(batch_x, batch_pos, batch_y)
            batch_x, batch_y, batch_pos = [], [], []

    # Yield the remaining samples, if any; skipping this when nothing is left
    # avoids emitting an empty trailing batch.
    if batch_x:
        yield make_batch(batch_x, batch_pos, batch_y)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement