Advertisement
Guest User

Untitled

a guest
Jul 24th, 2019
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.21 KB | None | 0 0
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from nlp_processor import Tokenizer as NepaliTokenizer, NepaliStemmer
from postagger.train_lstm_generator import POS
  7.    
  8. nepali2vec = Word2Vec.load("nep2vec_snowball_stemmer.model")
  9. nepali_tokenizer = NepaliTokenizer()
  10. nepali_stemmer = NepaliStemmer.NepaliStemmer()
  11.  
  12. def vectorize(source, vectorizer, seq_length = 100, feature_length = 100):
  13.   train_words = np.zeros([seq_length,feature_length])        
  14.   for k, token in enumerate(source):
  15.     if token in vectorizer.wv.vocab:
  16.       if k >= seq_length:
  17.         continue
  18.       train_words[k] = vectorizer.wv[token]
  19.   return train_words
  20.  
  21. def data_generator(data, labels, vectorizer, pos_encoder, batch_size=16, seq_length=100, feature_length=100):
  22.   pos_tagger = POS()
  23.   pos_tagger.build_model(layers=2)
  24.   pos_tagger.loader('postagger/pos_model_9808_9742_.h5')
  25.   pos_model = pos_tagger
  26.   ret_x, ret_y, ret_pos = [], [], []
  27.   for i in range(len(data)):
  28.     current_x = data[i]
  29.     current_y = labels[i]
  30.  
  31.     # label encoding    
  32.     y = 0 if current_y is 'POS' else 1
  33.     y = to_categorical(y , num_classes=2, dtype='float32')
  34.  
  35.     # for pos prediction    
  36.     pos_prediction = pos_model.predict_pos(current_x)
  37.     pos_prediction = list(zip(*pos_prediction))[1]
  38.     (encoder, source) = pos_encoder
  39.     pos_pred_encoded = encoder(pos_prediction, source)
  40.  
  41.     # veectorize sentences    
  42.     tokenized_x = nepali_tokenizer.tokenize_words([current_x])
  43.     stemmed_x = nepali_stemmer.stem_sentence(tokenized_x)
  44.  
  45.     (vectorize, nep2vec) = vectorizer
  46.     vector = vectorize(stemmed_x, nep2vec)
  47.  
  48.     ret_x.append(vector)
  49.     ret_y.append(y)
  50.     ret_pos.append(pos_pred_encoded)
  51.  
  52.     if i % batch_size == batch_size - 1:
  53.       ret_pos = pad_sequences(ret_pos, maxlen=seq_length, padding='post', truncating='post')
  54.       yield([ np.array(ret_x), np.array(ret_pos) ], np.array(ret_y))
  55.       ret_x, ret_y, ret_pos = [], [], []
  56.  
  57.   # yield the remaining results  
  58.   ret_pos = pad_sequences(ret_pos, maxlen=seq_length, padding='post', truncating='post')
  59.   yield([ np.array(ret_x), np.array(ret_pos) ], np.array(ret_y))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement