import json
import math

from numpy import array
from numpy.random import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint


def create_tokenizer(lines):
    # Fit a word-level tokenizer on the given lines.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_length(lines):
    # Length in words of the longest line.
    return max(len(line.split()) for line in lines)


def encode_sequences(tokenizer, length, lines):
    # Map words to integer ids and pad every sequence to the same length.
    x = tokenizer.texts_to_sequences(lines)
    return pad_sequences(x, maxlen=length, padding='post')


def encode_output(sequences, vocab_size):
    # One-hot encode the target sequences for categorical cross-entropy.
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)


def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # Encoder-decoder LSTM: the encoder compresses the source sentence into a
    # fixed-size vector, and RepeatVector feeds that vector to the decoder at
    # every target timestep.
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model


def create_model(data, divider):
    train, test = data[:divider], data[divider:]
    # Tokenizers and max lengths are computed over the full dataset so that
    # train and test share the same vocabulary and padding.
    kidz_tokenizer = create_tokenizer(data[:, 0])
    kidz_vocab_size = len(kidz_tokenizer.word_index) + 1
    kidz_length = max_length(data[:, 0])
    os_tokenizer = create_tokenizer(data[:, 1])
    os_vocab_size = len(os_tokenizer.word_index) + 1
    os_length = max_length(data[:, 1])
    train_x = encode_sequences(os_tokenizer, os_length, train[:, 1])
    train_y = encode_sequences(kidz_tokenizer, kidz_length, train[:, 0])
    train_y = encode_output(train_y, kidz_vocab_size)
    test_x = encode_sequences(os_tokenizer, os_length, test[:, 1])
    test_y = encode_sequences(kidz_tokenizer, kidz_length, test[:, 0])
    test_y = encode_output(test_y, kidz_vocab_size)
    model = define_model(os_vocab_size, kidz_vocab_size, os_length, kidz_length, 256)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    print(model.summary())
    out_model = 'model.h5'
    # Keep only the weights with the lowest validation loss.
    cp = ModelCheckpoint(out_model, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    model.fit(train_x, train_y, epochs=30, batch_size=64, validation_data=(test_x, test_y), callbacks=[cp], verbose=2)


with open('../scraper/data.json') as jf:
    data = array(json.load(jf))
shuffle(data)
divider = math.floor(len(data) * 0.9)  # 90/10 train/test split
create_model(data, divider)
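Note: the data[:, 0] / data[:, 1] indexing implies that ../scraper/data.json is a list of two-element string pairs, e.g. [["kidz sentence", "os sentence"], ...]. The script above only trains and saves model.h5; below is a minimal greedy-decoding sketch of how that file could be loaded for inference. The translate helper and its tokenizer arguments are illustrative (the script never persists its tokenizers, so they would have to be rebuilt with create_tokenizer exactly as during training); load_model and predict are standard Keras calls.

# Minimal inference sketch, assuming the same tokenizers and lengths as
# during training (rebuild them with create_tokenizer / max_length on the
# same data, since the paste above never saves them).
import numpy as np
from tensorflow.keras.models import load_model

model = load_model('model.h5')

def translate(sentence, os_tokenizer, os_length, kidz_tokenizer):
    # Encode the source sentence exactly as in training.
    x = encode_sequences(os_tokenizer, os_length, [sentence])
    # Greedy decode: take the most probable target word at each timestep.
    probs = model.predict(x)[0]
    id_to_word = {i: w for w, i in kidz_tokenizer.word_index.items()}
    words = [id_to_word.get(int(np.argmax(step))) for step in probs]
    # Index 0 is padding and has no word, so drop the Nones.
    return ' '.join(w for w in words if w)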