import json
import math

from numpy import array
from numpy.random import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
def max_length(lines):
    # Longest line, measured in whitespace-separated tokens.
    return max(len(line.split()) for line in lines)
def encode_sequences(tokenizer, length, lines):
    # Map words to integer indices and pad every sequence to `length`.
    x = tokenizer.texts_to_sequences(lines)
    return pad_sequences(x, maxlen=length, padding='post')
def encode_output(sequences, vocab_size):
    # One-hot encode the target sequences for categorical_crossentropy.
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    # Encoder-decoder LSTM: the encoder compresses the source sequence into a
    # single vector, RepeatVector feeds that vector to the decoder once per
    # target timestep, and the TimeDistributed softmax emits one word per step.
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
def create_model(data, divider):
    # Split into train/test; vocabularies are built from the full dataset.
    train, test = data[:divider], data[divider:]
    kidz_tokenizer = create_tokenizer(data[:, 0])
    kidz_vocab_size = len(kidz_tokenizer.word_index) + 1
    kidz_length = max_length(data[:, 0])
    os_tokenizer = create_tokenizer(data[:, 1])
    os_vocab_size = len(os_tokenizer.word_index) + 1
    os_length = max_length(data[:, 1])
    # Inputs come from the "os" column; targets are the one-hot "kidz" column.
    train_x = encode_sequences(os_tokenizer, os_length, train[:, 1])
    train_y = encode_sequences(kidz_tokenizer, kidz_length, train[:, 0])
    train_y = encode_output(train_y, kidz_vocab_size)
    test_x = encode_sequences(os_tokenizer, os_length, test[:, 1])
    test_y = encode_sequences(kidz_tokenizer, kidz_length, test[:, 0])
    test_y = encode_output(test_y, kidz_vocab_size)
    model = define_model(os_vocab_size, kidz_vocab_size, os_length, kidz_length, 256)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    print(model.summary())
    # Keep only the checkpoint with the lowest validation loss.
    out_model = 'model.h5'
    cp = ModelCheckpoint(out_model, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    model.fit(train_x, train_y, epochs=30, batch_size=64, validation_data=(test_x, test_y), callbacks=[cp], verbose=2)
with open('../scraper/data.json') as jf:
    data = array(json.load(jf))
shuffle(data)
# Hold out the last 10% of the shuffled pairs for validation.
divider = math.floor(len(data) * 0.9)
create_model(data, divider)
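
# --- Usage sketch (an assumption, not part of the original paste): translating
# one source line with the trained model. It assumes `os_tokenizer`,
# `kidz_tokenizer`, and `os_length` are made available, e.g. by having
# create_model() return them; the helper name `translate` is hypothetical.
from numpy import argmax
from tensorflow.keras.models import load_model

def translate(model, src_tokenizer, tar_tokenizer, src_length, line):
    # Reverse the word index so predicted ids map back to words.
    index_to_word = {i: w for w, i in tar_tokenizer.word_index.items()}
    x = encode_sequences(src_tokenizer, src_length, [line])
    probs = model.predict(x, verbose=0)[0]  # shape: (tar_timesteps, tar_vocab)
    words = []
    for timestep in probs:
        idx = int(argmax(timestep))
        if idx == 0:  # 0 is the padding id; treat it as end of output
            break
        words.append(index_to_word.get(idx, ''))
    return ' '.join(words)

# best = load_model('model.h5')
# print(translate(best, os_tokenizer, kidz_tokenizer, os_length, 'example line'))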