import os

import numpy as np
import yaml
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# Single tokenizer shared by questions and answers (vocabulary capped at 5000 words).
tokenizer = Tokenizer(num_words=5000)

# Each YAML file in raw_data/ contains a 'conversations' list: a question
# followed by one or more replies.
dir_path = 'raw_data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    stream = open(dir_path + os.sep + filepath, 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # Several replies: join them into one answer string.
            questions.append(con[0])
            replies = con[1:]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append(str(ans) + " end")
        elif len(con) > 1:
            questions.append(con[0])
            answers.append(str(con[1]) + " end")

# Decoder inputs get a leading 'start' token; targets keep only the trailing 'end'.
a = []
for i in answers:
    a.append("start " + i)

tokenizer.fit_on_texts(questions + a)

# padding='post' keeps decoder inputs and targets aligned (target[t] is the token
# following input[t]) and matches the post-padding used by str_to_tokens at inference.
encoder_input_data = pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=22, padding='post')
decoder_input_data = pad_sequences(tokenizer.texts_to_sequences(a), maxlen=74, padding='post')
decoder_target_data = to_categorical(pad_sequences(tokenizer.texts_to_sequences(answers), maxlen=74, padding='post'))
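# Note (added): to_categorical one-hot encodes every decoder target timestep, so
# decoder_target_data has shape (num_pairs, 74, num_tokens) and can get large for
# big vocabularies; sparse_categorical_crossentropy on integer targets is a common
# lighter-weight alternative, left out here to keep the original setup.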
num_tokens = len(tokenizer.word_index) + 1
word_dict = tokenizer.word_index
max_question_len = encoder_input_data.shape[1]
max_answer_len = decoder_input_data.shape[1]

print('Max length of question is {}'.format(max_question_len))
print('Max length of answer is {}'.format(max_answer_len))
print(num_tokens)
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)
# Encoder: embedding + LSTM; only the final hidden and cell states are kept
# and handed to the decoder as its initial state.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embedding + LSTM trained with teacher forcing, followed by a softmax
# over the whole vocabulary.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_tokens, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')
model.summary()

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=100, epochs=100)
model.save('model.h5')
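# Hedged sketch (assumption: a later session that redoes the same preprocessing):
# the network saved above can be restored without retraining via
#     model = tf.keras.models.load_model('model.h5')
# This script keeps using the in-memory layers, so no reload is needed here.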
def make_inference_models():
    # Encoder model: maps a tokenized question to its final LSTM states.
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Decoder model: takes the previous token plus the current states and
    # returns the next-token distribution together with the updated states.
    decoder_state_input_h = tf.keras.layers.Input(shape=(200,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    return encoder_model, decoder_model


def str_to_tokens(sentence: str):
    # Convert a raw sentence into a post-padded sequence of token ids,
    # skipping any word the tokenizer has never seen.
    words = sentence.lower().split()
    tokens_list = [word_dict[word] for word in words if word in word_dict]
    return pad_sequences([tokens_list], maxlen=max_question_len, padding='post')
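# Illustrative example (hypothetical input): str_to_tokens('how are you') returns a
# post-padded id array of shape (1, max_question_len), ready for enc_model.predict.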
enc_model, dec_model = make_inference_models()

for _ in range(10):
    # Encode the question, then decode one token at a time, starting from 'start'.
    states_values = enc_model.predict(str_to_tokens(input('Enter question : ')))
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        # Greedy decoding: pick the highest-probability word at the last timestep.
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        for word, index in word_dict.items():
            if sampled_word_index == index:
                decoded_translation += ' {}'.format(word)
                sampled_word = word
        # Stop at the 'end' token or once the answer exceeds the maximum length.
        if sampled_word == 'end' or len(decoded_translation.split()) > max_answer_len:
            stop_condition = True
        # Feed the sampled token and the updated states back into the decoder.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]
    print(decoded_translation)
Enter question : how are you
capacity normally again again again often often often ... (the decoder keeps repeating 'often' until the length limit stops it)