# Seq2seq chatbot: trains an encoder-decoder LSTM on YAML conversation files.
import os
import yaml
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import preprocessing

# Cap the vocabulary at the 5,000 most frequent words.
tokenizer = Tokenizer(num_words=5000)

# Load question/answer pairs from the YAML files in raw_data/.
dir_path = 'raw_data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    with open(dir_path + os.sep + filepath, 'rb') as stream:
        docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # Multi-line reply: join everything after the question into one answer.
            questions.append(con[0])
            replies = con[1:]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append(ans + ' end')
        elif len(con) > 1:
            questions.append(con[0])
            answers.append(str(con[1]) + ' end')

# Decoder inputs are the answers prefixed with a 'start' token;
# decoder targets are the same answers, terminated by 'end'.
a = []
for i in answers:
    a.append('start ' + i)

tokenizer.fit_on_texts(questions + a)
encoder_input_data = pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=22)
decoder_input_data = pad_sequences(tokenizer.texts_to_sequences(a), maxlen=74)
decoder_target_data = to_categorical(pad_sequences(tokenizer.texts_to_sequences(answers), maxlen=74))
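
# Memory note: to_categorical materializes a (num_samples, 74, num_tokens)
# one-hot array, which gets large fast. A common alternative (an assumption,
# not used in this paste) is to keep the integer targets and compile the model
# with loss='sparse_categorical_crossentropy' instead.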

# Note: word_index contains every word seen during fitting, not just the
# num_words kept by texts_to_sequences, so num_tokens may exceed 5000.
num_tokens = len(tokenizer.word_index) + 1
word_dict = tokenizer.word_index
max_question_len = encoder_input_data.shape[1]
max_answer_len = decoder_input_data.shape[1]

print('Max length of question is {}'.format(max_question_len))
print('Max length of answer is {}'.format(max_answer_len))
print(num_tokens)
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

# Encoder: embeds the question and keeps only the final LSTM states.
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: initialized with the encoder states, predicts the answer token by token.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(num_tokens, 200, mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_tokens, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=100, epochs=100)
model.save('model.h5')
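
# If the session is restarted, the trained network can be reloaded before
# rebuilding the inference models below (a sketch, using the file saved above):
#   model = tf.keras.models.load_model('model.h5')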


def make_inference_models():
    # Standalone encoder: question in, final LSTM states out.
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Standalone decoder: takes the previous token plus the current states,
    # returns the next-token distribution and the updated states.
    decoder_state_input_h = tf.keras.layers.Input(shape=(200,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

    return encoder_model, decoder_model


def str_to_tokens(sentence: str):
    # Map a raw sentence to a padded sequence of vocabulary indices.
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append(word_dict[word])
    return preprocessing.sequence.pad_sequences([tokens_list], maxlen=max_question_len, padding='post')
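# Caveat: word_dict[word] raises KeyError on any word the tokenizer never saw.
# A more forgiving variant (an assumption, not part of the original paste)
# would simply skip unknown words:
#   tokens_list = [word_dict[w] for w in words if w in word_dict]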


enc_model, dec_model = make_inference_models()

for _ in range(10):
    # Encode the question into the initial decoder states.
    states_values = enc_model.predict(str_to_tokens(input('Enter question : ')))
    # Seed the decoder with the 'start' token.
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        # Greedy decoding: always take the highest-probability token.
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        for word, index in word_dict.items():
            if sampled_word_index == index:
                decoded_translation += ' {}'.format(word)
                sampled_word = word
                break

        if sampled_word == 'end' or len(decoded_translation.split()) > max_answer_len:
            stop_condition = True

        # Feed the sampled token and the updated states back in for the next step.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    print(decoded_translation)
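
# The sample run below shows a common failure mode of greedy decoding on an
# undertrained model: one high-frequency token repeats until the
# max_answer_len cap stops the loop.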

Enter question : how are you

capacity normally again again again often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often often