Advertisement
Guest User

Untitled

a guest
Feb 28th, 2020
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.29 KB | None | 0 0
  1. docs1 = df.emag_name
  2. docs2 = df.competitor_name
  3. matches = df.Match
  4.  
  5.  
  6. # encoding
  7. tokenizer = Tokenizer()
  8. tokenizer.fit_on_texts(docs1)
  9. tokenizer.fit_on_texts(docs2)
  10.  
  11. vocabulary_size = len(tokenizer.word_counts)+1
  12. encoded_docs1 = tokenizer.texts_to_sequences(docs1)
  13. encoded_docs2 = tokenizer.texts_to_sequences(docs2)
  14. encoded_docs = [encod_1 + encod_2 for (encod_1, encod_2) in zip(encoded_docs1, encoded_docs2)]
  15. # padding
  16. max_sentence_len1 = max([len(doc) for doc in encoded_docs1])
  17. max_sentence_len2 = max([len(doc) for doc in encoded_docs2])
  18. max_length = max(max_sentence_len1, max_sentence_len2)
  19. padded_docs1 = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')
  20. padded_docs2 = pad_sequences(encoded_docs2, maxlen=max_length, padding='post')
  21.  
  22. #%%
  23.  
  24. # channel 1
  25. inputs1 = Input(shape=(max_length,), name='in1')
  26. embedding1 = Embedding(vocabulary_size, 100, weights=[embedding_matrix])(inputs1)
  27. conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
  28. drop1 = Dropout(0.5)(conv1)
  29. pool1 = MaxPooling1D()(drop1)
  30. flat1 = Flatten()(pool1)
  31. # channel 2
  32. inputs2 = Input(shape=(max_length,), name='in2')
  33. embedding2 = Embedding(vocabulary_size, 100, weights=[embedding_matrix])(inputs2)
  34. conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
  35. drop2 = Dropout(0.5)(conv2)
  36. pool2 = MaxPooling1D()(drop2)
  37. flat2 = Flatten()(pool2)
  38. # merge
  39. merged = concatenate([flat1, flat2])
  40. # interpretation
  41. dense1 = Dense(10, activation='relu')(merged)
  42. outputs = Dense(1, activation='sigmoid')(dense1)
  43. model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  44. # compile
  45. model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  46. # summarize
  47. print(model.summary())
  48. # plot_model(model, show_shapes=True, to_file='multichannel.png')
  49.  
  50. #%%
  51.  
  52. # split
  53. labels = np.array(matches)
  54. train_data1, test_data1, train_labels, test_labels = train_test_split(padded_docs1, labels, test_size=0.3)
  55. train_data2, test_data2, _, _ = train_test_split(padded_docs2, labels, test_size=0.3)
  56.  
  57. #%%
  58. # fit the model
  59. model.fit({'in1':train_data1, 'in2': train_data2}, train_labels, epochs=50, verbose=1)
  60. # evaluate the model
  61. loss, accuracy = model.evaluate({'in1':test_data1, 'in2': test_data2}, test_labels, verbose=0)
  62. print('Accuracy: %f' % (accuracy*100))
  63.  
  64.  
  65. #%%
  66.  
  67. def predict(title1, title2):
  68.  
  69. tit1 = text_to_word_sequence(title1)
  70. tit2 = text_to_word_sequence(title2)
  71.  
  72. print(tit1)
  73. print(tit2)
  74.  
  75. tit1 = tokenizer.texts_to_sequences([tit1])
  76. tit2 = tokenizer.texts_to_sequences([tit2])
  77.  
  78. print(tit1)
  79. print(tit2)
  80.  
  81. print(type(tit1))
  82. print(type(tit2))
  83.  
  84. tit1 = pad_sequences(tit1, maxlen=max_length, padding='post')
  85. tit2 = pad_sequences(tit2, maxlen=max_length, padding='post')
  86.  
  87. print(tit1)
  88. print(tit2)
  89.  
  90. tit1 = tit1[0]
  91. tit2 = tit2[0]
  92. tit1 = np.array(tit1)
  93. tit2 = np.array(tit2)
  94.  
  95. print(tit1)
  96. print(tit2)
  97. print(tit1.shape)
  98. print(tit2.shape)
  99.  
  100. res = model.predict([tit1, tit2])
  101. return res
  102. predict('Telefon mobil Apple iPhone 4, 8GB, White', 'iPhone 4 APPLE, 8GB, 3.5", 5Mp, Bluetooth, alb')
  103. predict('GUESS JEANS, Tricou in dungi longline fit, Albastru royal/Alb, XL', 'Vin rosu demisec Pelin Urlati 0.75L')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement