Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# --- Input columns: the two product-title corpora and their match labels ---
docs1 = df.emag_name
docs2 = df.competitor_name
matches = df.Match

# --- Integer-encode both corpora through ONE shared vocabulary ---
# Fitting the same tokenizer on both columns ensures identical word -> id
# mapping for the two model channels.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs1)
tokenizer.fit_on_texts(docs2)
vocabulary_size = len(tokenizer.word_counts) + 1  # +1 reserves id 0 for padding
encoded_docs1 = tokenizer.texts_to_sequences(docs1)
encoded_docs2 = tokenizer.texts_to_sequences(docs2)
encoded_docs = [left + right for left, right in zip(encoded_docs1, encoded_docs2)]

# --- Pad every sequence to the longest title seen in either corpus ---
max_sentence_len1 = max(len(seq) for seq in encoded_docs1)
max_sentence_len2 = max(len(seq) for seq in encoded_docs2)
max_length = max(max_sentence_len1, max_sentence_len2)
padded_docs1 = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')
padded_docs2 = pad_sequences(encoded_docs2, maxlen=max_length, padding='post')
#%%
# Two parallel CNN branches read the same padded title encoding; they differ
# only in convolution window (4 vs 6 tokens), then merge into one classifier.

def _cnn_branch(input_name, window):
    """One channel: input -> embedding -> conv(window) -> dropout -> pool -> flatten."""
    branch_in = Input(shape=(max_length,), name=input_name)
    x = Embedding(vocabulary_size, 100, weights=[embedding_matrix])(branch_in)
    x = Conv1D(filters=32, kernel_size=window, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = MaxPooling1D()(x)
    return branch_in, Flatten()(x)

# channel 1
inputs1, flat1 = _cnn_branch('in1', 4)
# channel 2
inputs2, flat2 = _cnn_branch('in2', 6)
# merge
merged = concatenate([flat1, flat2])
# interpretation head: small dense layer, then a sigmoid match probability
dense1 = Dense(10, activation='relu')(merged)
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize
print(model.summary())
# plot_model(model, show_shapes=True, to_file='multichannel.png')
#%%
# split — a SINGLE train_test_split call keeps both input channels and the
# labels aligned row-for-row.  The original code called train_test_split
# twice (once per channel) with no fixed random_state, so each call shuffled
# independently and train_data2 no longer described the same product pairs
# as train_data1 / train_labels.
labels = np.array(matches)
(train_data1, test_data1,
 train_data2, test_data2,
 train_labels, test_labels) = train_test_split(
    padded_docs1, padded_docs2, labels, test_size=0.3)
#%%
# fit the model on the aligned channel pair
model.fit({'in1': train_data1, 'in2': train_data2}, train_labels, epochs=50, verbose=1)
# evaluate the model on the held-out pairs
loss, accuracy = model.evaluate({'in1': test_data1, 'in2': test_data2}, test_labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))
- #%%
def predict(title1, title2):
    """Score a pair of raw product-title strings with the trained model.

    Each title goes through the same tokenizer and post-padding used for the
    training data, then both channels are fed to the two-input model.
    Returns the model's sigmoid output — the predicted match probability
    (array of shape (1, 1)).
    """
    words1 = text_to_word_sequence(title1)
    words2 = text_to_word_sequence(title2)
    seq1 = tokenizer.texts_to_sequences([words1])
    seq2 = tokenizer.texts_to_sequences([words2])
    # pad_sequences already returns a (1, max_length) array per channel.
    pad1 = pad_sequences(seq1, maxlen=max_length, padding='post')
    pad2 = pad_sequences(seq2, maxlen=max_length, padding='post')
    # Keep the batch dimension.  The original stripped it (`tit1 = tit1[0]`),
    # handing the model 1-D arrays that do not match the declared
    # Input(shape=(max_length,)) spec of a batched (1, max_length) sample.
    return model.predict([pad1, pad2])
# Smoke-test the matcher: one plausible match, then one obvious non-match.
_ = predict('Telefon mobil Apple iPhone 4, 8GB, White',
            'iPhone 4 APPLE, 8GB, 3.5", 5Mp, Bluetooth, alb')
_ = predict('GUESS JEANS, Tricou in dungi longline fit, Albastru royal/Alb, XL',
            'Vin rosu demisec Pelin Urlati 0.75L')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement