Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# --- Input columns: the two product-title corpora and their match labels ---
docs1 = df.emag_name
docs2 = df.competitor_name
matches = df.Match

# --- Integer-encode both corpora through ONE shared vocabulary ---
# Fitting the same tokenizer on both columns ensures identical word -> id
# mapping for the two model channels.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs1)
tokenizer.fit_on_texts(docs2)
vocabulary_size = len(tokenizer.word_counts) + 1  # +1 reserves id 0 for padding
encoded_docs1 = tokenizer.texts_to_sequences(docs1)
encoded_docs2 = tokenizer.texts_to_sequences(docs2)
encoded_docs = [left + right for left, right in zip(encoded_docs1, encoded_docs2)]

# --- Pad every sequence to the longest title seen in either corpus ---
max_sentence_len1 = max(len(seq) for seq in encoded_docs1)
max_sentence_len2 = max(len(seq) for seq in encoded_docs2)
max_length = max(max_sentence_len1, max_sentence_len2)
padded_docs1 = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')
padded_docs2 = pad_sequences(encoded_docs2, maxlen=max_length, padding='post')
#%%
# Two parallel CNN branches read the same padded title encoding; they differ
# only in convolution window (4 vs 6 tokens), then merge into one classifier.

def _cnn_branch(input_name, window):
    """One channel: input -> embedding -> conv(window) -> dropout -> pool -> flatten."""
    branch_in = Input(shape=(max_length,), name=input_name)
    x = Embedding(vocabulary_size, 100, weights=[embedding_matrix])(branch_in)
    x = Conv1D(filters=32, kernel_size=window, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = MaxPooling1D()(x)
    return branch_in, Flatten()(x)

# channel 1
inputs1, flat1 = _cnn_branch('in1', 4)
# channel 2
inputs2, flat2 = _cnn_branch('in2', 6)
# merge
merged = concatenate([flat1, flat2])
# interpretation head: small dense layer, then a sigmoid match probability
dense1 = Dense(10, activation='relu')(merged)
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize
print(model.summary())
# plot_model(model, show_shapes=True, to_file='multichannel.png')
#%%
# split — a SINGLE train_test_split call keeps both input channels and the
# labels aligned row-for-row.  The original code called train_test_split
# twice (once per channel) with no fixed random_state, so each call shuffled
# independently and train_data2 no longer described the same product pairs
# as train_data1 / train_labels.
labels = np.array(matches)
(train_data1, test_data1,
 train_data2, test_data2,
 train_labels, test_labels) = train_test_split(
    padded_docs1, padded_docs2, labels, test_size=0.3)
#%%
# fit the model on the aligned channel pair
model.fit({'in1': train_data1, 'in2': train_data2}, train_labels, epochs=50, verbose=1)
# evaluate the model on the held-out pairs
loss, accuracy = model.evaluate({'in1': test_data1, 'in2': test_data2}, test_labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))
- #%%
def predict(title1, title2):
    """Score a pair of raw product-title strings with the trained model.

    Each title goes through the same tokenizer and post-padding used for the
    training data, then both channels are fed to the two-input model.
    Returns the model's sigmoid output — the predicted match probability
    (array of shape (1, 1)).
    """
    words1 = text_to_word_sequence(title1)
    words2 = text_to_word_sequence(title2)
    seq1 = tokenizer.texts_to_sequences([words1])
    seq2 = tokenizer.texts_to_sequences([words2])
    # pad_sequences already returns a (1, max_length) array per channel.
    pad1 = pad_sequences(seq1, maxlen=max_length, padding='post')
    pad2 = pad_sequences(seq2, maxlen=max_length, padding='post')
    # Keep the batch dimension.  The original stripped it (`tit1 = tit1[0]`),
    # handing the model 1-D arrays that do not match the declared
    # Input(shape=(max_length,)) spec of a batched (1, max_length) sample.
    return model.predict([pad1, pad2])
# Smoke-test the matcher: one plausible match, then one obvious non-match.
_ = predict('Telefon mobil Apple iPhone 4, 8GB, White',
            'iPhone 4 APPLE, 8GB, 3.5", 5Mp, Bluetooth, alb')
_ = predict('GUESS JEANS, Tricou in dungi longline fit, Albastru royal/Alb, XL',
            'Vin rosu demisec Pelin Urlati 0.75L')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement