Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- from gensim import corpora
- from gensim import models
- from scipy import spatial
- import numpy as np
- import csv
- import sys
def get_sentences(file_name):
    """Read a CSV file and return its rows as a list of lists of strings.

    Each CSV row is treated as one sentence; the row's fields are kept as
    a list in the order the CSV reader yields them.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row, each entry being the list of
        field strings produced by ``csv.reader`` for that row.
    """
    # newline='' is what the csv module asks for: it leaves line handling to
    # the reader so line breaks embedded in quoted fields survive unchanged.
    with open(file_name, 'r', newline='') as f:
        return list(csv.reader(f))
def create_gensim_bow(sentences):
    """Build a gensim bag-of-words corpus from tokenised sentences.

    Args:
        sentences: Re-iterable collection of token lists, one per sentence.
            It is traversed twice: once to build the dictionary and once to
            convert each sentence.

    Returns:
        A ``(corpus, num_of_words)`` tuple where ``corpus`` is a list holding
        the ``doc2bow`` result for each sentence and ``num_of_words`` is
        ``len(dictionary)``.
    """
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]
    return corpus, len(dictionary)
def apply_tfidf(corpus):
    """Fit a TF-IDF model on ``corpus`` and return the corpus transformed by it.

    Args:
        corpus: Bag-of-words corpus accepted by ``models.TfidfModel``.

    Returns:
        The result of indexing the fitted model with the same corpus.
    """
    tfidf = models.TfidfModel(corpus)
    weighted_corpus = tfidf[corpus]
    return weighted_corpus
def create_word_vectors(corpus, num_of_words):
    """Expand sparse ``(id, value)`` documents into a dense NumPy array.

    Args:
        corpus: Iterable of documents, each an iterable of pairs whose first
            item is a column index and whose second item is the value to
            store at that index.
        num_of_words: Length of every dense row.

    Returns:
        ``np.array`` built from one dense row per document; positions not
        mentioned by a document stay 0.
    """
    def densify(sparse_doc):
        # Start from an all-zero row and drop each value into its slot;
        # a repeated index keeps the last value seen.
        dense_row = [0] * num_of_words
        for entry in sparse_doc:
            dense_row[entry[0]] = entry[1]
        return dense_row

    return np.array([densify(document) for document in corpus])
def calculate_cos_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    The value is one minus the cosine distance reported by
    ``spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(vector1, vector2)
    return 1 - cosine_distance
def find_most_similar_id(target_id, word_vectors):
    """Find the row most similar to the row at ``target_id``.

    Every row other than ``target_id`` is compared with the target row using
    ``calculate_cos_similarity``; the row with the highest similarity wins.

    Args:
        target_id: Index of the reference row in ``word_vectors``.
        word_vectors: Indexable, iterable sequence of vectors.

    Returns:
        A dict ``{'id': index, 'similarity': value}`` for the best match.
        If no other row has a similarity greater than 0, the initial value
        ``{'id': 0, 'similarity': 0}`` is returned unchanged.
    """
    best = {'id': 0, 'similarity': 0}
    for index, word_vector in enumerate(word_vectors):
        # Compare by value: identity checks on ints are only reliable for
        # the small integers CPython happens to cache, so `is not` would let
        # the target match itself at larger indices.
        if index == target_id:
            continue
        similarity = calculate_cos_similarity(
            word_vector, word_vectors[target_id])
        if similarity > best['similarity']:
            best = {'id': index, 'similarity': similarity}
    return best
if __name__ == '__main__':
    # Example invocation:
    #   python find_smilar_sentence.py input.csv vectors.npy
    # Fail with a usage line instead of a bare IndexError when arguments
    # are missing.
    if len(sys.argv) < 3:
        sys.exit('usage: python {} input.csv vectors.npy'.format(sys.argv[0]))
    import_file = sys.argv[1]
    vector_file = sys.argv[2]

    sentences = get_sentences(import_file)
    print('finish loading sentences')
    # The lookups below index sentences[0], so an empty input cannot proceed.
    if not sentences:
        sys.exit('no sentences found in {}'.format(import_file))

    # Enable these lines to build the word vectors and save them.
    # NOTE(review): per the NumPy docs np.save appends ".npy" when the name
    # lacks it, so pass a name that already ends in ".npy" if it will be
    # reloaded with the same argument.
    corpus, num_of_words = create_gensim_bow(sentences)
    corpus_tfidf = apply_tfidf(corpus)
    word_vectors = create_word_vectors(corpus_tfidf, num_of_words)
    np.save(vector_file, word_vectors)
    print('finish saving vectors')

    # Enable these lines instead to load previously saved word vectors
    # from the .npy file.
    # word_vectors = np.load(vector_file)
    # print('finish loading vectors')

    # Report the sentence most similar to the first one.
    similar = find_most_similar_id(0, word_vectors)
    print(similar)
    print(sentences[0])
    print(sentences[similar['id']])
Add Comment
Please, Sign In to add comment