Guest User

Untitled

a guest
Jul 21st, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.27 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. from gensim import corpora
  5. from gensim import models
  6. from scipy import spatial
  7. import numpy as np
  8. import csv
  9. import sys
  10.  
  11.  
  12. def get_sentences(file_name):
  13. sentences = []
  14. with open(file_name, 'r') as f:
  15. reader = csv.reader(f)
  16. for sentence in reader:
  17. sentences.append(sentence)
  18. return sentences
  19.  
  20.  
  21. def create_gensim_bow(sentences):
  22. dictionary = corpora.Dictionary(sentences)
  23. dictionary.token2id
  24. return list(map(dictionary.doc2bow, sentences)), len(dictionary)
  25.  
  26.  
  27. def apply_tfidf(corpus):
  28. test_model = models.TfidfModel(corpus)
  29. return test_model[corpus]
  30.  
  31.  
  32. def create_word_vectors(corpus, num_of_words):
  33. # outputs as ndarray
  34. word_vectors = []
  35. for id_freq_pairs in corpus:
  36. word_vector = [0 for i in range(num_of_words)]
  37. for id_freq_pair in id_freq_pairs:
  38. word_vector[id_freq_pair[0]] = id_freq_pair[1]
  39. word_vectors.append(word_vector)
  40. return np.array(word_vectors)
  41.  
  42.  
  43. def calculate_cos_similarity(vector1, vector2):
  44. return 1 - spatial.distance.cosine(vector1, vector2)
  45.  
  46.  
  47. def find_most_similar_id(target_id, word_vectors):
  48. max = {'id': 0, 'similarity': 0}
  49. for index, word_vector in enumerate(word_vectors):
  50. if index is not target_id:
  51. similarity = calculate_cos_similarity(
  52. word_vector, word_vectors[target_id])
  53. if similarity > max['similarity']:
  54. max = {'id': index, 'similarity': similarity}
  55. return max
  56.  
  57.  
  58. if __name__ == '__main__':
  59. # ex)
  60. # python find_smilar_sentence.py input.csv vectors.npy
  61. import_file = sys.argv[1]
  62. vector_file = sys.argv[2]
  63. sentences = get_sentences(import_file)
  64. print('finish loading sentences')
  65.  
  66. # enable these when you create word_vectors
  67. corpus, num_of_words = create_gensim_bow(sentences)
  68. corpus_tfidf = apply_tfidf(corpus)
  69. word_vectors = create_word_vectors(corpus_tfidf, num_of_words)
  70. np.save(vector_file, word_vectors)
  71. print('finish saving vectors')
  72.  
  73. # enable this when you load word_vectors from csv
  74. # word_vectors = np.load(vector_file)
  75. # print('finish loading vectors')
  76.  
  77. similar = find_most_similar_id(0, word_vectors)
  78. print(similar)
  79. print(sentences[0])
  80. print(sentences[similar['id']])
Add Comment
Please, Sign In to add comment