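"""Cluster similar text lines by pairwise string distance.

Reads lab2/lines.txt, normalizes each line, drops the most frequent words as
stopwords, builds a pairwise distance matrix (Levenshtein, Hamming, or a
longest-common-substring metric), and greedily groups lines that fall within
a caller-supplied epsilon of a randomly chosen cluster center.
"""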
import numpy as np
import re
from collections import defaultdict
import io
import random
import jellyfish
from difflib import SequenceMatcher
import pickle
import os
import argparse

def serialize(obj, out_file):
    with open(out_file, mode='wb') as binary_file:
        pickle.dump(obj, binary_file)


def deserialize(in_file):
    with open(in_file, mode='rb') as binary_file:
        return pickle.load(binary_file)

def lcs(str1, str2):
    # Distance based on the longest common substring, normalized to [0, 1]:
    # 0 for identical strings, 1 for strings with nothing in common.
    seq_match = SequenceMatcher(None, str1, str2)
    match = seq_match.find_longest_match(0, len(str1), 0, len(str2))
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        return 0
    return 1 - match.size / max_len


def hamming(str1, str2):
    return jellyfish.hamming_distance(str1, str2)

def levenshtein(str1, str2):
    return jellyfish.levenshtein_distance(str1, str2)

def read_file(path):
    lines = []
    with io.open(path, 'r', encoding='utf-8') as content_file:
        for line in content_file:
            lines.append(line)
    return lines

def remove_digits(text):
    return re.sub(r'\d', '', text)


def remove_non_alphanumeric(text):
    return re.sub(r'[^\w\s]', ' ', text)


def remove_newlines(text):
    return re.sub(r'[\t\n\r]', '', text)


def truncate_spaces(text):
    return re.sub(r'\s+', ' ', text)


def truncate_newlines(text):
    return re.sub(r'[\t\n\r]', ' ', text)

def preprocess_line(line):
    line = remove_non_alphanumeric(line)
    line = truncate_spaces(line)
    return line.upper()


def preprocess_text(text):
    return list(map(preprocess_line, text))

def get_word_count(text):
    word_count = defaultdict(int)
    for line in text:
        for word in line.split():
            word_count[word] += 1
    return word_count


def get_lines(text):
    return [(ix, preprocess_line(line)) for ix, line in enumerate(text)]

def get_distances(text, metric, recreate=False, cache=True):
    # Compute the symmetric pairwise distance matrix, caching it on disk
    # under a file named after the metric so reruns can skip the O(n^2) work.
    filename = metric.__name__ + '.bin'
    if os.path.isfile(filename) and not recreate:
        distances = deserialize(filename)
    else:
        n = len(text)
        distances = np.zeros((n, n))
        for i in range(n):
            print(i)  # crude progress indicator
            for j in range(i + 1, n):
                distances[j, i] = distances[i, j] = metric(text[i], text[j])
        if cache:
            serialize(distances, filename)
    return distances

def get_clusters(distances, epsilon):
    # Greedy clustering: pick a random unclustered line as a center and
    # absorb every unclustered line within epsilon of it, until none remain.
    # The center is always within 0 of itself, so every cluster is non-empty.
    non_clustered_lines = set(range(len(distances)))
    clusters = []
    while non_clustered_lines:
        center = random.choice(list(non_clustered_lines))
        current_cluster = []
        for j in range(len(distances)):
            if distances[center, j] <= epsilon and j in non_clustered_lines:
                current_cluster.append(j)
                non_clustered_lines.remove(j)
        clusters.append(current_cluster)
    return clusters

def get_stoplist(text, n):
    # Treat the n most frequent words as stopwords.
    word_count = get_word_count(text)
    sorted_word_count = sorted(word_count.items(), key=lambda a: a[1], reverse=True)
    return {w[0] for w in sorted_word_count[:n]}


def remove_stopwords(text, stoplist):
    return [' '.join(w for w in line.split() if w not in stoplist) for line in text]

def clusters_to_file(clusters, path, raw_text):
    # Write the original (unpreprocessed) lines of each cluster,
    # with a '#' line separating consecutive clusters.
    result = []
    for cluster in clusters:
        result.append('#')
        for line_index in cluster:
            result.append(raw_text[line_index])
    with open(path, 'w+') as f:
        f.write('\n'.join(result))

parser = argparse.ArgumentParser()
parser.add_argument('-m', default='levenshtein', choices=['levenshtein', 'lcs', 'hamming'])
parser.add_argument('-e', type=float, required=True)
args = parser.parse_args()

metrics = {'levenshtein': levenshtein, 'lcs': lcs, 'hamming': hamming}
metric = metrics[args.m]

raw_text = read_file('lab2/lines.txt')
text = preprocess_text(raw_text)
stoplist = get_stoplist(text, 1000)
text = remove_stopwords(text, stoplist)
distances = get_distances(text, metric)
clusters = get_clusters(distances, args.e)
clusters_to_file(clusters, f'{metric.__name__}_result.txt', raw_text)
print('cluster count: ' + str(len(clusters)))
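
# Example invocation (assuming this script is saved as cluster_lines.py and
# lab2/lines.txt exists; the script filename is illustrative):
#
#   python cluster_lines.py -m levenshtein -e 3
#
# This writes the grouped raw lines to levenshtein_result.txt. Note that
# epsilon is an absolute edit-distance threshold for the levenshtein and
# hamming metrics, but a normalized value in [0, 1] for lcs.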