import numpy as np
import re
from collections import defaultdict
import io
import random
import jellyfish
from difflib import SequenceMatcher
import pickle
import os
import argparse

def serialize(obj, out_file):
    with open(out_file, mode='wb') as binary_file:
        pickle.dump(obj, binary_file)


def deserialize(in_file):
    with open(in_file, mode='rb') as binary_file:
        return pickle.load(binary_file)

def lcs(str1, str2):
    # Distance based on the longest common substring, normalized to [0, 1]:
    # 0 for identical strings, 1 for strings with no common substring.
    seq_match = SequenceMatcher(None, str1, str2)
    match = seq_match.find_longest_match(0, len(str1), 0, len(str2))
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        return 0
    return 1 - match.size / max_len

def hamming(str1, str2):
    return jellyfish.hamming_distance(str1, str2)


def levenshtein(str1, str2):
    return jellyfish.levenshtein_distance(str1, str2)

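
# A quick sanity check of the three metrics on the classic textbook pair
# ('KITTEN'/'SITTING' is illustrative, not data from this lab). Note the
# scales: lcs is normalized to [0, 1], while hamming and levenshtein count
# raw edits, so the -e threshold parsed below must be chosen per metric.
assert levenshtein('KITTEN', 'SITTING') == 3
assert hamming('KITTEN', 'SITTIN') == 2
assert abs(lcs('KITTEN', 'SITTING') - (1 - 3 / 7)) < 1e-9  # common substring 'ITT'
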
def read_file(path):
    lines = []
    with io.open(path, 'r', encoding='utf-8') as content_file:
        for line in content_file:
            lines += [line]
    return lines

# remove_digits, remove_newlines and truncate_newlines are not used in the
# pipeline below; they are kept as optional preprocessing steps.
def remove_digits(text):
    return re.sub(r'\d', '', text)


def remove_non_alphanumeric(text):
    return re.sub(r'[^\w\s]', ' ', text)


def remove_newlines(text):
    return re.sub(r'[\t\n\r]', '', text)


def truncate_spaces(text):
    return re.sub(r'\s+', ' ', text)


def truncate_newlines(text):
    return re.sub(r'[\t\n\r]', ' ', text)

def preprocess_line(line):
    line = remove_non_alphanumeric(line)
    line = truncate_spaces(line)
    line = line.upper()
    return line


def preprocess_text(text):
    return list(map(preprocess_line, text))

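
# Illustrative example (hypothetical input line, not from lines.txt):
#   preprocess_line('Error: disk #2 failed!\n') -> 'ERROR DISK 2 FAILED '
# Punctuation turns into spaces, whitespace runs collapse to one space, and
# the text is upper-cased; digits survive because remove_digits is not called.
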
def get_word_count(text):
    word_count = defaultdict(int)
    for line in text:
        for word in line.split():
            word_count[word] += 1
    return word_count

def get_lines(text):
    # Unused in the pipeline below; pairs each line with its original index.
    lines = []
    for ix, line in enumerate(text):
        lines += [(ix, preprocess_line(line))]
    return lines

def get_distances(text, metric, recreate=False, cache=True):
    # The O(n^2) distance matrix is cached on disk, keyed only by the metric
    # name, so delete the .bin file or pass recreate=True after changing the
    # input data.
    filename = metric.__name__ + '.bin'
    if os.path.isfile(filename) and not recreate:
        distances = deserialize(filename)
    else:
        n = len(text)
        distances = np.zeros((n, n))
        for i in range(n):
            print(i)  # crude progress indicator
            for j in range(i + 1, n):
                distances[j, i] = distances[i, j] = metric(text[i], text[j])
        if cache:
            serialize(distances, filename)
    return distances

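
# A minimal sketch of the matrix this produces (hypothetical three-line
# input; values shown are levenshtein distances):
#   get_distances(['ABC', 'ABD', 'XYZ'], levenshtein, cache=False)
#   -> [[0. 1. 3.]
#       [1. 0. 3.]
#       [3. 3. 0.]]
# i.e. a symmetric matrix with zeros on the diagonal.
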
def get_clusters(distances, epsilon):
    # Greedy epsilon-ball clustering: repeatedly pick a random unclustered
    # line as a center and absorb every unclustered line within epsilon of it.
    non_clustered_lines = set(range(len(distances)))
    clusters = []
    while len(non_clustered_lines) > 0:
        # random.sample() rejects sets on newer Pythons, so draw from a tuple.
        center = random.choice(tuple(non_clustered_lines))
        current_cluster = []
        for j in range(len(distances)):
            if distances[center, j] <= epsilon and j in non_clustered_lines:
                current_cluster += [j]
                non_clustered_lines.remove(j)
        clusters += [current_cluster]
    return clusters

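
# Continuing the hypothetical example above: with epsilon = 1, the matrix for
# ['ABC', 'ABD', 'XYZ'] yields the clusters [0, 1] and [2], since 'ABC' and
# 'ABD' are within one edit of each other and 'XYZ' is not. On less clear-cut
# data the random choice of centers can change the result between runs.
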
def get_stoplist(text, n):
    word_count = get_word_count(text)
    sorted_word_count = list(word_count.items())
    sorted_word_count.sort(key=lambda a: a[1], reverse=True)
    # print('\n'.join(map(str, sorted_word_count[:n])))
    return {w[0] for w in sorted_word_count[:n]}


def remove_stopwords(text, stoplist):
    return [' '.join([w for w in line.split() if w not in stoplist]) for line in text]

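
# Sketch of the stoplist step on a toy corpus (not the lab data):
#   text = ['THE DISK FAILED', 'THE FAN FAILED', 'THE DISK IS FULL']
#   get_stoplist(text, 1)           -> {'THE'}
#   remove_stopwords(text, {'THE'}) -> ['DISK FAILED', 'FAN FAILED', 'DISK IS FULL']
# With n = 1000 the pipeline below strips the 1000 most frequent words before
# distances are computed.
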
def clusters_to_file(clusters, path, raw_text):
    result = []
    for cluster in clusters:
        result += ['#']
        for line_index in cluster:
            result += [raw_text[line_index].rstrip('\n')]
    with open(path, 'w+') as f:
        f.write('\n'.join(result))

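
# Output layout: each cluster is preceded by a '#' line and followed by the
# original (unpreprocessed) lines assigned to it, e.g.
#   #
#   <first line of cluster 1>
#   <second line of cluster 1>
#   #
#   <only line of cluster 2>
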
parser = argparse.ArgumentParser()
parser.add_argument('-m', default='levenshtein',
                    choices=['levenshtein', 'lcs', 'hamming'],
                    help='distance metric to use')
parser.add_argument('-e', type=float, required=True,
                    help='clustering radius, on the scale of the chosen metric')
args = parser.parse_args()

if args.m == 'levenshtein':
    metric = levenshtein
elif args.m == 'lcs':
    metric = lcs
elif args.m == 'hamming':
    metric = hamming

raw_text = read_file('lab2/lines.txt')
text = preprocess_text(raw_text)
stoplist = get_stoplist(text, 1000)
text = remove_stopwords(text, stoplist)
distances = get_distances(text, metric)
clusters = get_clusters(distances, args.e)
clusters_to_file(clusters, f'{metric.__name__}_result.txt', raw_text)
print('cluster count: ' + str(len(clusters)))
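
# Example invocations (the script name is hypothetical; lab2/lines.txt must
# exist relative to the working directory):
#   python cluster_lines.py -m levenshtein -e 10
#   python cluster_lines.py -m lcs -e 0.3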