• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Apr 26th, 2019 79 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import io
2. import re
3. import pickle
4. import os
5. import numpy as np
6.
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First sequence; must support ``len()`` and integer indexing.
        seq2: Second sequence; same requirements as ``seq1``.

    Returns:
        The distance as a NumPy float, because the dynamic-programming
        table is the float array produced by ``np.zeros``.
    """
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros((size_x, size_y))

    # Turning a prefix into the empty sequence (or back) costs one edit per
    # element, which seeds the first column and the first row.
    for x in range(size_x):
        matrix[x, 0] = x
    for y in range(size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            # Matching elements make the diagonal move free.
            substitution_cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,                      # deletion
                matrix[x - 1, y - 1] + substitution_cost,  # match / substitution
                matrix[x, y - 1] + 1,                      # insertion
            )
    return matrix[size_x - 1, size_y - 1]
31.
def serialize(obj, out_file):
    """Pickle ``obj`` into the file at path ``out_file``.

    Args:
        obj: Any picklable object.
        out_file: Path of the file to write; it is created or overwritten.
    """
    # The context manager closes the file even if pickling raises.
    with open(out_file, mode='wb') as binary_file:
        pickle.dump(obj, binary_file)
36.
def deserialize(in_file):
    """Load and return the pickled object stored at path ``in_file``.

    Args:
        in_file: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only use this on files
    # from a trusted source.
    with open(in_file, mode='rb') as binary_file:
        obj = pickle.load(binary_file)
    return obj
42.
44.     lines = []
45.     with io.open(path, 'r', encoding='utf-8') as content_file:
46.         for line in content_file:
47.             lines += [line]
48.     return lines
49.
def remove_digits(text):
    """Return ``text`` with every digit character deleted."""
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)
52.
def remove_non_aplhanumeric(text):
    """Return ``text`` keeping only word characters and whitespace.

    Word characters are whatever the regex class ``\\w`` matches, which
    includes the underscore.
    """
    unwanted = re.compile(r'[^\w\s]')
    return unwanted.sub('', text)
def remove_newlines(text):
    """Return ``text`` with each tab, newline and carriage return replaced by a space."""
    line_breaks = re.compile(r'[\t\n\r]')
    return line_breaks.sub(' ', text)
57.
# Paths of the text files that make up the corpus.
text_corpus_data = [
    'lab4/dramat.txt',
    'lab4/popul.txt',
    'lab4/proza.txt',
    'lab4/publ.txt',
    'lab4/wp.txt',
]
59.
def process_text_corpus(text):
    """Normalize raw corpus text and split it into a list of words.

    The steps run in this order: tabs, newlines and carriage returns become
    spaces; digits are removed; the text is lowercased; characters that are
    neither word characters nor whitespace are removed; the result is split
    on whitespace.

    Args:
        text: The raw text as a single string.

    Returns:
        A list of the remaining whitespace-separated tokens.
    """
    without_digits = remove_digits(remove_newlines(text))
    cleaned = remove_non_aplhanumeric(without_digits.lower())
    return cleaned.split()
73.
75.     words = []
76.     for file in text_corpus_data:
78.         text_corpus = process_text_corpus(text)
79.         words += [*text_corpus]
80.     print(words)
81.     return [word for word in words if word in forms]
82.
85.     forms = map(remove_non_aplhanumeric, forms)
86.     return set(forms)
def get_Pc(word, text_corpus, forms, word_count):
    """Return the add-one smoothed relative frequency of ``word``.

    Computes ``(count(word) + 1) / (len(text_corpus) + len(forms))``.

    Args:
        word: The word to score.
        text_corpus: Sized collection of corpus tokens; only its length is used.
        forms: Sized collection of known word forms; only its length is used.
        word_count: Mapping from word to its number of occurrences.

    Returns:
        The smoothed frequency as a float.
    """
    # A word missing from word_count counts as zero occurrences, so the +1
    # smoothing still gives it a small non-zero score instead of a KeyError.
    occurrences = word_count.get(word, 0)
    return (occurrences + 1) / (len(text_corpus) + len(forms))
89.
def count_words(forms, text_corpus, recreate=False, cache=True):
    """Count how often each word form occurs in the corpus.

    If the cache file ``word_count.bin`` exists and ``recreate`` is false,
    the cached counts are loaded and returned without looking at the
    arguments. Otherwise every form starts at zero and each corpus token
    increments its entry; a token that is not in ``forms`` raises
    ``KeyError``. Progress is printed: the corpus length first, then the
    index of every 1000th token.

    Args:
        forms: Iterable of word forms to count.
        text_corpus: Sequence of corpus tokens.
        recreate: When true, ignore any existing cache file and recount.
        cache: When true, write freshly computed counts to the cache file.

    Returns:
        A dict mapping each form to its number of occurrences.
    """
    cache_path = 'word_count.bin'
    cache_usable = os.path.isfile(cache_path) and not recreate
    if cache_usable:
        return deserialize(cache_path)

    counts = dict.fromkeys(forms, 0)
    print(len(text_corpus))
    for position, token in enumerate(text_corpus):
        if position % 1000 == 0:
            print(position)
        counts[token] = counts[token] + 1
    if cache:
        serialize(counts, cache_path)
    return counts
104.
105.