Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import io
import os
import pickle
import re

import numpy as np
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First indexable sequence (e.g. a string).
        seq2: Second indexable sequence.

    Returns:
        The distance as a NumPy float, because the dynamic-programming
        table is created with ``np.zeros`` and its default float dtype.
    """
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros((size_x, size_y))

    # Distance from a prefix to the empty sequence is the prefix length.
    # ``range`` replaces the Python 2-only ``xrange``.
    for x in range(size_x):
        matrix[x, 0] = x
    for y in range(size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            # Matching elements can be carried over for free.
            substitution_cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,                      # deletion
                matrix[x - 1, y - 1] + substitution_cost,  # match / substitution
                matrix[x, y - 1] + 1,                      # insertion
            )
    return matrix[size_x - 1, size_y - 1]
def serialize(obj, out_file):
    """Pickle ``obj`` into the file at path ``out_file``.

    The file is opened in binary write mode, so existing content is
    replaced.

    Args:
        obj: Any picklable object.
        out_file: Path of the file to write.
    """
    # The context manager closes the handle even if pickling raises.
    with open(out_file, mode='wb') as binary_file:
        pickle.dump(obj, binary_file)
def deserialize(in_file):
    """Load and return the pickled object stored in ``in_file``.

    Args:
        in_file: Path of a file containing pickled data.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files this program wrote itself, never on untrusted input.
    # The context manager closes the handle even if unpickling raises.
    with open(in_file, mode='rb') as binary_file:
        return pickle.load(binary_file)
def read_file(path):
    """Return the lines of the UTF-8 text file at ``path`` as a list.

    Line endings are kept on each line, exactly as produced by iterating
    over the open file.

    Args:
        path: Path of the file to read.

    Returns:
        A list of strings, one per line; empty for an empty file.
    """
    with io.open(path, 'r', encoding='utf-8') as content_file:
        # Materialise the line iterator directly instead of appending
        # one-element lists in a loop.
        return list(content_file)
def remove_digits(text):
    """Return ``text`` with every digit character deleted."""
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)
def remove_non_aplhanumeric(text):
    """Return ``text`` without characters that are neither word nor whitespace.

    Word characters (regex ``\\w``: letters, digits, underscore) and
    whitespace (regex ``\\s``) are kept; everything else is deleted.
    """
    unwanted_pattern = re.compile(r'[^\w\s]')
    return unwanted_pattern.sub('', text)
def remove_newlines(text):
    """Return ``text`` with each tab, newline and carriage return replaced by a space."""
    line_break_pattern = re.compile(r'[\t\n\r]')
    return line_break_pattern.sub(' ', text)
# Paths of the corpus text files whose words are counted.
text_corpus_data = [
    'lab4/dramat.txt',
    'lab4/popul.txt',
    'lab4/proza.txt',
    'lab4/publ.txt',
    'lab4/wp.txt',
]
def process_text_corpus(text):
    """Normalise raw corpus text and split it into a list of words.

    Steps, in order: tabs/newlines/carriage returns become spaces, digits
    are removed, the text is lower-cased, characters that are neither word
    characters nor whitespace are removed, and the result is split on
    whitespace.

    Args:
        text: Raw text of a corpus file.

    Returns:
        A list of word strings.
    """
    lowered = remove_digits(remove_newlines(text)).lower()
    return remove_non_aplhanumeric(lowered).split()
def read_text_corpus_data(text_corpus_data):
    """Read every corpus file and return all of their words in one list.

    Each file is read whole, normalised with ``process_text_corpus`` and
    its words appended in file order.

    Args:
        text_corpus_data: Iterable of paths to corpus text files.

    Returns:
        A single list with the words of all files, in reading order.
    """
    words = []
    for path in text_corpus_data:
        # The context manager closes each file promptly; the original
        # ``open(file).read()`` never closed its handle.
        # NOTE(review): this uses the platform default encoding, whereas
        # read_file() forces UTF-8 — confirm which one the corpus needs.
        with open(path) as corpus_file:
            text = corpus_file.read()
        words.extend(process_text_corpus(text))
    # NOTE(review): prints the entire word list; kept to preserve existing
    # output, but it looks like leftover debugging.
    print(words)
    return words
def read_forms(file):
    """Read the word forms listed one per line in ``file``.

    Each line is cleaned with ``remove_non_aplhanumeric`` and then stripped
    of surrounding whitespace. The strip matters: the cleaning pattern
    keeps whitespace, so without it every form would keep its trailing
    newline and never equal a corpus token produced by ``str.split()``.

    Args:
        file: Path of the forms file (read as UTF-8 by ``read_file``).

    Returns:
        A list of form strings, one per line of the file.
    """
    return [remove_non_aplhanumeric(line).strip() for line in read_file(file)]
def get_Pc(word, text_corpus, forms, word_count):
    """Return the add-one smoothed relative frequency of ``word``.

    Computed as ``(word_count[word] + 1) / (len(text_corpus) + len(forms))``.

    Args:
        word: Word to look up; must be a key of ``word_count``.
        text_corpus: Sequence of corpus words (only its length is used).
        forms: Sequence of known forms (only its length is used).
        word_count: Mapping from word to its number of occurrences.

    Returns:
        The smoothed frequency.
    """
    smoothed_count = word_count[word] + 1
    total = len(text_corpus) + len(forms)
    return smoothed_count / total
def count_words(forms, text_corpus, recreate=False, cache=True):
    """Count how often each form occurs in the corpus.

    If the cache file ``word_count.bin`` exists and ``recreate`` is false,
    the cached counts are loaded and returned without looking at the
    arguments. Otherwise the counts are computed and, when ``cache`` is
    true, written to that file.

    Args:
        forms: Iterable of known word forms; each becomes a key with an
            initial count of 0.
        text_corpus: Iterable of corpus words.
        recreate: When true, ignore any existing cache file and recount.
        cache: When true, store freshly computed counts in the cache file.

    Returns:
        A dict mapping each form to its number of occurrences.
    """
    filename = 'word_count.bin'
    # NOTE(review): the cache is not keyed on the inputs, so a stale file is
    # returned even if forms/text_corpus changed — pass recreate=True then.
    if os.path.isfile(filename) and not recreate:
        return deserialize(filename)

    word_count = {word: 0 for word in forms}
    for word in text_corpus:
        # Membership is tested against the dict (same keys as ``forms``):
        # O(1) per word instead of a linear scan of the forms list.
        if word in word_count:
            word_count[word] += 1

    if cache:
        serialize(word_count, filename)
    return word_count
# Script body: load the corpus words, load the known forms, then count how
# often each form appears in the corpus (a cached result is used if present).
text_corpus = read_text_corpus_data(text_corpus_data)
forms = read_forms('lab4/formy.txt')
word_count = count_words(forms, text_corpus)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement