# Untitled

import io
import re
import pickle
import os
import numpy as np

def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First indexable sequence with a length (e.g. a string).
        seq2: Second indexable sequence with a length.

    Returns:
        The distance, read from the bottom-right cell of the
        dynamic-programming matrix (a NumPy float, since the matrix is
        created with ``np.zeros``).
    """
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros((size_x, size_y))
    # Border cells: turning a prefix into the empty sequence costs its length.
    matrix[:, 0] = np.arange(size_x)
    matrix[0, :] = np.arange(size_y)

    # ``range`` replaces the Python-2-only ``xrange``, which is undefined on
    # the Python 3 interpreter the rest of this file requires.
    for x in range(1, size_x):
        for y in range(1, size_y):
            # Equal elements can be carried over for free; otherwise a
            # substitution costs one edit.
            substitution_cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,                      # deletion
                matrix[x - 1, y - 1] + substitution_cost,  # match / substitution
                matrix[x, y - 1] + 1,                      # insertion
            )
    return matrix[size_x - 1, size_y - 1]

def serialize(obj, out_file):
    """Pickle ``obj`` into the file at path ``out_file``.

    Args:
        obj: Any picklable object.
        out_file: Path of the file to create or overwrite (opened in
            binary write mode).
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which a manual open()/close() pair does not guarantee.
    with open(out_file, mode='wb') as binary_file:
        pickle.dump(obj, binary_file)

def deserialize(in_file):
    """Load and return the pickled object stored at path ``in_file``.

    Args:
        in_file: Path of a file containing a pickle stream.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only call this on files
    # this program (or another trusted source) produced.
    # The context manager closes the handle even when pickle.load raises.
    with open(in_file, mode='rb') as binary_file:
        return pickle.load(binary_file)

def read_file(path):
    """Return the lines of the UTF-8 text file at ``path`` as a list.

    Each element is one line exactly as the file object yields it, i.e.
    including its line terminator when the line has one.
    """
    with io.open(path, 'r', encoding='utf-8') as handle:
        return handle.readlines()

def remove_digits(text):
    """Return ``text`` with every character matched by ``\\d`` removed."""
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)

def remove_non_aplhanumeric(text):
    """Return ``text`` keeping only word characters and whitespace.

    Whitespace (including newlines) and underscores are preserved; every
    other non-word character is deleted.
    """
    unwanted = re.compile(r'[^\w\s]')
    return unwanted.sub('', text)
def remove_newlines(text):
    """Return ``text`` with each tab, newline and carriage return turned into a space."""
    line_break_pattern = re.compile(r'[\t\n\r]')
    return line_break_pattern.sub(' ', text)

# Paths of the text files that are read to build the corpus.
text_corpus_data = [
    'lab4/dramat.txt',
    'lab4/popul.txt',
    'lab4/proza.txt',
    'lab4/publ.txt',
    'lab4/wp.txt',
]

def process_text_corpus(text):
    """Normalise raw corpus text and split it into a list of words.

    Steps, in order: tabs/newlines/carriage returns become spaces, digits
    are removed, the text is lower-cased, characters that are neither word
    characters nor whitespace are removed, and the result is split on
    whitespace.
    """
    flattened = remove_newlines(text)
    without_digits = remove_digits(flattened)
    lowered = without_digits.lower()
    cleaned = remove_non_aplhanumeric(lowered)
    return cleaned.split()

def read_text_corpus_data(text_corpus_data, forms):
    """Read the corpus files and return their words that are known forms.

    Args:
        text_corpus_data: Iterable of paths to the corpus text files.
        forms: Container of accepted word forms. Membership is tested once
            per corpus word, so a set keeps this step fast.

    Returns:
        List of the normalised corpus words (in reading order, duplicates
        kept) that are present in ``forms``.
    """
    words = []
    for path in text_corpus_data:
        # Context manager so each handle is closed as soon as it is read,
        # instead of being left for the garbage collector.
        # NOTE(review): the file is opened with the platform default
        # encoding, whereas read_file forces UTF-8 - confirm which encoding
        # the corpus files actually use.
        with open(path) as corpus_file:
            text = corpus_file.read()
        words.extend(process_text_corpus(text))
    # Dumps the complete, unfiltered word list to stdout.
    print(words)
    return [word for word in words if word in forms]

def read_forms(file):
    """Load the set of known word forms from a text file.

    Every line of the file is treated as one form. Characters that are
    neither word characters nor whitespace are removed and surrounding
    whitespace is stripped; lines that end up empty are skipped.

    Args:
        file: Path of the file to read (read as UTF-8 by ``read_file``).

    Returns:
        Set of cleaned, non-empty forms.
    """
    forms = set()
    for line in read_file(file):
        # read_file keeps each line's trailing newline and
        # remove_non_aplhanumeric preserves whitespace, so strip explicitly.
        # Otherwise every form ends in '\n' and can never equal a
        # whitespace-split corpus word.
        form = remove_non_aplhanumeric(line).strip()
        if form:
            forms.add(form)
    return forms
def get_Pc(word, text_corpus, forms, word_count):
    """Return the add-one smoothed relative frequency of ``word``.

    The value is ``(word_count[word] + 1) / (len(text_corpus) + len(forms))``,
    so ``word`` must be a key of ``word_count``.
    """
    smoothed_occurrences = word_count[word] + 1
    smoothed_total = len(text_corpus) + len(forms)
    return smoothed_occurrences / smoothed_total

def count_words(forms, text_corpus, recreate=False, cache=True):
    """Count how often each form occurs in the corpus, with an on-disk cache.

    If ``word_count.bin`` exists and ``recreate`` is false, the pickled
    counts are loaded from it and returned without looking at the
    arguments. Otherwise every form starts at zero, each corpus word
    increments its entry (so every corpus word must be one of ``forms``),
    and the result is written to ``word_count.bin`` when ``cache`` is true.
    The corpus length and a progress index every 1000 words are printed
    while counting.
    """
    cache_path = 'word_count.bin'
    use_cached = os.path.isfile(cache_path) and not recreate
    if use_cached:
        return deserialize(cache_path)

    counts = dict.fromkeys(forms, 0)
    print(len(text_corpus))
    for index, token in enumerate(text_corpus):
        if index % 1000 == 0:
            print(index)
        counts[token] += 1
    if cache:
        serialize(counts, cache_path)
    return counts


# Script driver: runs at import time and leaves `forms`, `text_corpus` and
# `word_count` as module-level names.

# Set of accepted word forms loaded from the forms file.
forms = read_forms('lab4/formy.txt')

# Corpus words (from the files in text_corpus_data) that are present in `forms`.
text_corpus = read_text_corpus_data(text_corpus_data, forms)
# print(text_corpus)
# Per-form occurrence counts; loaded from the word_count.bin cache when that
# file already exists, otherwise computed and cached.
word_count = count_words(forms, text_corpus)