SHARE
TWEET

Untitled

a guest Apr 26th, 2019 79 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import io
  2. import re
  3. import pickle
  4. import os
  5. import numpy as np
  6.  
  def levenshtein(seq1, seq2):
      """Return the Levenshtein (edit) distance between two sequences.

      The distance is the minimum number of single-element insertions,
      deletions and substitutions needed to turn ``seq1`` into ``seq2``.

      Args:
          seq1: First sequence; must support ``len()`` and integer indexing.
          seq2: Second sequence; same requirements as ``seq1``.

      Returns:
          The distance as a NumPy float scalar (the table is created with
          NumPy's default float dtype).
      """
      size_x = len(seq1) + 1
      size_y = len(seq2) + 1
      # matrix[x, y] holds the distance between seq1[:x] and seq2[:y].
      matrix = np.zeros((size_x, size_y))
      # Turning a prefix into the empty sequence costs one edit per element.
      for x in range(size_x):
          matrix[x, 0] = x
      for y in range(size_y):
          matrix[0, y] = y

      for x in range(1, size_x):
          for y in range(1, size_y):
              # Matching elements extend the diagonal for free; otherwise a
              # substitution costs one edit.
              substitution_cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
              matrix[x, y] = min(
                  matrix[x - 1, y] + 1,
                  matrix[x - 1, y - 1] + substitution_cost,
                  matrix[x, y - 1] + 1,
              )
      return matrix[size_x - 1, size_y - 1]
  31.  
  def serialize(obj, out_file):
      """Pickle ``obj`` into the file at path ``out_file``.

      Args:
          obj: Any picklable object.
          out_file: Path of the file to create or overwrite.
      """
      # The context manager closes the handle even when pickling fails.
      with open(out_file, mode='wb') as binary_file:
          pickle.dump(obj, binary_file)
  36.  
  def deserialize(in_file):
      """Load and return the pickled object stored in the file ``in_file``.

      Args:
          in_file: Path of a file containing a pickled object.

      Returns:
          The unpickled object.
      """
      # NOTE: unpickling can execute arbitrary code; only load files that
      # this program (or another trusted source) produced.
      # The context manager closes the handle even when unpickling fails.
      with open(in_file, mode='rb') as binary_file:
          return pickle.load(binary_file)
  42.  
  def read_file(path):
      """Return the lines of the UTF-8 text file at ``path`` as a list.

      Each line keeps its trailing newline, exactly as the file iterator
      yields it.
      """
      with io.open(path, 'r', encoding='utf-8') as content_file:
          return content_file.readlines()
  49.  
  def remove_digits(text):
      """Return ``text`` with every digit character deleted."""
      digit_pattern = re.compile(r'\d')
      return digit_pattern.sub('', text)
  52.  
  def remove_non_aplhanumeric(text):
      """Return ``text`` keeping only word characters and whitespace.

      Word characters are whatever the regex class ``\\w`` matches (letters,
      digits and the underscore); everything else, such as punctuation, is
      deleted.
      """
      unwanted_pattern = re.compile(r'[^\w\s]')
      return unwanted_pattern.sub('', text)
  def remove_newlines(text):
      """Return ``text`` with each tab, newline and carriage return replaced
      by a single space."""
      line_break_pattern = re.compile(r'[\t\n\r]')
      return line_break_pattern.sub(' ', text)
  57.  
  # Paths of the text files that together make up the corpus.
  text_corpus_data = [
      'lab4/dramat.txt',
      'lab4/popul.txt',
      'lab4/proza.txt',
      'lab4/publ.txt',
      'lab4/wp.txt',
  ]
  59.  
  def process_text_corpus(text):
      """Normalize raw corpus text and split it into a list of tokens.

      Steps, in order: tabs/newlines/carriage returns become spaces, digits
      are removed, the text is lower-cased, characters that are neither word
      characters nor whitespace are removed, and the result is split on
      whitespace.

      Args:
          text: The raw text as a single string.

      Returns:
          A list of token strings.
      """
      flattened = remove_newlines(text)
      without_digits = remove_digits(flattened)
      lowered = without_digits.lower()
      cleaned = remove_non_aplhanumeric(lowered)
      return cleaned.split()
  73.  
  def read_text_corpus_data(text_corpus_data, forms):
      """Read the corpus files and return their tokens that are known forms.

      Args:
          text_corpus_data: Iterable of paths to the corpus text files.
          forms: Container of accepted word forms, used for membership tests.

      Returns:
          A list of tokens, in file and reading order, keeping only those
          contained in ``forms``.
      """
      words = []
      for path in text_corpus_data:
          # Decode as UTF-8, the same encoding read_file() uses, instead of
          # the platform default; the context manager closes the handle.
          with io.open(path, 'r', encoding='utf-8') as corpus_file:
              text = corpus_file.read()
          words.extend(process_text_corpus(text))
      return [word for word in words if word in forms]
  82.  
  def read_forms(file):
      """Read the word-form list from ``file`` and return it as a set.

      Each line of the file is taken as one form. Characters that are neither
      word characters nor whitespace are removed, then surrounding whitespace
      is stripped. Lines that end up empty are skipped.

      Args:
          file: Path of the UTF-8 text file holding the forms.

      Returns:
          A set of cleaned form strings.
      """
      # read_file() keeps each line's trailing newline and the cleaning regex
      # preserves whitespace, so strip explicitly; otherwise every form would
      # end in "\n" and never equal a whitespace-split corpus token.
      cleaned = (remove_non_aplhanumeric(line).strip() for line in read_file(file))
      return {form for form in cleaned if form}
  def get_Pc(word, text_corpus, forms, word_count):
      """Return the add-one estimate of ``word``'s relative frequency.

      Computed as ``(word_count[word] + 1) / (len(text_corpus) + len(forms))``.

      Args:
          word: The word to look up; must be a key of ``word_count``.
          text_corpus: Sized collection of corpus tokens (only its length is
              used).
          forms: Sized collection of known forms (only its length is used).
          word_count: Mapping from word to its number of occurrences.

      Raises:
          KeyError: If ``word`` is not a key of ``word_count``.
      """
      smoothed_count = word_count[word] + 1
      denominator = len(text_corpus) + len(forms)
      return smoothed_count / denominator
  89.  
  def count_words(forms, text_corpus, recreate=False, cache=True):
      """Count how often each form occurs in ``text_corpus``.

      When the file ``word_count.bin`` exists and ``recreate`` is false, the
      counts are loaded from that file and returned without looking at the
      arguments. Otherwise they are computed, with progress printed to
      stdout, and written to that file when ``cache`` is true.

      Args:
          forms: Iterable of known forms; each starts with a count of 0.
          text_corpus: Sequence of tokens; every token must be one of
              ``forms``.
          recreate: If true, ignore an existing cache file and recount.
          cache: If true, store freshly computed counts in the cache file.

      Returns:
          A dict mapping each form to its number of occurrences.

      Raises:
          KeyError: If a token of ``text_corpus`` is not in ``forms``.
      """
      filename = 'word_count.bin'
      if os.path.isfile(filename) and not recreate:
          return deserialize(filename)

      counts = dict.fromkeys(forms, 0)
      print(len(text_corpus))
      for position, token in enumerate(text_corpus):
          # Progress report every 1000 tokens.
          if position % 1000 == 0:
              print(position)
          counts[token] = counts[token] + 1
      if cache:
          serialize(counts, filename)
      return counts
  104.  
  105.  
  # Load the set of known word forms.
  forms = read_forms('lab4/formy.txt')

  # Corpus tokens, restricted to those that are known forms.
  text_corpus = read_text_corpus_data(text_corpus_data, forms)
  # Occurrence count per form (read from the cache file when it exists).
  word_count = count_words(forms, text_corpus)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top