Advertisement
Guest User

Untitled

a guest
Apr 26th, 2019
102
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.24 KB | None | 0 0
  1. import io
  2. import re
  3. import pickle
  4. import os
  5. import numpy as np
  6.  
  7. def levenshtein(seq1, seq2):
  8. size_x = len(seq1) + 1
  9. size_y = len(seq2) + 1
  10. matrix = np.zeros ((size_x, size_y))
  11. for x in xrange(size_x):
  12. matrix [x, 0] = x
  13. for y in xrange(size_y):
  14. matrix [0, y] = y
  15.  
  16. for x in xrange(1, size_x):
  17. for y in xrange(1, size_y):
  18. if seq1[x-1] == seq2[y-1]:
  19. matrix [x,y] = min(
  20. matrix[x-1, y] + 1,
  21. matrix[x-1, y-1],
  22. matrix[x, y-1] + 1
  23. )
  24. else:
  25. matrix [x,y] = min(
  26. matrix[x-1,y] + 1,
  27. matrix[x-1,y-1] + 1,
  28. matrix[x,y-1] + 1
  29. )
  30. return (matrix[size_x - 1, size_y - 1])
  31.  
  32. def serialize(obj, out_file):
  33. binary_file = open(out_file,mode='wb')
  34. pickle.dump(obj, binary_file)
  35. binary_file.close()
  36.  
  37. def deserialize(in_file):
  38. binary_file = open(in_file, mode='rb')
  39. obj = pickle.load(binary_file)
  40. binary_file.close()
  41. return obj
  42.  
  43. def read_file(path):
  44. lines = []
  45. with io.open(path, 'r', encoding='utf-8') as content_file:
  46. for line in content_file:
  47. lines += [line]
  48. return lines
  49.  
  50. def remove_digits(text):
  51. return re.sub(r'\d', '', text)
  52.  
  53. def remove_non_aplhanumeric(text):
  54. return re.sub(r'[^\w\s]', '', text)
  55. def remove_newlines(text):
  56. return re.sub(r'[\t\n\r]', ' ', text)
  57.  
  58. text_corpus_data = ['lab4/dramat.txt', 'lab4/popul.txt', 'lab4/proza.txt', 'lab4/publ.txt', 'lab4/wp.txt']
  59.  
  60. def process_text_corpus(text):
  61. text = remove_newlines(text)
  62. text = remove_digits(text)
  63. text = text.lower()
  64. text = remove_non_aplhanumeric(text)
  65. text = text.split()
  66. # text = ' '.join(text)
  67. # text = text.split(' ')
  68. # text = map(lambda w: w.lower(), text)
  69. # text = map(remove_digits, text)
  70. # text = map(remove_non_aplhanumeric, text)
  71. # text = filter(lambda w: w != '', text)
  72. return text
  73.  
  74. def read_text_corpus_data(text_corpus_data, forms):
  75. words = []
  76. for file in text_corpus_data:
  77. text = open(file).read()
  78. text_corpus = process_text_corpus(text)
  79. words += [*text_corpus]
  80. print(words)
  81. return [word for word in words if word in forms]
  82.  
  83. def read_forms(file):
  84. forms = read_file(file)
  85. forms = map(remove_non_aplhanumeric, forms)
  86. return set(forms)
  87. def get_Pc(word, text_corpus, forms, word_count):
  88. return (word_count[word] + 1) / (len(text_corpus) + len(forms))
  89.  
  90. def count_words(forms, text_corpus, recreate=False, cache=True):
  91. filename = 'word_count.bin'
  92. if os.path.isfile(filename) and not recreate:
  93. word_count = deserialize(filename)
  94. else:
  95. word_count = {word:0 for word in forms}
  96. print(len(text_corpus))
  97. for i, word in enumerate(text_corpus):
  98. if i % 1000 == 0:
  99. print(i)
  100. word_count[word] += 1
  101. if cache:
  102. serialize(word_count, filename)
  103. return word_count
  104.  
  105.  
# Script driver: runs at import time and defines the module-level names
# ``forms``, ``text_corpus`` and ``word_count``.

# Set of known word forms read from the forms file.
forms = read_forms('lab4/formy.txt')

# Corpus tokens from all files in text_corpus_data, restricted to known forms.
text_corpus = read_text_corpus_data(text_corpus_data, forms)
# print(text_corpus)
# Occurrence count per form; with the default arguments this is loaded from
# 'word_count.bin' when that file exists, otherwise computed and cached there.
word_count = count_words(forms, text_corpus)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement