Advertisement
Mancolo

Untitled

May 11th, 2023
930
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.55 KB | None | 0 0
  1. import re
  2. from typing import Dict
  3. import copy
  4.  
  5. def validate(word):
  6.     return ''.join(i for i in word if i.isalpha())
  7.  
  8.  
  9. def match_words(words: list, reg: str) -> list:
  10.     return [w for w in words if re.match(reg, w)]
  11.  
  12.  
  13. def count_words(text: str) -> tuple:
  14.     words = [validate(w) for w in text.lower().split()]
  15.     word_dict = {}
  16.     tot = 0
  17.     for word in words:
  18.         words_same_size = word_dict.setdefault(len(word), {})
  19.         words_same_size[word] = words_same_size.get(word, 0) + 1
  20.         tot += 1
  21.  
  22.         word_dict[len(word)] = words_same_size
  23.  
  24.     for k in word_dict:
  25.         for j in word_dict[k]:
  26.             word_dict[k][j] /= tot * 0.01
  27.  
  28.     for k in word_dict:
  29.         si = list(word_dict[k].items())
  30.         si.sort(key=lambda x: -x[1])
  31.         word_dict[k] = si
  32.  
  33.     return word_dict, set(words)
  34.  
  35.  
  36. def count_chars(s: str) -> list:
  37.     s = s.lower()
  38.     d = {}
  39.     tot = 0
  40.     for c in s:
  41.         if c.isalpha() and c not in "quote":
  42.             d[c] = d.get(c, 0) + 1
  43.             tot += 1
  44.  
  45.     for k in d:
  46.         d[k] /= tot * 0.01
  47.  
  48.     si = list(d.items())
  49.     si.sort(key=lambda x: -x[1])
  50.  
  51.     return si
  52.  
  53.  
  54. def is_checkable(word: str, key: Dict) -> bool:
  55.     return any(key[k] is not None and k in word for k in key)
  56.  
  57.  
  58. def get_unknown_chars(key: Dict):
  59.     alph = 'абвгдеёжзийклмнопрстуфчцчшщъыьэюяңөү'
  60.     return ''.join((c for c in alph if c not in key.values()))
  61.  
  62.  
  63. def get_unknown_keys(key: Dict) -> str:
  64.     return ''.join(c for c in key if key[c] is None)
  65.  
  66.  
  67. def check_pattern(match: str, word: str) -> bool:
  68.     return all((match[i] == match[j]) == (word[i] == word[j]) for i in range(len(word) - 1) for j in range(i, len(word)))
  69.  
  70.  
  71. def generate_pattern(word, key, counted_encoding_chars):
  72.     reg = ''
  73.     for s in word:
  74.         if s in key.values():
  75.             reg += list(key.keys())[list(key.values()).index(s)]
  76.         elif s in 'нкы':
  77.             reg += f'[{"".join(c[0] for c in counted_encoding_chars[1:4])}]'
  78.         else:
  79.             reg += f'[{get_unknown_keys(key)}]'
  80.  
  81.     print(reg)
  82.     return reg + '$'
  83.  
  84.  
  85. def test():
  86.     print("TEST")
  87.     return "Bye!!!!"
  88.  
  89.  
  90. def main():
  91.     encoded_text = open('D:/Codes/hwPy/teory/text.txt', 'r', encoding='UTF-8').read()
  92.     sample_text = open('D:/Codes/hwPy/teory/Manas-eposu-2010-S-Karalaev.txt', 'r', encoding='UTF-8').read().replace('www.el-sozduk.kg', '')
  93.  
  94.     counted_chars = count_chars(sample_text)
  95.     counted_encoding_chars = count_chars(encoded_text)
  96.  
  97.     _,  encoded_words = count_words(encoded_text)
  98.     _,  sample_words = count_words(sample_text)
  99.  
  100.     key = dict()
  101.     for ec in counted_encoding_chars:
  102.         key[ec[0]] = None
  103.  
  104.     key[counted_encoding_chars[0][0]] = 'а'
  105.  
  106.     counted_encoding_chars_dict = dict(counted_encoding_chars)
  107.     counted_chars_dict = dict(counted_chars)
  108.  
  109.     print(counted_chars)
  110.     print(counted_encoding_chars)
  111.  
  112.     unknown_chars = ''.join([c[0] for c in counted_chars[2:]])
  113.  
  114.     print(unknown_chars)
  115.  
  116.     print(key)
  117.  
  118.     # bias = 0.05
  119.  
  120.     must_words = ["баатыр", 'манас', 'каныкей']
  121.  
  122.     for word in must_words:
  123.         matches = match_words(encoded_words, generate_pattern(word, key, counted_encoding_chars))
  124.         print(matches)
  125.         if len(matches) == 1:
  126.             for i in range(len(word)):
  127.                 key[matches[0][i]] = word[i]
  128.  
  129.     while all(map(lambda x: x is not None, key.keys())):
  130.         print(key)
  131.  
  132.         past_key = copy.deepcopy(key)
  133.  
  134.         examples = []
  135.  
  136.         for word in encoded_words:
  137.             if is_checkable(word, key):
  138.                 reg = ''
  139.  
  140.                 for s in word:
  141.                     if key.get(s, None) is None:
  142.                         reg += f'[{get_unknown_chars(key)}]'
  143.                     else:
  144.                         reg += key[s]
  145.  
  146.                 reg += '$'
  147.  
  148.                 # print(word, reg)
  149.  
  150.                 matches = match_words(sample_words, reg)
  151.                 if word == 'лңк':
  152.                     print(matches, reg)
  153.                 matches = list(filter(lambda x: check_pattern(x, word), matches))
  154.  
  155.                 # if matches:
  156.                 #     print(matches)
  157.  
  158.                 if len(matches) == 1 and any(map(lambda s: s not in key.values(), matches[0])):
  159.                     # print(matches[0])
  160.                     err = 0.0
  161.                     is_valid = True
  162.  
  163.                     for i in range(len(matches[0])):
  164.                         if key[word[i]] is None:
  165.                             err += abs(counted_encoding_chars_dict[word[i]] * 1.0 / counted_chars_dict[matches[0][i]] - 1)
  166.  
  167.                             if matches[0][i] in list(key.values()):
  168.                                 is_valid = False
  169.                                 break
  170.  
  171.                     if is_valid:
  172.                         examples.append((word, matches[0], err/len(word)))
  173.  
  174.         if len(examples) == 0:
  175.             break
  176.  
  177.         examples.sort(key=lambda x: x[2])
  178.  
  179.         for i in range(len(examples[0][0])):
  180.             key[examples[0][0][i]] = examples[0][1][i]
  181.  
  182.         print(examples[0])
  183.  
  184.     text = ''
  185.  
  186.     for w in encoded_text:
  187.         w = w.lower()
  188.         if w.isalpha():
  189.             text += key[w] if key.get(w, None) is not None else 'X'
  190.         else:
  191.             text += w
  192.  
  193.     # print(''.join([k for k in key.values() if k is not None]))
  194.     # print(''.join([k for k in key.keys() if key[k] is not None]))
  195.     print(text)
  196.     # print(key)
  197.     # print(sample_words)
  198.  
  199.     # print(list(key.values()).count(None))
  200. print(main())
  201.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement