Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from collections import defaultdict, Counter
- from typing import Dict, List
def create_symbols(word):
    """Return *word* with every non-alphabetic character removed.

    The remaining letters keep their original order. A token that contains
    no letters at all collapses to the empty string.
    """
    letters = [ch for ch in word if ch.isalpha()]
    return ''.join(letters)
def match_words(words: list, reg: str) -> list:
    """Collect the entries of *words* that *reg* matches from their start.

    ``re.match`` only anchors at the beginning of the string, so the pattern
    itself must end with ``$`` if a whole-word match is required.

    Args:
        words: Iterable of candidate strings.
        reg: Regular expression source.

    Returns:
        A list with the matching candidates, in iteration order.
    """
    return list(filter(lambda candidate: re.match(reg, candidate) is not None, words))
def count_words(text: str) -> tuple:
    """Compute per-length word frequency tables for *text*.

    The text is lower-cased and split on whitespace, and each token is
    reduced to its letters with ``create_symbols``. Tokens that contain no
    letters become the empty string and are still counted.

    Returns:
        A pair ``(by_length, vocabulary)``. ``by_length`` maps a word length
        to a list of ``(word, percent)`` tuples ordered by descending
        percent, where percent is the word's share of all tokens.
        ``vocabulary`` is the set of distinct cleaned tokens.
    """
    cleaned = [create_symbols(token) for token in text.lower().split()]
    total = len(cleaned)

    by_length = defaultdict(lambda: defaultdict(int))
    for item in cleaned:
        by_length[len(item)][item] += 1

    # Replace each inner counter with its sorted percentage table. Only
    # existing keys are re-bound, so the mapping's size never changes.
    for size in list(by_length):
        shares = [(item, hits / (total * 0.01)) for item, hits in by_length[size].items()]
        by_length[size] = sorted(shares, key=lambda pair: pair[1], reverse=True)

    return by_length, set(cleaned)
def count_chars(s: str) -> list:
    """Return the letter frequencies of *s* as percentages.

    The string is lower-cased and only alphabetic characters are counted.

    Returns:
        A list of ``(letter, percent)`` tuples ordered by descending
        percent; letters with equal shares keep first-occurrence order.
        An input without letters gives an empty list.
    """
    letters = [ch for ch in s.lower() if ch.isalpha()]
    tally = Counter(letters)
    total = sum(tally.values())
    shares = [(ch, hits / (total * 0.01)) for ch, hits in tally.items()]
    shares.sort(key=lambda pair: pair[1], reverse=True)
    return shares
def is_checkable(word: str, key: Dict[str, str]) -> bool:
    """Tell whether *word* contains at least one already-decoded symbol.

    Args:
        word: Encoded word to inspect.
        key: Mapping from cipher symbol to decoded letter, or ``None`` when
            the symbol is still unresolved.

    Returns:
        ``True`` if some symbol with a non-``None`` mapping occurs in *word*.
    """
    for symbol, decoded in key.items():
        if decoded is not None and symbol in word:
            return True
    return False
def get_unknown_chars(key: Dict[str, str]) -> str:
    """Return the alphabet letters that no cipher symbol is mapped to yet.

    Args:
        key: Mapping from cipher symbol to decoded letter (``None`` while
            unresolved).

    Returns:
        The letters of the 36-letter alphabet below that do not occur among
        the values of *key*, in alphabet order.
    """
    # The previous literal read '...уфчцчш...': 'ч' was listed twice and 'х'
    # was missing, so 'х' could never be offered as a candidate letter.
    alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюяңөү'
    taken = set(key.values())
    return ''.join(c for c in alphabet if c not in taken)
def get_unknown_keys(key: Dict[str, str]) -> str:
    """Return the cipher symbols of *key* that are still mapped to ``None``.

    The symbols are concatenated in the dictionary's iteration order.
    """
    pending = [symbol for symbol, decoded in key.items() if decoded is None]
    return ''.join(pending)
def check_pattern(match: str, word: str) -> bool:
    """Check that *match* and *word* repeat characters at the same positions.

    For every pair of positions within *word*, the two characters of *match*
    must be equal exactly when the two characters of *word* are equal.

    Args:
        match: Candidate plain-text word; indexed with positions of *word*.
        word: Encoded word whose repetition pattern is the reference.

    Returns:
        ``True`` when the repetition patterns agree, ``False`` otherwise.
    """
    length = len(word)
    for left in range(length - 1):
        for right in range(left, length):
            same_in_match = match[left] == match[right]
            same_in_word = word[left] == word[right]
            if same_in_match != same_in_word:
                return False
    return True
def _char_class(chars: str) -> str:
    """Build a regex character class from *chars*, or a never-matching token.

    An empty ``[]`` is not a valid empty class: the regex parser treats the
    ``]`` as a literal member, so the pattern is either rejected or silently
    swallows the following text. ``(?!)`` fails at every position instead.
    """
    return f'[{chars}]' if chars else '(?!)'


def generate_pattern(word: str, key: Dict[str, str], counted_encoding_chars: List[List[str]]) -> str:
    """Build a regex that matches possible encodings of the plain *word*.

    Each letter of *word* contributes one element to the pattern:

    * a letter already present among the values of *key* becomes the first
      cipher symbol mapped to it;
    * the letters ``н``, ``к`` and ``ы`` become a class of the cipher symbols
      at positions 1-3 of *counted_encoding_chars*;
    * any other letter becomes a class of all still-unresolved cipher
      symbols.

    Args:
        word: Plain-text word to search for.
        key: Mapping from cipher symbol to decoded letter (``None`` while
            unresolved).
        counted_encoding_chars: Sequence whose items have the cipher symbol
            at index 0; presumably the frequency-sorted output of
            ``count_chars`` -- verify against the caller.

    Returns:
        The pattern source, terminated with ``$``.
    """
    # Reverse map built once; setdefault keeps the first symbol per letter,
    # which is what the list.index() lookup used to return.
    symbol_for = {}
    for symbol, letter in key.items():
        if letter is not None:
            symbol_for.setdefault(letter, symbol)

    # Neither class depends on the current letter, so build them up front.
    frequent_class = _char_class(''.join(c[0] for c in counted_encoding_chars[1:4]))
    unknown_class = _char_class(get_unknown_keys(key))

    parts = []
    for s in word:
        if s in symbol_for:
            parts.append(symbol_for[s])
        elif s in 'нкы':
            parts.append(frequent_class)
        else:
            parts.append(unknown_class)
    return ''.join(parts) + '$'
def test():
    """Print the marker line ``TEST`` and return a farewell string."""
    print("TEST")
    farewell = "Bye!!!!"
    return farewell
def main(encoded_path='D:/Codes/hwPy/teory/text.txt',
         sample_path='D:/Codes/hwPy/teory/Manas-eposu-2010-S-Karalaev.txt'):
    """Interactively decode a substitution-ciphered text using a sample corpus.

    Steps:
        1. Read the encoded text and the sample text and compute their
           letter frequencies and vocabularies.
        2. Start a key that maps every cipher symbol to ``None`` and the
           most frequent cipher symbol to ``а``.
        3. Let the user edit the key on the console until an empty line is
           entered.
        4. Try to locate a few fixed words in the encoded vocabulary and
           take their letters into the key when the match is unique.
        5. Repeatedly pick the encoded word that has exactly one compatible
           sample word and the smallest frequency error, and extend the key
           with it, until no such word is left.
        6. Print the encoded text with known symbols replaced and unknown
           letters shown as ``X``.

    Args:
        encoded_path: Path of the UTF-8 file holding the encoded text.
        sample_path: Path of the UTF-8 sample corpus.

    Returns:
        ``None``; all results are printed.

    NOTE(review): the copy this was recovered from had lost its indentation;
    the nesting below is reconstructed from the control flow. In particular
    the final decoding step is assumed to run once after the search loop --
    confirm against the author's original.
    """
    # Context managers close the files; they used to stay open.
    with open(encoded_path, 'r', encoding='UTF-8') as encoded_file:
        encoded_text = encoded_file.read()
    with open(sample_path, 'r', encoding='UTF-8') as sample_file:
        sample_text = sample_file.read().replace('www.el-sozduk.kg', '')

    counted_chars = count_chars(sample_text)
    counted_encoding_chars = count_chars(encoded_text)
    _, encoded_words = count_words(encoded_text)
    _, sample_words = count_words(sample_text)

    key = {ec[0]: None for ec in counted_encoding_chars}
    if counted_encoding_chars:
        # Seed: the most frequent cipher symbol is taken to be 'а'.
        key[counted_encoding_chars[0][0]] = 'а'

    while True:
        # Ask which cipher symbol should be re-mapped.
        print(key)
        char_to_replace = input("Введите символ, который нужно заменить (или нажмите Enter для выхода): ")
        if not char_to_replace:
            break
        if char_to_replace in key:
            # Ask for the letter the chosen symbol should decode to.
            new_char = input(f"Введите новое значение для символа '{char_to_replace}': ")
            # Any other symbol already decoding to that letter is reset.
            for k, v in key.items():
                if v == new_char and k != char_to_replace:
                    print(f"Символ '{k}' также был заменен на '{new_char}'.")
                    key[k] = None
            # Typing the literal text None leaves the mapping untouched.
            if new_char != 'None':
                key[char_to_replace] = new_char
        else:
            print(f"Символ '{char_to_replace}' не найден в ключе.")

    counted_encoding_chars_dict = dict(counted_encoding_chars)
    counted_chars_dict = dict(counted_chars)
    print(counted_chars)
    print(counted_encoding_chars)
    unknown_chars = ''.join([c[0] for c in counted_chars[2:]])
    print(unknown_chars)

    # Words that are looked up in the encoded text before the search starts.
    must_words = ["каныкей", 'деп', 'бейбак']
    for word in must_words:
        matches = match_words(encoded_words, generate_pattern(word, key, counted_encoding_chars))
        if len(matches) == 1:
            for symbol, char in zip(matches[0], word):
                key[symbol] = char

    # The former loop condition tested key.keys() for None, which is always
    # true; the loop ends only through the break below.
    while True:
        print(key)
        # The key is not modified while the words are scanned, so these can
        # be computed once per round.
        used_letters = set(key.values())
        unknown = get_unknown_chars(key)
        # '[]' is not a valid empty class; '(?!)' never matches.
        unknown_class = f'[{unknown}]' if unknown else '(?!)'

        examples = []
        for word in encoded_words:
            if not is_checkable(word, key):
                continue
            reg = ''.join(unknown_class if key.get(s) is None else key[s] for s in word) + '$'
            matches = match_words(sample_words, reg)
            if word == 'лңк':
                # Debug output for one specific encoded word.
                print(matches, reg)
            matches = [m for m in matches if check_pattern(m, word)]
            if len(matches) != 1:
                continue
            candidate = matches[0]
            if all(letter in used_letters for letter in candidate):
                # Nothing new to learn from this word.
                continue

            err = 0.0
            is_valid = True
            for symbol, letter in zip(word, candidate):
                if key[symbol] is None:
                    # Relative deviation between cipher-symbol and letter
                    # frequencies.
                    err += abs(counted_encoding_chars_dict[symbol] * 1.0 / counted_chars_dict[letter] - 1)
                    if letter in used_letters:
                        # An unresolved symbol may not decode to a letter
                        # that another symbol already produces.
                        is_valid = False
                        break
            if is_valid:
                examples.append((word, candidate, err / len(word)))

        if not examples:
            break
        # min() returns the first minimum, like a stable sort followed by [0].
        best = min(examples, key=lambda x: x[2])
        for symbol, letter in zip(best[0], best[1]):
            key[symbol] = letter
        print(best)

    decoded = []
    for w in encoded_text:
        w = w.lower()
        if w.isalpha():
            decoded.append(key[w] if key.get(w) is not None else 'X')
        else:
            decoded.append(w)
    print(''.join(decoded))
# Run only when executed as a script. main() returns None, so its result is
# no longer printed (this used to emit a stray "None" line).
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement