SHARE
TWEET

Untitled

a guest May 25th, 2019 72 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import re
  2. from abc import ABC, abstractmethod
  3. import editdistance
  4. import pymorphy2
  5.  
  6.  
  7. class Soundex(ABC):
  8.     _vowels = ''
  9.     _table = str.maketrans('', '')
  10.     _reduce_regex = re.compile(r'(\w)(\1)+', re.IGNORECASE)
  11.     _vowels_regex = re.compile(r'(0+)', re.IGNORECASE)
  12.  
  13.     def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
  14.                  delete_zeros=False, cut_result=False, seq_cutted_len=4):
  15.         """
  16.         Initialization of Soundex object
  17.         :param delete_first_letter: remove the first letter from the result code (A169 -> 169)
  18.         :param delete_first_coded_letter: remove the first coded letter from the result code (A5169 -> A169)
  19.         :param delete_zeros: remove vowels from the result code
  20.         :param cut_result: cut result core till N symbols
  21.         :param seq_cutted_len: length of the result code
  22.         """
  23.         self.delete_first_letter = delete_first_letter
  24.         self.delete_first_coded_letter = delete_first_coded_letter
  25.         self.delete_zeros = delete_zeros
  26.         self.cut_result = cut_result
  27.         self.seq_cutted_len = seq_cutted_len
  28.  
  29.     def _is_vowel(self, letter):
  30.         return letter in self._vowels
  31.  
  32.     def _reduce_seq(self, seq):
  33.         return self._reduce_regex.sub(r'\1', seq)
  34.  
  35.     def _translate_vowels(self, word):
  36.         return ''.join('0' if self._is_vowel(letter) else letter for letter in word)
  37.  
  38.     def _remove_vowels_and_paired_sounds(self, seq):
  39.         seq = self._vowels_regex.sub('', seq)
  40.         seq = self._reduce_seq(seq)
  41.         return seq
  42.  
  43.     def _apply_soundex_algorithm(self, word):
  44.         word = word.lower()
  45.         first, last = word[0], word
  46.         last = last.translate(self._table)
  47.         last = self._translate_vowels(last)
  48.         last = self._reduce_seq(last)
  49.         if self.delete_zeros:
  50.             last = self._remove_vowels_and_paired_sounds(last)
  51.         if self.cut_result:
  52.             last = last[:self.seq_cutted_len] if len(last) >= self.seq_cutted_len else last
  53.             last += ('0' * (self.seq_cutted_len - len(last)))
  54.         if self.delete_first_coded_letter:
  55.             last = last[1:]
  56.         first_char = '' if self.delete_first_letter else first.capitalize()
  57.         return first_char + last.upper()
  58.  
  59.     def get_vowels(self):
  60.         return self._vowels
  61.  
  62.     def is_delete_first_coded_letter(self):
  63.         return self.delete_first_coded_letter
  64.  
  65.     def is_delete_first_letter(self):
  66.         return self.delete_first_letter
  67.  
  68.     @abstractmethod
  69.     def transform(self, word):
  70.         """
  71.         Converts a given word th Soundex code
  72.         :param word: string
  73.         :return: Soundex string code
  74.         """
  75.         return None
  76.  
  77.  
  78. class EnglishSoundex(Soundex):
  79.     _hw_replacement = re.compile(r'[hw]', re.IGNORECASE)
  80.  
  81.     _vowels = 'aeiouy'
  82.     _table = str.maketrans('bpfvcksgjqxzdtlmnr', '112233344555667889')
  83.  
  84.     def transform(self, word):
  85.         word = self._hw_replacement.sub('', word)
  86.         return self._apply_soundex_algorithm(word)
  87.  
  88.  
  89. class RussianSoundex(Soundex):
  90.     _vowels = 'аэиоуыеёюя'
  91.     _vowels_table = str.maketrans('аяоыиеёэюу', 'AAABBBBBCC')
  92.     _table = str.maketrans('бпвфгкхдтжшчщзсцлмнр', '11223334455556667889')
  93.     _ego_ogo_endings = re.compile(r'([ео])(г)(о$)', re.IGNORECASE)
  94.     _ia_ending = re.compile(r'[еи][ая]', re.IGNORECASE)
  95.     _ii_ending = re.compile(r'и[еио]', re.IGNORECASE)
  96.  
  97.     _replacement_map = {
  98.         re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(я)', re.IGNORECASE): 'jа',
  99.         re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ю)', re.IGNORECASE): 'jу',
  100.         re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(е)', re.IGNORECASE): 'jэ',
  101.         re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ё)', re.IGNORECASE): 'jо',
  102.         re.compile(r'й', re.IGNORECASE): 'j',
  103.         re.compile(r'([тсзжцчшщ])([жцчшщ])', re.IGNORECASE): r'\2',
  104.         re.compile(r'(с)(т)([лнц])', re.IGNORECASE): r'\1\3',
  105.         re.compile(r'(н)([тд])(ств)', re.IGNORECASE): r'\1\3',
  106.         re.compile(r'([нс])([тд])(ск)', re.IGNORECASE): r'\1\3',
  107.         re.compile(r'(р)(д)([чц])', re.IGNORECASE): r'\1\3',
  108.         re.compile(r'(з)(д)([нц])', re.IGNORECASE): r'\1\3',
  109.         re.compile(r'(в)(ств)', re.IGNORECASE): r'\2',
  110.         re.compile(r'(л)(нц)', re.IGNORECASE): r'\2',
  111.         re.compile(r'[ъь]', re.IGNORECASE): '',
  112.         re.compile(r'([дт][зсц])', re.IGNORECASE): 'ц'
  113.     }
  114.  
  115.     def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
  116.                  delete_zeros=False, cut_result=False, seq_cutted_len=4,
  117.                  code_vowels=False, use_morph_analysis=False):
  118.         """
  119.         Initialization of Russian Soundex object
  120.         :param delete_first_letter:
  121.         :param delete_first_coded_letter:
  122.         :param delete_zeros:
  123.         :param cut_result:
  124.         :param seq_cutted_len:
  125.         :param use_morph_analysis: use morphological grammems for phonemes analysis
  126.         :param code_vowels: group and code vowels as ABC letters
  127.         """
  128.         super(RussianSoundex, self).__init__(delete_first_letter, delete_first_coded_letter,
  129.                                              delete_zeros, cut_result, seq_cutted_len)
  130.  
  131.         self.code_vowels = code_vowels
  132.         self.use_morph_analysis = use_morph_analysis
  133.         self._moprh = pymorphy2.MorphAnalyzer()
  134.  
  135.     def _translate_vowels(self, word):
  136.         if self.code_vowels:
  137.             return word.translate(self._vowels_table)
  138.         else:
  139.             return super(RussianSoundex, self)._translate_vowels(word)
  140.  
  141.     def _replace_ego_ogo_endings(self, word):
  142.         return self._ego_ogo_endings.sub(r'\1в\3', word)
  143.  
  144.     def _use_morph_for_phoneme_replace(self, word):
  145.         parse = self._moprh.parse(word)
  146.         if parse and ('ADJF' in parse[0].tag or 'NUMB' in parse[0].tag or 'NPRO' in parse[0].tag):
  147.             word = self._replace_ego_ogo_endings(word)
  148.         return word
  149.  
  150.     def _replace_vowels_seq(self, word):
  151.         word = self._ii_ending.sub('и', word)
  152.         word = self._ia_ending.sub('я', word)
  153.         return word
  154.  
  155.     def transform(self, word):
  156.         if self.use_morph_analysis:
  157.             word = self._use_morph_for_phoneme_replace(word)
  158.         for replace, result in self._replacement_map.items():
  159.             word = replace.sub(result, word)
  160.         if self.code_vowels:
  161.             word = self._replace_vowels_seq(word)
  162.         return self._apply_soundex_algorithm(word)
  163.  
  164.  
  165. class SoundexSimilarity:
  166.     def __init__(self, soundex, metrics=editdistance.eval):
  167.         """
  168.         Init a similarity object
  169.         :param soundex: an object of Soundex class
  170.         :param metrics: similarity function, optional, default is Levenstein distance
  171.         """
  172.         self.soundex_converter = soundex
  173.         self.metrics = metrics
  174.  
  175.     def similarity(self, word1, word2):
  176.         """
  177.         Compute the similarity between Soundex codes
  178.         :param word1: first original word
  179.         :param word2: second original word
  180.         :return: distance value
  181.         """
  182.         w1, w2 = self.soundex_converter.transform(word1), self.soundex_converter.transform(word2)
  183.         if self.soundex_converter.is_delete_first_letter():
  184.             return self.metrics(w1, w2)
  185.         return self.metrics(w1[1:], w2[1:])
  186.  
  187.  
  188. if __name__ == '__main__':
  189.     en_soundex = EnglishSoundex(delete_first_coded_letter=True,
  190.                                 cut_result=True, delete_zeros=True)
  191.     assert en_soundex.transform('Robert') == 'R196'
  192.     assert en_soundex.transform('Rubin') == 'R180'
  193.     assert en_soundex.transform('Rupert') == en_soundex.transform('Robert')
  194.     assert en_soundex.transform('Ashcraft') == 'A926'
  195.     assert en_soundex.transform('Ashcraft') == en_soundex.transform('Ashcroft')
  196.     assert en_soundex.transform('Tymczak') == 'T835'
  197.  
  198.     ru_soundex = RussianSoundex()
  199.     assert ru_soundex.transform('ёлочка') == 'JJ070530'
  200.     assert ru_soundex.transform('ёлочка') == ru_soundex.transform('йолочка')
  201.     assert ru_soundex.transform('кот') == ru_soundex.transform('код')
  202.     assert ru_soundex.transform('медь') == ru_soundex.transform('меть')
  203.     assert ru_soundex.transform('девчонка') == ru_soundex.transform('девчёнка')
  204.     assert ru_soundex.transform('детский') == ru_soundex.transform('децкий')
  205.     assert ru_soundex.transform('двацать') == ru_soundex.transform('двадцать')
  206.     assert ru_soundex.transform('сница') == ru_soundex.transform('сниться')
  207.     assert ru_soundex.transform('воротца') == ru_soundex.transform('вороца')
  208.     assert ru_soundex.transform('гигантский') == ru_soundex.transform('гиганский')
  209.     assert ru_soundex.transform('марксистский') == ru_soundex.transform('марксисский')
  210.     assert ru_soundex.transform('чувствовать') == ru_soundex.transform('чуствовать')
  211.     assert ru_soundex.transform('праздник') == ru_soundex.transform('празник')
  212.     assert ru_soundex.transform('шчастье') == ru_soundex.transform('счастье')
  213.     assert ru_soundex.transform('том') == ru_soundex.transform('тон')
  214.     assert ru_soundex.transform('щастье') == 'Щ5064J0'
  215.     assert ru_soundex.transform('счастье') == 'Ч5064J0'
  216.     assert ru_soundex.transform('агенство') == ru_soundex.transform('агентство')
  217.     assert ru_soundex.transform('театр') == ru_soundex.transform('тятр')
  218.     assert ru_soundex.transform('сонце') == ru_soundex.transform('солнце')
  219.     assert ru_soundex.transform('серце') == ru_soundex.transform('сердце')
  220.     assert ru_soundex.transform('считать') == 'Ч50404'
  221.     assert ru_soundex.transform('щитать') == 'Щ50404'
  222.  
  223.     ru_soundex = RussianSoundex(use_morph_analysis=True, code_vowels=True)
  224.     assert ru_soundex.transform('зелёного') == 'З6B7B8A2A'
  225.     assert ru_soundex.transform('никого') == 'Н8B3A2A'
  226.     assert ru_soundex.transform('ничего') == 'Н8B5B2A'
  227.     assert ru_soundex.transform('много') == 'М8A3A'
  228.  
  229.     ru_soundex = RussianSoundex(delete_first_letter=True)
  230.     similarity_checker = SoundexSimilarity(ru_soundex)
  231.     assert similarity_checker.similarity('щастье', 'счастье') == 0
  232.     assert similarity_checker.similarity('считать', 'щитать') == 0
  233.     assert similarity_checker.similarity('зуд', 'суд') == 0
  234.     assert similarity_checker.similarity('мощь', 'мочь') == 0
  235.     assert similarity_checker.similarity('ночь', 'мочь') == 0
  236.     assert similarity_checker.similarity('сахар', 'цукер') == 0
  237.     assert similarity_checker.similarity('булочная', 'булошная') == 0
  238.     assert similarity_checker.similarity('булочная', 'булошная') == 0
  239.     assert similarity_checker.similarity('блеснуть', 'блестнуть') == 0
  240.     assert similarity_checker.similarity('ненасный', 'ненастный') == 0
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top