Advertisement
Guest User

Untitled

a guest
May 25th, 2019
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.85 KB | None | 0 0
  1. import re
  2. from abc import ABC, abstractmethod
  3. import editdistance
  4. import pymorphy2
  5.  
  6.  
  7. class Soundex(ABC):
  8. _vowels = ''
  9. _table = str.maketrans('', '')
  10. _reduce_regex = re.compile(r'(\w)(\1)+', re.IGNORECASE)
  11. _vowels_regex = re.compile(r'(0+)', re.IGNORECASE)
  12.  
  13. def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
  14. delete_zeros=False, cut_result=False, seq_cutted_len=4):
  15. """
  16. Initialization of Soundex object
  17. :param delete_first_letter: remove the first letter from the result code (A169 -> 169)
  18. :param delete_first_coded_letter: remove the first coded letter from the result code (A5169 -> A169)
  19. :param delete_zeros: remove vowels from the result code
  20. :param cut_result: cut result core till N symbols
  21. :param seq_cutted_len: length of the result code
  22. """
  23. self.delete_first_letter = delete_first_letter
  24. self.delete_first_coded_letter = delete_first_coded_letter
  25. self.delete_zeros = delete_zeros
  26. self.cut_result = cut_result
  27. self.seq_cutted_len = seq_cutted_len
  28.  
  29. def _is_vowel(self, letter):
  30. return letter in self._vowels
  31.  
  32. def _reduce_seq(self, seq):
  33. return self._reduce_regex.sub(r'\1', seq)
  34.  
  35. def _translate_vowels(self, word):
  36. return ''.join('0' if self._is_vowel(letter) else letter for letter in word)
  37.  
  38. def _remove_vowels_and_paired_sounds(self, seq):
  39. seq = self._vowels_regex.sub('', seq)
  40. seq = self._reduce_seq(seq)
  41. return seq
  42.  
  43. def _apply_soundex_algorithm(self, word):
  44. word = word.lower()
  45. first, last = word[0], word
  46. last = last.translate(self._table)
  47. last = self._translate_vowels(last)
  48. last = self._reduce_seq(last)
  49. if self.delete_zeros:
  50. last = self._remove_vowels_and_paired_sounds(last)
  51. if self.cut_result:
  52. last = last[:self.seq_cutted_len] if len(last) >= self.seq_cutted_len else last
  53. last += ('0' * (self.seq_cutted_len - len(last)))
  54. if self.delete_first_coded_letter:
  55. last = last[1:]
  56. first_char = '' if self.delete_first_letter else first.capitalize()
  57. return first_char + last.upper()
  58.  
  59. def get_vowels(self):
  60. return self._vowels
  61.  
  62. def is_delete_first_coded_letter(self):
  63. return self.delete_first_coded_letter
  64.  
  65. def is_delete_first_letter(self):
  66. return self.delete_first_letter
  67.  
  68. @abstractmethod
  69. def transform(self, word):
  70. """
  71. Converts a given word th Soundex code
  72. :param word: string
  73. :return: Soundex string code
  74. """
  75. return None
  76.  
  77.  
  78. class EnglishSoundex(Soundex):
  79. _hw_replacement = re.compile(r'[hw]', re.IGNORECASE)
  80.  
  81. _vowels = 'aeiouy'
  82. _table = str.maketrans('bpfvcksgjqxzdtlmnr', '112233344555667889')
  83.  
  84. def transform(self, word):
  85. word = self._hw_replacement.sub('', word)
  86. return self._apply_soundex_algorithm(word)
  87.  
  88.  
  89. class RussianSoundex(Soundex):
  90. _vowels = 'аэиоуыеёюя'
  91. _vowels_table = str.maketrans('аяоыиеёэюу', 'AAABBBBBCC')
  92. _table = str.maketrans('бпвфгкхдтжшчщзсцлмнр', '11223334455556667889')
  93. _ego_ogo_endings = re.compile(r'([ео])(г)(о$)', re.IGNORECASE)
  94. _ia_ending = re.compile(r'[еи][ая]', re.IGNORECASE)
  95. _ii_ending = re.compile(r'и[еио]', re.IGNORECASE)
  96.  
  97. _replacement_map = {
  98. re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(я)', re.IGNORECASE): 'jа',
  99. re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ю)', re.IGNORECASE): 'jу',
  100. re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(е)', re.IGNORECASE): 'jэ',
  101. re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ё)', re.IGNORECASE): 'jо',
  102. re.compile(r'й', re.IGNORECASE): 'j',
  103. re.compile(r'([тсзжцчшщ])([жцчшщ])', re.IGNORECASE): r'\2',
  104. re.compile(r'(с)(т)([лнц])', re.IGNORECASE): r'\1\3',
  105. re.compile(r'(н)([тд])(ств)', re.IGNORECASE): r'\1\3',
  106. re.compile(r'([нс])([тд])(ск)', re.IGNORECASE): r'\1\3',
  107. re.compile(r'(р)(д)([чц])', re.IGNORECASE): r'\1\3',
  108. re.compile(r'(з)(д)([нц])', re.IGNORECASE): r'\1\3',
  109. re.compile(r'(в)(ств)', re.IGNORECASE): r'\2',
  110. re.compile(r'(л)(нц)', re.IGNORECASE): r'\2',
  111. re.compile(r'[ъь]', re.IGNORECASE): '',
  112. re.compile(r'([дт][зсц])', re.IGNORECASE): 'ц'
  113. }
  114.  
  115. def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
  116. delete_zeros=False, cut_result=False, seq_cutted_len=4,
  117. code_vowels=False, use_morph_analysis=False):
  118. """
  119. Initialization of Russian Soundex object
  120. :param delete_first_letter:
  121. :param delete_first_coded_letter:
  122. :param delete_zeros:
  123. :param cut_result:
  124. :param seq_cutted_len:
  125. :param use_morph_analysis: use morphological grammems for phonemes analysis
  126. :param code_vowels: group and code vowels as ABC letters
  127. """
  128. super(RussianSoundex, self).__init__(delete_first_letter, delete_first_coded_letter,
  129. delete_zeros, cut_result, seq_cutted_len)
  130.  
  131. self.code_vowels = code_vowels
  132. self.use_morph_analysis = use_morph_analysis
  133. self._moprh = pymorphy2.MorphAnalyzer()
  134.  
  135. def _translate_vowels(self, word):
  136. if self.code_vowels:
  137. return word.translate(self._vowels_table)
  138. else:
  139. return super(RussianSoundex, self)._translate_vowels(word)
  140.  
  141. def _replace_ego_ogo_endings(self, word):
  142. return self._ego_ogo_endings.sub(r'\1в\3', word)
  143.  
  144. def _use_morph_for_phoneme_replace(self, word):
  145. parse = self._moprh.parse(word)
  146. if parse and ('ADJF' in parse[0].tag or 'NUMB' in parse[0].tag or 'NPRO' in parse[0].tag):
  147. word = self._replace_ego_ogo_endings(word)
  148. return word
  149.  
  150. def _replace_vowels_seq(self, word):
  151. word = self._ii_ending.sub('и', word)
  152. word = self._ia_ending.sub('я', word)
  153. return word
  154.  
  155. def transform(self, word):
  156. if self.use_morph_analysis:
  157. word = self._use_morph_for_phoneme_replace(word)
  158. for replace, result in self._replacement_map.items():
  159. word = replace.sub(result, word)
  160. if self.code_vowels:
  161. word = self._replace_vowels_seq(word)
  162. return self._apply_soundex_algorithm(word)
  163.  
  164.  
  165. class SoundexSimilarity:
  166. def __init__(self, soundex, metrics=editdistance.eval):
  167. """
  168. Init a similarity object
  169. :param soundex: an object of Soundex class
  170. :param metrics: similarity function, optional, default is Levenstein distance
  171. """
  172. self.soundex_converter = soundex
  173. self.metrics = metrics
  174.  
  175. def similarity(self, word1, word2):
  176. """
  177. Compute the similarity between Soundex codes
  178. :param word1: first original word
  179. :param word2: second original word
  180. :return: distance value
  181. """
  182. w1, w2 = self.soundex_converter.transform(word1), self.soundex_converter.transform(word2)
  183. if self.soundex_converter.is_delete_first_letter():
  184. return self.metrics(w1, w2)
  185. return self.metrics(w1[1:], w2[1:])
  186.  
  187.  
  188. if __name__ == '__main__':
  189. en_soundex = EnglishSoundex(delete_first_coded_letter=True,
  190. cut_result=True, delete_zeros=True)
  191. assert en_soundex.transform('Robert') == 'R196'
  192. assert en_soundex.transform('Rubin') == 'R180'
  193. assert en_soundex.transform('Rupert') == en_soundex.transform('Robert')
  194. assert en_soundex.transform('Ashcraft') == 'A926'
  195. assert en_soundex.transform('Ashcraft') == en_soundex.transform('Ashcroft')
  196. assert en_soundex.transform('Tymczak') == 'T835'
  197.  
  198. ru_soundex = RussianSoundex()
  199. assert ru_soundex.transform('ёлочка') == 'JJ070530'
  200. assert ru_soundex.transform('ёлочка') == ru_soundex.transform('йолочка')
  201. assert ru_soundex.transform('кот') == ru_soundex.transform('код')
  202. assert ru_soundex.transform('медь') == ru_soundex.transform('меть')
  203. assert ru_soundex.transform('девчонка') == ru_soundex.transform('девчёнка')
  204. assert ru_soundex.transform('детский') == ru_soundex.transform('децкий')
  205. assert ru_soundex.transform('двацать') == ru_soundex.transform('двадцать')
  206. assert ru_soundex.transform('сница') == ru_soundex.transform('сниться')
  207. assert ru_soundex.transform('воротца') == ru_soundex.transform('вороца')
  208. assert ru_soundex.transform('гигантский') == ru_soundex.transform('гиганский')
  209. assert ru_soundex.transform('марксистский') == ru_soundex.transform('марксисский')
  210. assert ru_soundex.transform('чувствовать') == ru_soundex.transform('чуствовать')
  211. assert ru_soundex.transform('праздник') == ru_soundex.transform('празник')
  212. assert ru_soundex.transform('шчастье') == ru_soundex.transform('счастье')
  213. assert ru_soundex.transform('том') == ru_soundex.transform('тон')
  214. assert ru_soundex.transform('щастье') == 'Щ5064J0'
  215. assert ru_soundex.transform('счастье') == 'Ч5064J0'
  216. assert ru_soundex.transform('агенство') == ru_soundex.transform('агентство')
  217. assert ru_soundex.transform('театр') == ru_soundex.transform('тятр')
  218. assert ru_soundex.transform('сонце') == ru_soundex.transform('солнце')
  219. assert ru_soundex.transform('серце') == ru_soundex.transform('сердце')
  220. assert ru_soundex.transform('считать') == 'Ч50404'
  221. assert ru_soundex.transform('щитать') == 'Щ50404'
  222.  
  223. ru_soundex = RussianSoundex(use_morph_analysis=True, code_vowels=True)
  224. assert ru_soundex.transform('зелёного') == 'З6B7B8A2A'
  225. assert ru_soundex.transform('никого') == 'Н8B3A2A'
  226. assert ru_soundex.transform('ничего') == 'Н8B5B2A'
  227. assert ru_soundex.transform('много') == 'М8A3A'
  228.  
  229. ru_soundex = RussianSoundex(delete_first_letter=True)
  230. similarity_checker = SoundexSimilarity(ru_soundex)
  231. assert similarity_checker.similarity('щастье', 'счастье') == 0
  232. assert similarity_checker.similarity('считать', 'щитать') == 0
  233. assert similarity_checker.similarity('зуд', 'суд') == 0
  234. assert similarity_checker.similarity('мощь', 'мочь') == 0
  235. assert similarity_checker.similarity('ночь', 'мочь') == 0
  236. assert similarity_checker.similarity('сахар', 'цукер') == 0
  237. assert similarity_checker.similarity('булочная', 'булошная') == 0
  238. assert similarity_checker.similarity('булочная', 'булошная') == 0
  239. assert similarity_checker.similarity('блеснуть', 'блестнуть') == 0
  240. assert similarity_checker.similarity('ненасный', 'ненастный') == 0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement