Advertisement
oleh_korkh

Untitled

Jan 2nd, 2018
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.69 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3.  
  4. def soundex(word):
  5.     codes = ("bfpv", "cgjkqsxz", "dt", "l", "mn", "r")
  6.     sound_dict = {ch: k+1 for k, chunk in enumerate(codes) for ch in chunk}
  7.     # Convert the word to the lower case
  8.     word = word.lower()
  9.  
  10.     # Check the letters in the original word and retain only first letter
  11.     # if multiple letters with the same code have been found
  12.     preprocessed_word = ''
  13.     for x in word:
  14.         if len(preprocessed_word) == 0 or x not in sound_dict:
  15.             preprocessed_word += x
  16.         elif len(preprocessed_word) > 0 and \
  17.                 sound_dict.get(preprocessed_word[-1]) != sound_dict.get(x):
  18.             preprocessed_word += x
  19.     word = preprocessed_word
  20.  
  21.     # Get codes
  22.     codes_and_underscores = ''.join(
  23.         str(sound_dict.get(x, '_'))
  24.         for x in word[1:] if x not in 'hw')
  25.  
  26.     # Remove duplicates and underscores
  27.     accum = ''
  28.     for x in codes_and_underscores:
  29.         if len(accum) == 0 or len(accum) > 0 and accum[-1] != x:
  30.             accum += x
  31.     return word[0].upper() + \
  32.         ''.join(x for x in accum if x != '_')[:3].ljust(3, '0')
  33.  
  34.  
  35. def lower_and_remove_punctuation(text):
  36.     import string
  37.     text = text.lower()
  38.     for x in string.punctuation:
  39.         text = text.replace(x, ' ')
  40.     return text
  41.  
  42.  
  43. def get_words_from_text(text):
  44.     text = lower_and_remove_punctuation(text)
  45.     words = set(text.split())
  46.     return words
  47.  
  48.  
  49. def save_words_to_file(words, filename='words.txt'):
  50.     with open(filename, 'w') as f:
  51.         f.write('\n'.join(words))
  52.  
  53.  
  54. def load_words_from_file(filename='words.txt'):
  55.     with open(filename, 'r') as f:
  56.         result = [x.strip() for x in f.readlines()]
  57.     return result
  58.  
  59.  
  60. def load_lines_from_file(filename='input.txt'):
  61.     with open(filename, 'r') as f:
  62.         result = [x.strip() for x in f.readlines()]
  63.     return result
  64.  
  65.  
  66. def check_text(lines, words):
  67.     codes = {}
  68.     for w in words:
  69.         code = soundex(w)
  70.         if code not in codes:
  71.             codes[code] = [w]
  72.         else:
  73.             codes[code].append(w)
  74.  
  75.     for i, line in enumerate(lines):
  76.         t = lower_and_remove_punctuation(line)
  77.         ws = t.split()
  78.         for w in ws:
  79.             if w not in words:
  80.                 code = soundex(w)
  81.                 suggestions = codes.get(code, [])
  82.                 print(
  83.                     'Found unknown word "%s" in line %d. '
  84.                     'Suggestions: %s' % (
  85.                         w,
  86.                         i+1,
  87.                         ', '.join(suggestions) or 'NONE'))
  88.  
  89.  
  90. def test_soundex():
  91.     tests = (
  92.         ('Robert', 'R163'),
  93.         ('Rupert', 'R163'),
  94.         ('Rubin', 'R150'),
  95.         ('Ashcraft', 'A261'),
  96.         ('Ashcroft', 'A261'),
  97.         ('Tymczak', 'T522'),
  98.         ('Pfister', 'P236'),
  99.         ('Honeyman', 'H555'),
  100.         ('Burroughs', 'B620'),
  101.         ('Burrows', 'B620'),
  102.         ('Ciondecks', 'C532'),
  103.         ('Ellery', 'E460'),
  104.         ('Euler', 'E460'),
  105.         ('Example', 'E251'),
  106.         ('Gauss', 'G200'),
  107.         ('Ghosh', 'G200'),
  108.         ('Heilbronn', 'H416'),
  109.         ('Hilbert', 'H416'),
  110.         ('Kant', 'K530'),
  111.         ('Knuth', 'K530'),
  112.         ('Ladd', 'L300'),
  113.         ('Lissajous', 'L222'),
  114.         ('Lloyd', 'L300'),
  115.         ('Lukasiewicz', 'L222'),
  116.         ('O\'Hara', 'O600'),
  117.         ('Soundex', 'S532'),
  118.         ('Wheaton', 'W350'),
  119.     )
  120.  
  121.     for w, c in tests:
  122.         r = soundex(w)
  123.         assert r == c, \
  124.             'soundex("%s") returns "%s", should be "%s"' % (w, r, c)
  125.  
  126.  
  127. if __name__ == '__main__':
  128.     # Check the text in file
  129.     lines = load_lines_from_file()
  130.     words = load_words_from_file()
  131.     check_text(lines, words)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement