Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
# Consonant groups of American Soundex: a letter's digit is the 1-based
# position of its group.  Vowels, 'y', 'h', 'w' and non-letters have no digit.
_SOUNDEX_GROUPS = ("bfpv", "cgjkqsxz", "dt", "l", "mn", "r")
_SOUNDEX_DIGITS = {
    letter: str(digit)
    for digit, group in enumerate(_SOUNDEX_GROUPS, start=1)
    for letter in group
}


def soundex(word):
    """Return the American Soundex code of *word*.

    The code is the upper-cased first character followed by three digits
    (padded with '0'), e.g. ``soundex('Robert') == 'R163'``.

    Rules implemented:
      * Letters are compared case-insensitively.
      * Consecutive letters with the same digit are coded once.
      * 'h' and 'w' are transparent: same-digit letters separated only by
        them still count as consecutive.  This also applies to the first
        letter, whose own digit is never emitted.
      * Any other character without a digit (vowels, 'y', punctuation)
        separates runs, so the same digit may be emitted again after it.

    An empty *word* yields an empty string instead of raising IndexError.
    """
    word = word.lower()
    if not word:
        return ''

    digits = []
    # Seed with the first letter's digit so an immediately following letter
    # of the same group (e.g. the 'f' in "Pfister") is not coded again.
    previous = _SOUNDEX_DIGITS.get(word[0])
    for letter in word[1:]:
        if letter in 'hw':
            # Transparent: deliberately leave `previous` untouched.
            continue
        digit = _SOUNDEX_DIGITS.get(letter)
        if digit is not None and digit != previous:
            digits.append(digit)
        # A digit-less character resets `previous` to None, which lets the
        # same digit be emitted again after a vowel.
        previous = digit

    return word[0].upper() + ''.join(digits)[:3].ljust(3, '0')
def lower_and_remove_punctuation(text):
    """Lower-case *text* and turn every ASCII punctuation mark into a space.

    Punctuation is replaced rather than deleted, so the length of the text
    is preserved and words joined by punctuation (e.g. "a-b") come apart
    when the result is split on whitespace.
    """
    import string
    marks = string.punctuation
    # Map each punctuation character to a single space in one pass.
    to_space = str.maketrans(marks, ' ' * len(marks))
    return text.lower().translate(to_space)
def get_words_from_text(text):
    """Return the set of distinct lower-cased words occurring in *text*.

    Punctuation is treated as a word separator (see
    lower_and_remove_punctuation), then the text is split on whitespace.
    """
    normalized = lower_and_remove_punctuation(text)
    return set(normalized.split())
def save_words_to_file(words, filename='words.txt'):
    """Write *words* to *filename*, one word per line.

    The file is overwritten.  Words are separated by newlines and no
    trailing newline is written after the last one.
    """
    with open(filename, 'w') as out:
        payload = '\n'.join(words)
        out.write(payload)
def load_words_from_file(filename='words.txt'):
    """Load the dictionary words stored one per line in *filename*.

    Surrounding whitespace is stripped from every line.  Blank lines are
    skipped: an empty string is not a word, and passing one on would make
    Soundex indexing fail on its missing first letter.

    Returns a list of non-empty words in file order.
    """
    with open(filename, 'r') as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]
def load_lines_from_file(filename='input.txt'):
    """Read *filename* and return its lines with surrounding whitespace removed.

    Blank lines are kept (as empty strings), so the index of each entry
    still corresponds to its line position in the file.
    """
    with open(filename, 'r') as source:
        return list(map(str.strip, source))
def check_text(lines, words):
    """Report every word in *lines* that is not in the dictionary *words*.

    Each line is lower-cased and split on whitespace after punctuation has
    been replaced by spaces.  For every token that is not a known word, a
    message is printed with the 1-based line number and the dictionary
    words sharing the token's Soundex code (or 'NONE').

    lines -- iterable of text lines to check
    words -- iterable of known words; empty strings and repeated entries
             are ignored
    """
    # Build the membership set and the Soundex index in a single pass, so
    # each token lookup below is O(1) instead of a scan over the word list.
    known = set()
    words_by_code = {}
    for w in words:
        if not w or w in known:
            # Empty entries have no Soundex code; duplicates would only
            # repeat the same suggestion.
            continue
        known.add(w)
        words_by_code.setdefault(soundex(w), []).append(w)

    for line_no, line in enumerate(lines, start=1):
        for token in lower_and_remove_punctuation(line).split():
            if token in known:
                continue
            suggestions = words_by_code.get(soundex(token), [])
            print(
                'Found unknown word "%s" in line %d. '
                'Suggestions: %s' % (
                    token,
                    line_no,
                    ', '.join(suggestions) or 'NONE'))
def test_soundex():
    """Check soundex() against a table of names with known codes.

    Raises AssertionError naming the first name whose computed code does
    not match the expected one.
    """
    expected_codes = {
        'Robert': 'R163',
        'Rupert': 'R163',
        'Rubin': 'R150',
        'Ashcraft': 'A261',
        'Ashcroft': 'A261',
        'Tymczak': 'T522',
        'Pfister': 'P236',
        'Honeyman': 'H555',
        'Burroughs': 'B620',
        'Burrows': 'B620',
        'Ciondecks': 'C532',
        'Ellery': 'E460',
        'Euler': 'E460',
        'Example': 'E251',
        'Gauss': 'G200',
        'Ghosh': 'G200',
        'Heilbronn': 'H416',
        'Hilbert': 'H416',
        'Kant': 'K530',
        'Knuth': 'K530',
        'Ladd': 'L300',
        'Lissajous': 'L222',
        'Lloyd': 'L300',
        'Lukasiewicz': 'L222',
        "O'Hara": 'O600',
        'Soundex': 'S532',
        'Wheaton': 'W350',
    }
    for name, expected in expected_codes.items():
        actual = soundex(name)
        assert actual == expected, \
            'soundex("%s") returns "%s", should be "%s"' % (
                name, actual, expected)
if __name__ == '__main__':
    # Spell-check the default input file ('input.txt') against the default
    # word list ('words.txt'); the text is loaded first, then the words.
    check_text(load_lines_from_file(), load_words_from_file())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement