Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv
import re

try:
    from collections.abc import Iterable
except ImportError:  # interpreters that predate collections.abc
    from collections import Iterable
def flatten(items):
    """Lazily yield the leaves of an arbitrarily nested iterable, depth-first.

    Any element that is itself iterable is descended into recursively.
    ``str`` and ``bytes`` values are treated as atoms and yielded whole;
    without that exception a one-character string (which iterates to
    itself) would recurse forever.
    """
    for item in items:
        if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            yield from flatten(item)
        else:
            yield item
class NameNotFoundException(Exception):
    """Raised if no suitable name can be found."""
# Whole-word tokens that the edit generator may add to, or strip from, a name
# as a single edit (used both as a trailing word and as a leading "<word> of").
SPECIAL_EDITS = ["district", "county", "city", "township", "village"]
class Correctable:
    """Spelling corrector over a fixed dictionary of canonical names.

    Candidate corrections are generated by applying character-level edits
    (delete, transpose, replace, insert) plus adding/removing the
    ``SPECIAL_EDITS`` words, and the known candidate with the highest prior
    probability wins.
    """

    def __init__(self, names):
        """`names` is a dictionary mapping name to prior probability."""
        # Keyed by the lower-cased name; each value is
        # [canonically capitalised name, prior probability].
        self.names = {name.lower(): [name, p] for name, p in names.items()}

    def exact_match(self, name):
        """Return name (w/ proper capitalization) if it is already canonical; None otherwise."""
        entry = self.names.get(name.lower())
        return entry[0] if entry is not None else None

    def correction(self, name):
        """Return the most probable correction for name.

        Raises NameNotFoundException when no known name is close enough.
        """
        best = max(self._candidates(name), key=self._p)
        return self.names[best][0]

    def _p(self, name):
        """Return the prior probability of `name` (a lower-cased dictionary key)."""
        return self.names[name][1]

    def _candidates(self, name, depth=0):
        """Generate possible corrections for name.

        Tries increasing numbers of edit rounds, starting at `depth` and going
        up to 2 (or just `depth` itself when it is already above 2), and
        returns the set of known names from the first round that yields any.
        Raises NameNotFoundException if every round comes up empty.
        """
        # Dictionary keys and the edit alphabet are lower-case, so search on
        # the lower-cased query; the caller's spelling is kept for the message.
        key = name.lower()
        for dist in range(depth, max(depth, 2) + 1):
            found = self._known(self._edits_within(key, dist))
            if found:
                return found
        raise NameNotFoundException("Unable to find canonical name for: " + name)

    def _known(self, names):
        """Return the subset of `names` that appear in the dictionary of names."""
        return {w for w in names if w in self.names}

    def _edits_within(self, name, max_dist=2):
        """Return every string produced by `max_dist` rounds of single edits on `name`."""
        return self._edits_recur([name], max_dist)

    def _edits_recur(self, partials, stop):
        """Apply `_edits` to everything in `partials`, `stop` times over.

        With `stop` <= 0 the input iterable is returned unchanged; otherwise
        the result is a lazy generator, so the (potentially very large) set
        of edits is never held in memory all at once.
        """
        for _ in range(stop):
            # The outer iterable of a generator expression is bound when the
            # expression is created, so rebinding `partials` here is safe.
            partials = (edit for n in partials for edit in self._edits(n))
        return partials

    def _edits(self, name):
        """Return all strings that are one edit away from `name`, without duplicates."""
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(name[:i], name[i:]) for i in range(len(name) + 1)]
        deletes = [head + tail[1:] for head, tail in splits if tail]
        transposes = [
            head + tail[1] + tail[0] + tail[2:]
            for head, tail in splits
            if len(tail) > 1
        ]
        replaces = [head + c + tail[1:] for head, tail in splits if tail for c in letters]
        inserts = [head + c + tail for head, tail in splits for c in letters]
        # Trailing word, e.g. "cook" <-> "cook county".
        finals = [self._add_or_remove_final(name, s) for s in SPECIAL_EDITS]
        # Leading phrase, e.g. "boston" <-> "city of boston".
        initials = [self._add_or_remove_initial(name, s + " of") for s in SPECIAL_EDITS]
        return list(set(deletes + transposes + replaces + inserts + finals + initials))

    def _add_or_remove_final(self, name, string):
        """Strip `string` (and whitespace before it) from the end of `name`, or append it if absent."""
        reg = r"\s*" + re.escape(string) + r"$"
        if re.search(reg, name):
            return re.sub(reg, "", name)
        return name + " " + string

    def _add_or_remove_initial(self, name, string):
        """Strip `string` (and whitespace after it) from the start of `name`, or prepend it if absent."""
        reg = r"^" + re.escape(string) + r"\s*"
        if re.search(reg, name):
            return re.sub(reg, "", name)
        return string + " " + name
Add Comment
Please, Sign In to add comment