Guest User

Untitled

a guest
Jul 15th, 2018
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.10 KB | None | 0 0
import csv
import re

try:
    from collections.abc import Iterable
except ImportError:  # Python < 3.3 has no collections.abc
    from collections import Iterable
  5.  
  6. def flatten(items):
  7. # Why in gods name am I forced to write this, python?
  8. for x in items:
  9. if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
  10. for sub_x in flatten(x):
  11. yield sub_x
  12. else:
  13. yield x
  14.  
  15. # This error is raised if no suitable name can be found.
  16. class NameNotFoundException(Exception): pass
  17.  
  18. SPECIAL_EDITS = ["district", "county", "city", "township", "village"]
  19.  
  20. class Correctable():
  21. def __init__(self, names):
  22. """`names` is a dictionary mapping name to prior probability."""
  23. self.names = {name.lower(): [name, p] for name, p in names.items()}
  24.  
  25. def exact_match(self, name):
  26. "Returns name (w/ proper capitalization) if name is already canonical; None otherwise."
  27. if name.lower() in self.names:
  28. return self.names[name.lower()][0]
  29.  
  30. def correction(self, name):
  31. "Returns the most probable correction for name."
  32. return self.names[max(self._candidates(name), key=self._p)][0]
  33.  
  34. def _p(self, name):
  35. "Returns the prior probability of name"
  36. return self.names[name][1]
  37.  
  38. def _candidates(self, name, depth=0):
  39. "Generate possible corrections for name."
  40. c = self._known(self._edits_within(name, depth))
  41. if c:
  42. return c
  43. if depth >= 2:
  44. raise NameNotFoundException("Unable to find canonical name for: " + name)
  45. return self.candidates(name, depth+1)
  46.  
  47. def _known(self, names):
  48. "The subset of `names` that appear in the dictionary of names."
  49. return set(w for w in names if w in self.names)
  50.  
  51. def _edits_within(self, name, max_dist=2):
  52. "All edits that are max_dist edits away from `name`."
  53. return self._edits_recur([name], max_dist)
  54.  
  55. def _edits_recur(self, partials, stop):
  56. if stop == 0:
  57. return partials
  58. return self._edits_recur(flatten([self._edits(n) for n in partials]), stop - 1)
  59.  
  60. def _edits(self, name):
  61. "All edits that are one edit away from `name`."
  62. letters = 'abcdefghijklmnopqrstuvwxyz'
  63. splits = [(name[:i], name[i:]) for i in range(len(name) + 1)]
  64. deletes = [L + R[1:] for L, R in splits if R]
  65. transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
  66. replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
  67. inserts = [L + c + R for L, R in splits for c in letters]
  68. finals = [self._add_or_remove_final(name, s) for s in SPECIAL_EDITS]
  69. initials = [self._add_or_remove_final(name, s + " of") for s in SPECIAL_EDITS]
  70. return list(set(deletes + transposes + replaces + inserts + finals + initials))
  71.  
  72. def _add_or_remove_final(self, name, string):
  73. reg = r"\s*" + re.escape(string) + r"$"
  74. if re.search(reg, name):
  75. return re.sub(reg, "", name)
  76. return name + " " + string
  77.  
  78. def _add_or_remove_initial(self, name, string):
  79. reg = r"^" + re.escape(string) + r"\s*"
  80. if re.search(reg, name):
  81. return re.sub(reg, "", name)
  82. return string + " " + name
Add Comment
Please, Sign In to add comment