Guest User

Untitled

a guest
Nov 21st, 2017
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.70 KB | None | 0 0
  1. Option Explicit
  2.  
  3. Private Type PersonQuery
  4. firstName As String
  5. lastName As String
  6. fullName As String
  7. firstNames As Range
  8. lastNames As Range
  9. fullNames As Range
  10. namesSwapped As Boolean
  11. End Type
  12.  
  13. Private Type PersonResult
  14. found As Boolean
  15. row As Integer
  16. lookupMethod As String
  17. End Type
  18.  
  19. Const INTEGER_MIN As Integer = -32768
  20. Private NOT_FOUND As PersonResult
  21.  
  22. Const EDIT_DIST_NOMATCH_THRESHOLD As Double = 0.5
  23. Const LEN_RATIO_NOMATCH_THRESHOLD As Double = 0.25
  24. Const STR_LOC_NOMATCH_THRESHOLD As Double = 0.25
  25. Const UNORDERED_SIM_NOMATCH_THRESHOLD As Double = 0.3
  26.  
  27. Const ASC_A As Integer = 65
  28. Const NUM_LETTERS As Integer = 26
  29. Const NUM_LETTERS_AND_SPECIALS As Integer = 27
  30. Const NON_LETTER_VAL As Integer = 26
  31.  
  32. ' Searches for a person based on first name and last name
  33. ' Returns an array of (person's id, full name in database, method of matching name)
  34. ' If the person is not found, id and full name will be "N/A"
  35. ' The methods used are:
  36. ' - search for exact full name,
  37. ' - reverse first and last and search for that full name (sometimes first and last get confused)
  38. ' - filter by first name search on similarity for last name (and vice versa and also reversing first/last)
  39. ' - then search based on similarity for the full name and reversing first/last
  40.  
  41. Private Function FindPersonID(firstName As String, lastName As String, firstNameList As Range, _
  42. lastNameList As Range, fullNameList As Range, ids As Range)
  43.  
  44. Dim pr As PersonResult
  45. Dim pq As PersonQuery
  46. pq.firstName = firstName
  47. pq.lastName = lastName
  48. pq.fullName = firstName + " " + lastName
  49. Set pq.firstNames = firstNameList
  50. Set pq.lastNames = lastNameList
  51. Set pq.fullNames = fullNameList
  52. pq.namesSwapped = False
  53.  
  54. pr = LookupPersonInternal(pq)
  55.  
  56. Dim fullName As String
  57. Dim id As String
  58. If pr.found Then
  59. id = ids.Cells(pr.row, 1).Value
  60. fullName = fullNameList.Cells(pr.row, 1).Value
  61. Else
  62. id = "N/A"
  63. fullName = "N/A"
  64. End If
  65.  
  66. FindPersonID = Array(id, fullName, pr.lookupMethod)
  67. End Function
  68.  
  69. Private Function LookupPersonInternal(pq As PersonQuery) As PersonResult
  70. Dim pr As PersonResult
  71.  
  72. If Trim(pq.fullName) = "" Then
  73. GoTo NotFound
  74. End If
  75.  
  76. pr = LookupExact(pq)
  77. If pr.found Then
  78. pr.lookupMethod = "EXACT_FULL"
  79. LookupPersonInternal = pr
  80. Exit Function
  81. End If
  82.  
  83. pr = LookupExact(SwapFirstAndLast(pq))
  84. If pr.found Then
  85. pr.lookupMethod = "EXACT_FULL_REVERSED"
  86. LookupPersonInternal = pr
  87. Exit Function
  88. End If
  89.  
  90. pr = LookupLastExactFirstSimilar(pq)
  91. If pr.found Then
  92. pr.lookupMethod = "LAST_EXACT_FIRST_SIMILAR"
  93. LookupPersonInternal = pr
  94. Exit Function
  95. End If
  96.  
  97. pr = LookupLastExactFirstSimilar(SwapFirstAndLast(SwapFirstAndLastLists(pq)))
  98. If pr.found Then
  99. pr.lookupMethod = "FIRST_EXACT_LAST_SIMILAR"
  100. LookupPersonInternal = pr
  101. Exit Function
  102. End If
  103.  
  104. pr = LookupLastExactFirstSimilar(SwapFirstAndLast(pq))
  105. If pr.found Then
  106. pr.lookupMethod = "LAST_EXACT_REVERSED_FIRST_SIMILAR_REVERSED"
  107. LookupPersonInternal = pr
  108. Exit Function
  109. End If
  110.  
  111. pr = LookupLastExactFirstSimilar(SwapFirstAndLastLists(pq))
  112. If pr.found Then
  113. pr.lookupMethod = "FIRST_EXACT_REVERSED_LAST_SIMILAR_REVERSED"
  114. LookupPersonInternal = pr
  115. Exit Function
  116. End If
  117.  
  118. pr = LookupFullSimilar(pq)
  119. If pr.found Then
  120. pr.lookupMethod = "SIMILAR_FULL"
  121. LookupPersonInternal = pr
  122. Exit Function
  123. End If
  124.  
  125. pr = LookupFullSimilar(SwapFirstAndLast(pq))
  126. If pr.found Then
  127. pr.lookupMethod = "SIMILAR_FULL_REVERSED"
  128. LookupPersonInternal = pr
  129. Exit Function
  130. End If
  131.  
  132. NotFound:
  133. pr.lookupMethod = "NOT_FOUND"
  134. LookupPersonInternal = pr
  135. End Function
  136.  
  137. Private Function LookupExact(pq As PersonQuery) As PersonResult
  138. Dim foundRange As Range
  139. Dim pr As PersonResult
  140. Set foundRange = pq.fullNames.Find(pq.fullName, LookIn:=xlValues, LookAt:=xlWhole)
  141. pr.found = Not foundRange Is Nothing
  142.  
  143. If pr.found Then
  144. pr.row = foundRange.row
  145. End If
  146.  
  147. LookupExact = pr
  148. End Function
  149.  
  150. Private Function LookupLastExactFirstSimilar(pq As PersonQuery) As PersonResult
  151. Dim pr As PersonResult
  152. pr.found = False
  153.  
  154. Dim lastNameRows As Collection
  155. Set lastNameRows = FindRowsForKey(pq.lastName, pq.lastNames)
  156. If lastNameRows.Count > 0 Then
  157. Dim firstNamesForLast As Collection
  158. Set firstNamesForLast = LookupValuesForRows(lastNameRows, pq.firstNames)
  159.  
  160. Dim mostSimilar As Variant
  161. mostSimilar = MostSimilarIndexInCollection(pq.firstName, firstNamesForLast)
  162.  
  163. If CDbl(mostSimilar(1)) / CDbl(Len(pq.firstName)) > EDIT_DIST_NOMATCH_THRESHOLD Then
  164. pr.found = True
  165. pr.row = lastNameRows(mostSimilar(0))
  166. End If
  167. End If
  168.  
  169. LookupLastExactFirstSimilar = pr
  170. End Function
  171.  
  172. Private Function LookupFullSimilar(pq As PersonQuery) As PersonResult
  173. Dim pr As PersonResult
  174.  
  175. Dim rowAndSimilarity As Variant
  176. rowAndSimilarity = SimilarMatch(pq.fullName, pq.fullNames)
  177.  
  178. pr.row = rowAndSimilarity(0)
  179. pr.found = (rowAndSimilarity(1) / CDbl(Len(pq.fullName))) > EDIT_DIST_NOMATCH_THRESHOLD
  180.  
  181. LookupFullSimilar = pr
  182. End Function
  183.  
  184. Private Function SwapFirstAndLast(pq As PersonQuery) As PersonQuery
  185. Dim pqSwapped As PersonQuery
  186. pqSwapped = pq
  187. pqSwapped.firstName = pq.lastName
  188. pqSwapped.lastName = pq.firstName
  189. pqSwapped.fullName = pqSwapped.firstName + " " + pqSwapped.lastName
  190. pqSwapped.namesSwapped = Not pq.namesSwapped
  191. SwapFirstAndLast = pqSwapped
  192. End Function
  193.  
  194. Private Function SwapFirstAndLastLists(pq As PersonQuery) As PersonQuery
  195. Dim pqListsSwapped As PersonQuery
  196. pqListsSwapped = pq
  197. Set pqListsSwapped.firstNames = pq.lastNames
  198. Set pqListsSwapped.lastNames = pq.firstNames
  199. pqListsSwapped.namesSwapped = Not pq.namesSwapped
  200. SwapFirstAndLastLists = pqListsSwapped
  201. End Function
  202.  
  203. Function MostSimilarIndexInCollection(str As String, col As Collection)
  204. Dim bestSimilarity As Variant
  205. Dim mostSimilarI As Integer
  206. Dim curSimilarity As Integer
  207. bestSimilarity = INTEGER_MIN
  208. mostSimilarI = -1
  209.  
  210. Dim i As Integer
  211. For i = 1 To col.Count
  212. curSimilarity = LevenshteinSimilarity(str, col.Item(i))
  213. If curSimilarity > bestSimilarity Then
  214. mostSimilarI = i
  215. bestSimilarity = curSimilarity
  216. End If
  217. Next
  218.  
  219. MostSimilarIndexInCollection = Array(mostSimilarI, bestSimilarity)
  220. End Function
  221.  
  222. Private Function LookupValuesForRows(rows As Collection, rng As Range) As Collection
  223. Dim i As Integer
  224. Dim vals As New Collection
  225. For i = 1 To rows.Count
  226. vals.Add (rng.Cells(rows(i), 1).Value)
  227. Next
  228. Set LookupValuesForRows = vals
  229. End Function
  230.  
  231. Private Function FindRowsForKey(key As Variant, keys As Range) As Collection
  232. Dim rows As New Collection
  233.  
  234. Dim foundRange As Range
  235. Set foundRange = keys.Find(key, LookIn:=xlValues, LookAt:=xlWhole)
  236.  
  237. Dim firstAddress As Variant
  238. If Not foundRange Is Nothing Then
  239. firstAddress = foundRange.Address
  240. Do
  241. rows.Add (foundRange.row)
  242. Set foundRange = keys.Find(key, foundRange, LookIn:=xlValues, LookAt:=xlWhole)
  243. Loop While Not foundRange Is Nothing And foundRange.Address <> firstAddress
  244. End If
  245.  
  246. Set FindRowsForKey = rows
  247. End Function
  248.  
  249. Function SimilarMatch(search As Variant, list As Range) As Variant
  250. Dim maxSim As Integer
  251. maxSim = 0
  252.  
  253. Dim i As Integer
  254. Dim maxSimI As Integer
  255. maxSimI = -1
  256.  
  257. Dim lastRow As Integer
  258. lastRow = list.Cells.End(xlDown).row
  259.  
  260. Dim firstRow As Integer
  261. firstRow = list.Cells.End(xlUp).row
  262.  
  263. Dim searchStr As String
  264.  
  265. If TypeOf search Is Range Then
  266. searchStr = search.Value
  267. Else
  268. searchStr = search
  269. End If
  270.  
  271. Dim valInList As String
  272. Dim sim As Integer
  273. For i = firstRow To lastRow
  274. valInList = list.Cells(i, 1).Value
  275.  
  276. 'For performance sake check the length, space location and unordered similarity first
  277. 'before running the edit distance algorithm
  278. If Abs(Len(valInList) - Len(searchStr)) / Len(searchStr) _
  279. < LEN_RATIO_NOMATCH_THRESHOLD Then
  280. If Abs(InStr(searchStr, " ") - InStr(valInList, " ")) / Len(searchStr) _
  281. < STR_LOC_NOMATCH_THRESHOLD Then
  282. If UnorderedSimilarity(searchStr, valInList) / Len(searchStr) _
  283. > UNORDERED_SIM_NOMATCH_THRESHOLD Then
  284. sim = LevenshteinSimilarity(searchStr, valInList)
  285. If sim > maxSim Then
  286. maxSim = sim
  287. maxSimI = i
  288. End If
  289. End If
  290. End If
  291. End If
  292. Next
  293.  
  294. SimilarMatch = Array(maxSimI, maxSim)
  295. End Function
  296.  
  297. Private Function LetterValue(letter As String) As Integer
  298. Dim ascVal As Integer
  299. ascVal = Asc(letter)
  300.  
  301. If ascVal < ASC_A Or ascVal > ASC_A + NUM_LETTERS Then
  302. LetterValue = NON_LETTER_VAL
  303. Else
  304. LetterValue = ascVal - ASC_A
  305. End If
  306. End Function
  307.  
  308. Private Function UnorderedSimilarity(ByVal str1 As String, ByVal str2 As String) As Integer
  309. Dim i, j, ascVal As Integer
  310. Dim ascA As Integer
  311. Dim ascZ As Integer
  312. Dim letterCountDiffs(27) As Integer
  313. Dim lettersDiff As Integer
  314. Dim letterVal As Integer
  315. Dim len1 As Integer
  316. Dim len2 As Integer
  317.  
  318. len1 = Len(str1)
  319. len2 = Len(str2)
  320. str1 = UCase(str1)
  321. str2 = UCase(str2)
  322.  
  323. i = 0
  324. While i < len1
  325. letterVal = LetterValue(CharAt(str1, i))
  326. letterCountDiffs(letterVal) = letterCountDiffs(letterVal) + 1
  327. i = i + 1
  328. Wend
  329.  
  330. i = 0
  331. While i < len2
  332. letterVal = LetterValue(CharAt(str2, i))
  333. letterCountDiffs(letterVal) = letterCountDiffs(letterVal) - 1
  334. i = i + 1
  335. Wend
  336.  
  337. lettersDiff = 0
  338. For i = 0 To UBound(letterCountDiffs)
  339. lettersDiff = lettersDiff + Abs(letterCountDiffs(i))
  340. Next
  341.  
  342. Dim minLen As Integer
  343. If len1 < len2 Then minLen = len1 Else minLen = len2
  344.  
  345. UnorderedSimilarity = minLen - lettersDiff
  346. End Function
  347.  
  348. '--------------------------------------------------------------------
  349. ' Calculates the edit distance between str1 and str2 using the
  350. ' Levenshtein distance dynamic programming algorithm
  351. ' This is really "edit similarity" as more similar strings have a
  352. ' larger score.
  353. Private Function LevenshteinSimilarity(str1 As String, str2 As String)
  354. Dim len1, len2, i, j, score, charSim, gap1, gap2, matchVal As Integer
  355.  
  356. len1 = Len(str1)
  357. len2 = Len(str2)
  358.  
  359. Dim gap_score As Integer
  360. gap_score = -1
  361.  
  362. Dim D As Variant
  363. ReDim D(0 To (len1 + 1), 0 To (len2 + 1)) As Integer
  364. D(0, 0) = 0
  365.  
  366. For i = 0 To len1
  367. D(i, 0) = gap_score * i
  368. Next
  369.  
  370. For j = 0 To len2
  371. D(0, j) = gap_score * j
  372. Next
  373.  
  374. For i = 1 To len1
  375. For j = 1 To len2
  376. matchVal = D(i - 1, j - 1) + CharSimilarity(CharAt(str1, i - 1), CharAt(str2, j - 1))
  377. gap2 = D(i, j - 1) + gap_score
  378. gap1 = D(i - 1, j) + gap_score
  379. D(i, j) = Application.WorksheetFunction.Max(matchVal, gap2, gap1)
  380. Next
  381. Next
  382.  
  383. 'Dim alignment As String
  384. 'alignment = ""
  385.  
  386. i = len1
  387. j = len2
  388. score = 0
  389.  
  390. Dim align As String
  391.  
  392. While i > 0 And j > 0
  393. charSim = CharSimilarity(CharAt(str1, i - 1), CharAt(str2, j - 1))
  394. If D(i, j) - charSim = D(i - 1, j - 1) Then
  395. If charSim > 0 Then
  396. 'align = "M"
  397. Else
  398. 'align = "C"
  399. End If
  400.  
  401. i = i - 1
  402. j = j - 1
  403.  
  404. score = score + charSim
  405. ElseIf D(i, j) - gap_score = D(i, j - 1) Then
  406. 'align = "A"
  407. j = j - 1
  408. ElseIf D(i, j) - gap_score = D(i - 1, j) Then
  409. 'align = "D"
  410. i = i - 1
  411. score = score + gap_score
  412. Else
  413. MsgBox "Unexpected score in backtracking"
  414. End If
  415. 'alignment = align + alignment
  416. Wend
  417.  
  418. While j > 0
  419. 'alignment = "A" + alignment
  420. j = j - 1
  421. score = score + gap_score
  422. Wend
  423.  
  424. While i > 0
  425. 'alignment = "D" + alignment
  426. i = i - 1
  427. score = score + gap_score
  428. Wend
  429.  
  430. LevenshteinSimilarity = score
  431. End Function
  432.  
  433. Function CharAt(str, zeroStartingIndex)
  434. CharAt = Mid(str, zeroStartingIndex + 1, 1)
  435. End Function
  436.  
  437. Function CharSimilarity(chr1 As Variant, chr2 As Variant)
  438. If UCase(chr1) = UCase(chr2) Then
  439. CharSimilarity = 1
  440. Else
  441. CharSimilarity = -1
  442. End If
  443. End Function
Add Comment
Please, Sign In to add comment