Guest User

Untitled

a guest
Nov 24th, 2017
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.74 KB | None | 0 0
  # Record-linkage and data-fusion walkthrough on the FEBRL4 example datasets.
#
# Steps: load the two FEBRL4 frames, add synthetic columns to fuse on, block
# and compare candidate pairs, classify matches with a simple score threshold,
# then schedule and run conflict-resolution (fusion) jobs for the matches.
from datetime import datetime
from random import randrange

import recordlinkage as rl
import recordlinkage.algorithms.conflict_resolution as cr
from recordlinkage.datasets import load_febrl4

# Number of records sampled from each dataset before indexing.
SAMPLE_SIZE = 200

# A candidate pair is classified as a match when the sum of its six
# comparison features is strictly greater than this value.
MATCH_THRESHOLD = 3


def _random_update_date():
    """Return a random datetime with year 2000-2016, month 1-12, day 1-28.

    ``randrange`` excludes its stop value, so the month and day stops are one
    past the largest value wanted.  Days stop at 28 so that every month,
    including February, yields a valid date.
    """
    return datetime(randrange(2000, 2017), randrange(1, 13), randrange(1, 29))


def _add_example_columns(df):
    """Add the synthetic columns used by the fusion step to ``df`` in place.

    Converts ``date_of_birth`` to float and adds randomly generated
    ``dates_updated``, ``salary``, ``min`` and ``max`` columns.
    """
    n_rows = len(df)
    df['date_of_birth'] = df['date_of_birth'].astype(float)
    df['dates_updated'] = [_random_update_date() for _ in range(n_rows)]
    df['salary'] = [randrange(40000, 120000) for _ in range(n_rows)]
    df['min'] = [randrange(10, 20) for _ in range(n_rows)]
    df['max'] = [randrange(20, 30) for _ in range(n_rows)]


dfA, dfB = load_febrl4()

# Adapt dataset for example
for _frame in (dfA, dfB):
    _add_example_columns(_frame)

# Sample data subsets
dfA = dfA.sample(SAMPLE_SIZE)
dfB = dfB.sample(SAMPLE_SIZE)

# Indexation step
indexer = rl.BlockIndex(on='given_name')
pairs = indexer.index(dfA, dfB)

# Comparison step
compare_cl = rl.Compare(pairs=pairs, df_a=dfA, df_b=dfB)

compare_cl.exact('given_name', 'given_name')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85)
compare_cl.exact('date_of_birth', 'date_of_birth')
compare_cl.exact('suburb', 'suburb')
compare_cl.exact('state', 'state')
compare_cl.string('address_1', 'address_1', threshold=0.85)

features = compare_cl.vectors

# Classification step
matches = features.sum(axis=1) > MATCH_THRESHOLD

# Fusion step
fuse = rl.FuseLinks()

# Prefer values in dataframe a
fuse.trust_your_friends('given_name', 'given_name', trusted='a', name='given_name')

# Choose values from the row that was updated most recently
fuse.keep_up_to_date('surname', 'surname', 'dates_updated', 'dates_updated', name='surname')

# Take the average of salary values
fuse.meet_in_the_middle('salary', 'salary', metric='mean', name='salary')

# Choose randomly between street numbers
fuse.roll_the_dice('street_number', 'street_number', name='street_number')

# Keep all social security id values for future processing.
fuse.pass_it_on('soc_sec_id', 'soc_sec_id', name='soc_sec_id')

# Handle data conflicts between multiple columns in each data frame
fuse.meet_in_the_middle(['min', 'max'], ['min', 'max'], metric='stdev', name='spread')

# Create custom conflict handling strategies with the resolve method
fuse.resolve(
    cr.choose_longest,
    ['address_1', 'address_2'],
    ['address_1', 'address_2'],
    tie_break=cr.choose_random,
    name='longest_address',
)

# Execute the scheduled conflict resolution jobs for the given
# candidate links, data, and classifications.
fused = fuse.fuse(pairs, dfA, dfB, matches)
Add Comment
Please, Sign In to add comment