Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Example script: link two FEBRL sample data frames with recordlinkage and
# then fuse the matched records into a single data set, resolving conflicting
# values with a different strategy per column.
#
# NOTE(review): `rl.BlockIndex`, `rl.Compare(pairs=..., df_a=..., df_b=...)`
# with `.vectors`, and `rl.FuseLinks` are used exactly as in the original
# script; confirm they exist in the installed recordlinkage version.
from datetime import datetime
from random import randrange

import recordlinkage as rl
import recordlinkage.algorithms.conflict_resolution as cr
from recordlinkage.datasets import load_febrl4

dfA, dfB = load_febrl4()

# Adapt dataset for example: both frames get the same synthetic columns, so
# build them in one loop instead of repeating every statement per frame.
for df in (dfA, dfB):
    n_rows = len(df)
    df['date_of_birth'] = df['date_of_birth'].apply(float)
    # randrange() excludes its upper bound: 13 makes December reachable and
    # 29 makes the 28th reachable, while staying valid for every month.
    df['dates_updated'] = [
        datetime(randrange(2000, 2017), randrange(1, 13), randrange(1, 29))
        for _ in range(n_rows)
    ]
    df['salary'] = [randrange(40000, 120000) for _ in range(n_rows)]
    df['min'] = [randrange(10, 20) for _ in range(n_rows)]
    df['max'] = [randrange(20, 30) for _ in range(n_rows)]

# Sample data subsets
dfA = dfA.sample(200)
dfB = dfB.sample(200)

# Indexation step: candidate pairs are records sharing the same given_name.
indexer = rl.BlockIndex(on='given_name')
pairs = indexer.index(dfA, dfB)

# Comparison step: one feature column per compared attribute.
compare_cl = rl.Compare(pairs=pairs, df_a=dfA, df_b=dfB)
compare_cl.exact('given_name', 'given_name')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85)
compare_cl.exact('date_of_birth', 'date_of_birth')
compare_cl.exact('suburb', 'suburb')
compare_cl.exact('state', 'state')
compare_cl.string('address_1', 'address_1', threshold=0.85)
features = compare_cl.vectors

# Classification step: a pair is a match when its feature values sum to
# more than 3 across the six comparisons above.
matches = features.sum(axis=1) > 3

# Fusion step
fuse = rl.FuseLinks()

# Prefer values in dataframe a
fuse.trust_your_friends('given_name', 'given_name', trusted='a', name='given_name')

# Choose values from the row that was updated most recently
fuse.keep_up_to_date('surname', 'surname', 'dates_updated', 'dates_updated', name='surname')

# Take the average of salary values
fuse.meet_in_the_middle('salary', 'salary', metric='mean', name='salary')

# Choose randomly between street numbers
fuse.roll_the_dice('street_number', 'street_number', name='street_number')

# Keep all social security id values for future processing.
fuse.pass_it_on('soc_sec_id', 'soc_sec_id', name='soc_sec_id')

# Handle data conflicts between multiple columns in each data frame
fuse.meet_in_the_middle(['min', 'max'], ['min', 'max'], metric='stdev', name='spread')

# Create custom conflict handling strategies with the resolve method
fuse.resolve(
    cr.choose_longest,
    ['address_1', 'address_2'],
    ['address_1', 'address_2'],
    tie_break=cr.choose_random,
    name='longest_address'
)

# Execute the scheduled conflict resolution jobs for the given
# candidate links, data, and classifications.
fused = fuse.fuse(pairs, dfA, dfB, matches)
Add Comment
Please, Sign In to add comment