Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import sys
- import numpy as np
- import reflist as rf
- import gc
- # Set usual parameters here
- np.set_printoptions(threshold=sys.maxsize)
- pd.set_option('display.max_rows', 500)
- pd.set_option('display.max_columns', 500)
- pd.set_option('display.width', 1000)
- # Import options can be swapped out for read_fwf
- df1 = pd.read_csv('G:/file1.txt', sep='|', encoding='latin1', low_memory=False,
- header = None,
- names = rf.header,
- dtype = 'str')
- df2 = pd.read_csv('G:/file2.txt', sep='|', encoding='latin1', low_memory=False,
- #names = ['f1','f2','f3'],
- names = rf.acct_header,
- header = None,
- dtype = 'str')
- df1 = df1[['accountno','name','country','city']]
- df2 = df2[['accountno','name','country','city']]
- df1.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
- df2.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
- print(df1.head(1))
- common = df1.merge(df2,on=['accountno'])
- df1.sort_index(inplace=True)
- df2.sort_index(inplace=True)
- x, y = df1.align(df2)
- print("\nIndexed!!!!!!!!!!!!!\n")
- gc.collect()
- # By default, this compare will regard populated spaces and blanks/nulls as !=
- def compare_two_dfs(input_df_1, input_df_2):
- ne_stacked = (input_df_1 != input_df_2).stack()
- print(ne_stacked.head(50))
- changed = ne_stacked[ne_stacked]
- print(changed.head(50))
- changed.index.names = ['Index_Number', 'Column_Name']
- difference_locations = np.where(input_df_1 != input_df_2)
- changed_from = input_df_1.values[difference_locations]
- changed_to = input_df_2.values[difference_locations]
- df = pd.DataFrame({'From': changed_from, 'To': changed_to}, index=changed.index)
- df.reset_index(level=['Index_Number', 'Column_Name'])
- df = df.dropna(how='all')
- df = df.drop_duplicates()
- df.to_csv('G:/outfile.txt',sep='|',index = True)
- compare_two_dfs(x,y)
- '''
- Below are some notes regarding performance.
- %timeit difference_locations = np.where(y != x)
- 22.6 ms ± 838 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
- %timeit ne_stacked = (x != y).stack()
- 29.8 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
- ne_stacked = (x != y).stack()
- %timeit changed = ne_stacked[ne_stacked]
- 2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- %timeit changed = ne_stacked[ne_stacked]
- 2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- %timeit changed_from = x.values[difference_locations]
- 1.86 ms ± 84.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
- '''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement