Untitled

import pandas as pd
import sys
import numpy as np
import reflist as rf
import gc
# Set usual parameters here
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Import options can be swapped out for read_fwf
df1 = pd.read_csv('G:/file1.txt', sep='|', encoding='latin1', low_memory=False,
                 header = None,
                 names = rf.header,
                 dtype = 'str')
df2 = pd.read_csv('G:/file2.txt', sep='|', encoding='latin1', low_memory=False,
                 #names = ['f1','f2','f3'],
                 names = rf.acct_header,
                 header = None,
                 dtype = 'str')

df1 = df1[['accountno','name','country','city']]
df2 = df2[['accountno','name','country','city']]

df1.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
df2.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
print(df1.head(1))
common = df1.merge(df2,on=['accountno'])


df1.sort_index(inplace=True)
df2.sort_index(inplace=True)

x, y = df1.align(df2)

print("\nIndexed!!!!!!!!!!!!!\n")
gc.collect()
# By default, this compare will regard populated spaces and blanks/nulls as !=
def compare_two_dfs(input_df_1, input_df_2):

    ne_stacked = (input_df_1 != input_df_2).stack()
    print(ne_stacked.head(50))
    changed = ne_stacked[ne_stacked]
    print(changed.head(50))
    changed.index.names = ['Index_Number', 'Column_Name']
    difference_locations = np.where(input_df_1 != input_df_2)
    changed_from = input_df_1.values[difference_locations]

    changed_to = input_df_2.values[difference_locations]
    df = pd.DataFrame({'From': changed_from, 'To': changed_to}, index=changed.index)
    df.reset_index(level=['Index_Number', 'Column_Name'])
    df = df.dropna(how='all')

    df = df.drop_duplicates()
    df.to_csv('G:/outfile.txt',sep='|',index = True)
compare_two_dfs(x,y)

'''
Below are some notes regarding performance.
%timeit difference_locations = np.where(y != x)
22.6 ms ± 838 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit ne_stacked = (x != y).stack()
29.8 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
ne_stacked = (x != y).stack()
%timeit changed = ne_stacked[ne_stacked]
2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 %timeit changed = ne_stacked[ne_stacked]
2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit changed_from = x.values[difference_locations]
1.86 ms ± 84.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
'''