Advertisement
Guest User

Untitled

a guest
Mar 18th, 2019
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.56 KB | None | 0 0
  1. import pandas as pd
  2. import sys
  3. import numpy as np
  4. import reflist as rf
  5. import gc
  6. # Set usual parameters here
  7. np.set_printoptions(threshold=sys.maxsize)
  8. pd.set_option('display.max_rows', 500)
  9. pd.set_option('display.max_columns', 500)
  10. pd.set_option('display.width', 1000)
  11.  
  12. # Import options can be swapped out for read_fwf
  13. df1 = pd.read_csv('G:/file1.txt', sep='|', encoding='latin1', low_memory=False,
  14. header = None,
  15. names = rf.header,
  16. dtype = 'str')
  17. df2 = pd.read_csv('G:/file2.txt', sep='|', encoding='latin1', low_memory=False,
  18. #names = ['f1','f2','f3'],
  19. names = rf.acct_header,
  20. header = None,
  21. dtype = 'str')
  22.  
  23. df1 = df1[['accountno','name','country','city']]
  24. df2 = df2[['accountno','name','country','city']]
  25.  
  26. df1.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
  27. df2.set_index('accountno',inplace=True,verify_integrity=True,drop=False)
  28. print(df1.head(1))
  29. common = df1.merge(df2,on=['accountno'])
  30.  
  31.  
  32. df1.sort_index(inplace=True)
  33. df2.sort_index(inplace=True)
  34.  
  35. x, y = df1.align(df2)
  36.  
  37. print("\nIndexed!!!!!!!!!!!!!\n")
  38. gc.collect()
  39. # By default, this compare will regard populated spaces and blanks/nulls as !=
  40. def compare_two_dfs(input_df_1, input_df_2):
  41.  
  42. ne_stacked = (input_df_1 != input_df_2).stack()
  43. print(ne_stacked.head(50))
  44. changed = ne_stacked[ne_stacked]
  45. print(changed.head(50))
  46. changed.index.names = ['Index_Number', 'Column_Name']
  47. difference_locations = np.where(input_df_1 != input_df_2)
  48. changed_from = input_df_1.values[difference_locations]
  49.  
  50. changed_to = input_df_2.values[difference_locations]
  51. df = pd.DataFrame({'From': changed_from, 'To': changed_to}, index=changed.index)
  52. df.reset_index(level=['Index_Number', 'Column_Name'])
  53. df = df.dropna(how='all')
  54.  
  55. df = df.drop_duplicates()
  56. df.to_csv('G:/outfile.txt',sep='|',index = True)
  57. compare_two_dfs(x,y)
  58.  
  59. '''
  60. Below are some notes regarding performance.
  61. %timeit difference_locations = np.where(y != x)
  62. 22.6 ms ± 838 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
  63. %timeit ne_stacked = (x != y).stack()
  64. 29.8 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
  65. ne_stacked = (x != y).stack()
  66. %timeit changed = ne_stacked[ne_stacked]
  67. 2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  68. %timeit changed = ne_stacked[ne_stacked]
  69. 2.3 s ± 55.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  70. %timeit changed_from = x.values[difference_locations]
  71. 1.86 ms ± 84.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  72. '''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement