Advertisement
Guest User

Untitled

a guest
Jul 18th, 2024
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.90 KB | Source Code | 0 0
  1. import pandas as pd
  2. from tqdm import tqdm
  3. from datetime import datetime
  4. import os
  5.  
  6. def merge_and_trace_changes(df1, df2, merge_cols, check_cols, drop_cols, path=None, filename='historical_df.pickle', compression=None):
  7.     """
  8.    This function merges two dataframes, identifies corrected rows, removes duplicates, resets the indices,
  9.    and adds a column indicating the date when data was modified.
  10.  
  11.    Parameters:
  12.    df1 (DataFrame): The first dataframe.
  13.    df2 (DataFrame): The second dataframe.
  14.    merge_cols (list): The columns on which to perform the merge.
  15.    check_cols (list): The columns to check for differences.
  16.    drop_cols (list): The columns to consider when dropping duplicates.
  17.    path (str, optional): The path to save the 'corrected_rows' dataframe. If not provided, the dataframe will be returned but not saved.
  18.    filename (str, optional): The name of the file to save the 'corrected_rows' dataframe. The default value is 'historical_df.pickle'.
  19.    compression (str, optional): The compression method to use for saving the 'corrected_rows' dataframe. If not provided, no compression will be used.
  20.  
  21.    Returns:
  22.    df (DataFrame): The merged dataframe with duplicates removed and indices reset.
  23.    corrected_rows (DataFrame): The rows from df1 that have been corrected in df2, with indices reset.
  24.    """
  25.  
  26.     # Merge the two dataframes on the merge_cols
  27.     merged_df = pd.merge(df1, df2, on=merge_cols, suffixes=("_old", "_new"))
  28.  
  29.     # Initialize a list to hold the index of rows that have changes
  30.     corrected_indices = []
  31.  
  32.     # Iterate over rows to check for differences in check_cols
  33.     for index, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
  34.         for col in check_cols:
  35.             if row[f"{col}_old"] != row[f"{col}_new"]:
  36.                 corrected_indices.append(index)
  37.                 break
  38.  
  39.     # Get the corrected rows
  40.     corrected_rows = merged_df.loc[corrected_indices, merge_cols + [f"{col}_old" for col in check_cols] + [f"{col}_new" for col in check_cols]]
  41.  
  42.     # Add the modification date column
  43.     corrected_rows['ModificationDate'] = datetime.now().strftime('%Y-%m-%d')
  44.  
  45.     # Combine the original dataframes
  46.     df_combined = pd.concat([df1, df2])
  47.  
  48.     # Drop duplicates based on drop_cols, keeping the last occurrence
  49.     df_combined.drop_duplicates(subset=drop_cols, keep="last", inplace=True)
  50.  
  51.     # Reset index
  52.     df_combined.reset_index(drop=True, inplace=True)
  53.     corrected_rows.reset_index(drop=True, inplace=True)
  54.  
  55.     # If a path is provided, save the 'corrected_rows' dataframe to the specified path
  56.     if path is not None:
  57.         corrected_rows.to_pickle(os.path.join(path, filename), compression=compression)
  58.  
  59.     return df_combined, corrected_rows
  60.  
  61. # Example usage
  62. # df1 = pd.DataFrame(...)
  63. # df2 = pd.DataFrame(...)
  64. # merge_and_trace_changes(df1, df2, merge_cols=['id'], check_cols=['value'], drop_cols=['id'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement