Untitled

import pandas as pd
from tqdm import tqdm
from datetime import datetime
import os

def merge_and_trace_changes(df1, df2, merge_cols, check_cols, drop_cols, path=None, filename='historical_df.pickle', compression=None):
    """
    This function merges two dataframes, identifies corrected rows, removes duplicates, resets the indices,
    and adds a column indicating the date when data was modified.

    Parameters:
    df1 (DataFrame): The first dataframe.
    df2 (DataFrame): The second dataframe.
    merge_cols (list): The columns on which to perform the merge.
    check_cols (list): The columns to check for differences.
    drop_cols (list): The columns to consider when dropping duplicates.
    path (str, optional): The path to save the 'corrected_rows' dataframe. If not provided, the dataframe will be returned but not saved.
    filename (str, optional): The name of the file to save the 'corrected_rows' dataframe. The default value is 'historical_df.pickle'.
    compression (str, optional): The compression method to use for saving the 'corrected_rows' dataframe. If not provided, no compression will be used.

    Returns:
    df (DataFrame): The merged dataframe with duplicates removed and indices reset.
    corrected_rows (DataFrame): The rows from df1 that have been corrected in df2, with indices reset.
    """

    # Merge the two dataframes on the merge_cols
    merged_df = pd.merge(df1, df2, on=merge_cols, suffixes=("_old", "_new"))

    # Initialize a list to hold the index of rows that have changes
    corrected_indices = []

    # Iterate over rows to check for differences in check_cols
    for index, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
        for col in check_cols:
            if row[f"{col}_old"] != row[f"{col}_new"]:
                corrected_indices.append(index)
                break

    # Get the corrected rows
    corrected_rows = merged_df.loc[corrected_indices, merge_cols + [f"{col}_old" for col in check_cols] + [f"{col}_new" for col in check_cols]]

    # Add the modification date column
    corrected_rows['ModificationDate'] = datetime.now().strftime('%Y-%m-%d')

    # Combine the original dataframes
    df_combined = pd.concat([df1, df2])

    # Drop duplicates based on drop_cols, keeping the last occurrence
    df_combined.drop_duplicates(subset=drop_cols, keep="last", inplace=True)

    # Reset index
    df_combined.reset_index(drop=True, inplace=True)
    corrected_rows.reset_index(drop=True, inplace=True)

    # If a path is provided, save the 'corrected_rows' dataframe to the specified path
    if path is not None:
        corrected_rows.to_pickle(os.path.join(path, filename), compression=compression)

    return df_combined, corrected_rows

# Example usage
# df1 = pd.DataFrame(...)
# df2 = pd.DataFrame(...)
# merge_and_trace_changes(df1, df2, merge_cols=['id'], check_cols=['value'], drop_cols=['id'])