Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from tqdm import tqdm
- from datetime import datetime
- import os
- def merge_and_trace_changes(df1, df2, merge_cols, check_cols, drop_cols, path=None, filename='historical_df.pickle', compression=None):
- """
- This function merges two dataframes, identifies corrected rows, removes duplicates, resets the indices,
- and adds a column indicating the date when data was modified.
- Parameters:
- df1 (DataFrame): The first dataframe.
- df2 (DataFrame): The second dataframe.
- merge_cols (list): The columns on which to perform the merge.
- check_cols (list): The columns to check for differences.
- drop_cols (list): The columns to consider when dropping duplicates.
- path (str, optional): The path to save the 'corrected_rows' dataframe. If not provided, the dataframe will be returned but not saved.
- filename (str, optional): The name of the file to save the 'corrected_rows' dataframe. The default value is 'historical_df.pickle'.
- compression (str, optional): The compression method to use for saving the 'corrected_rows' dataframe. If not provided, no compression will be used.
- Returns:
- df (DataFrame): The merged dataframe with duplicates removed and indices reset.
- corrected_rows (DataFrame): The rows from df1 that have been corrected in df2, with indices reset.
- """
- # Merge the two dataframes on the merge_cols
- merged_df = pd.merge(df1, df2, on=merge_cols, suffixes=("_old", "_new"))
- # Initialize a list to hold the index of rows that have changes
- corrected_indices = []
- # Iterate over rows to check for differences in check_cols
- for index, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
- for col in check_cols:
- if row[f"{col}_old"] != row[f"{col}_new"]:
- corrected_indices.append(index)
- break
- # Get the corrected rows
- corrected_rows = merged_df.loc[corrected_indices, merge_cols + [f"{col}_old" for col in check_cols] + [f"{col}_new" for col in check_cols]]
- # Add the modification date column
- corrected_rows['ModificationDate'] = datetime.now().strftime('%Y-%m-%d')
- # Combine the original dataframes
- df_combined = pd.concat([df1, df2])
- # Drop duplicates based on drop_cols, keeping the last occurrence
- df_combined.drop_duplicates(subset=drop_cols, keep="last", inplace=True)
- # Reset index
- df_combined.reset_index(drop=True, inplace=True)
- corrected_rows.reset_index(drop=True, inplace=True)
- # If a path is provided, save the 'corrected_rows' dataframe to the specified path
- if path is not None:
- corrected_rows.to_pickle(os.path.join(path, filename), compression=compression)
- return df_combined, corrected_rows
- # Example usage
- # df1 = pd.DataFrame(...)
- # df2 = pd.DataFrame(...)
- # merge_and_trace_changes(df1, df2, merge_cols=['id'], check_cols=['value'], drop_cols=['id'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement