Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import json
- from datetime import datetime
def convert_to_unix_timestamp(date_str):
    """Parse a "DD/MM/YYYY HH:MM:SS" string and return milliseconds since
    the Unix epoch as an int.

    NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
    it in the machine's local timezone — confirm that matches the timezone
    the CSV timestamps were recorded in.
    """
    parsed = datetime.strptime(date_str, "%d/%m/%Y %H:%M:%S")
    epoch_seconds = parsed.timestamp()
    return int(epoch_seconds * 1000)
def csv_to_json(csv_file, past_data_file, output_file):
    """Merge review rows from *csv_file* with *past_data_file* and write the
    deduplicated result to *output_file* as JSON.

    Each CSV row becomes ``[timestamp_ms, subject_id, starting_srs_stage,
    incorrect_meaning_answers, incorrect_reading_answers]``.  Two rows are
    duplicates when they share the same (timestamp, subject_id) pair; the
    first occurrence wins, and past data is placed before the new rows so
    existing entries are the ones kept.

    Parameters
    ----------
    csv_file : path to the reviews CSV (must contain the columns used below)
    past_data_file : path to a JSON file holding a list of previous rows
    output_file : path the merged JSON list is written to

    Returns
    -------
    int
        The number of duplicate rows dropped during the merge.
    """
    df = pd.read_csv(csv_file)

    result = []
    for _, row in df.iterrows():
        # BUGFIX: pd.read_csv yields NumPy scalars (e.g. numpy.int64), which
        # json.dumps rejects with TypeError — cast to native Python int.
        formatted_row = [
            convert_to_unix_timestamp(row["created_at"]),
            int(row["subject_id"]),
            int(row["starting_srs_stage"]),
            int(row["incorrect_meaning_answers"]),
            int(row["incorrect_reading_answers"]),
        ]
        result.append(formatted_row)

    # Load previously accumulated rows; appending the fresh rows after them
    # means the past entry survives any (timestamp, subject_id) collision.
    with open(past_data_file, 'r') as file:
        past_data = json.load(file)
    combined_data = past_data + result

    # Deduplicate on (timestamp, subject_id). Dicts preserve insertion order,
    # so output ordering matches first occurrence in combined_data.
    unique_data = {}
    duplicates_count = 0
    for item in combined_data:
        key = (item[0], item[1])
        if key not in unique_data:
            unique_data[key] = item
        else:
            duplicates_count += 1

    # Serialize straight to the file; indent=None keeps it compact.
    with open(output_file, 'w') as output:
        json.dump(list(unique_data.values()), output, indent=None)

    return duplicates_count
if __name__ == "__main__":
    # Paths are hard-coded; edit these to point at your own reviews CSV,
    # accumulated past-data JSON, and desired output location.
    removed = csv_to_json('reviews.csv', 'past_data.json', 'output.txt')
    print(f"Duplicates removed: {removed}")
Advertisement
Add Comment
Please, Sign In to add comment