Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import json
- from datetime import datetime
def convert_to_unix_timestamp(date_str):
    """Parse a "DD/MM/YYYY HH:MM:SS" string and return milliseconds since
    the Unix epoch as an int.

    NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
    it in the machine's local timezone — confirm that matches the timezone
    the CSV timestamps were recorded in.
    """
    parsed = datetime.strptime(date_str, "%d/%m/%Y %H:%M:%S")
    epoch_seconds = parsed.timestamp()
    return int(epoch_seconds * 1000)
def csv_to_json(csv_file, past_data_file, output_file):
    """Merge review rows from *csv_file* with *past_data_file* and write the
    deduplicated result to *output_file* as JSON.

    Each CSV row becomes ``[timestamp_ms, subject_id, starting_srs_stage,
    incorrect_meaning_answers, incorrect_reading_answers]``.  Two rows are
    duplicates when they share the same (timestamp, subject_id) pair; the
    first occurrence wins, and past data is placed before the new rows so
    existing entries are the ones kept.

    Parameters
    ----------
    csv_file : path to the reviews CSV (must contain the columns used below)
    past_data_file : path to a JSON file holding a list of previous rows
    output_file : path the merged JSON list is written to

    Returns
    -------
    int
        The number of duplicate rows dropped during the merge.
    """
    df = pd.read_csv(csv_file)

    result = []
    for _, row in df.iterrows():
        # BUGFIX: pd.read_csv yields NumPy scalars (e.g. numpy.int64), which
        # json.dumps rejects with TypeError — cast to native Python int.
        formatted_row = [
            convert_to_unix_timestamp(row["created_at"]),
            int(row["subject_id"]),
            int(row["starting_srs_stage"]),
            int(row["incorrect_meaning_answers"]),
            int(row["incorrect_reading_answers"]),
        ]
        result.append(formatted_row)

    # Load previously accumulated rows; appending the fresh rows after them
    # means the past entry survives any (timestamp, subject_id) collision.
    with open(past_data_file, 'r') as file:
        past_data = json.load(file)
    combined_data = past_data + result

    # Deduplicate on (timestamp, subject_id). Dicts preserve insertion order,
    # so output ordering matches first occurrence in combined_data.
    unique_data = {}
    duplicates_count = 0
    for item in combined_data:
        key = (item[0], item[1])
        if key not in unique_data:
            unique_data[key] = item
        else:
            duplicates_count += 1

    # Serialize straight to the file; indent=None keeps it compact.
    with open(output_file, 'w') as output:
        json.dump(list(unique_data.values()), output, indent=None)

    return duplicates_count
if __name__ == "__main__":
    # Paths are hard-coded; edit these to point at your own reviews CSV,
    # accumulated past-data JSON, and desired output location.
    removed = csv_to_json('reviews.csv', 'past_data.json', 'output.txt')
    print(f"Duplicates removed: {removed}")
Advertisement
Add Comment
Please, Sign In to add comment