evanfuchs

ChatGPT Conversation Exporter - JSON to Markdown

Mar 8th, 2025
342
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.83 KB | None | 0 0
  1. """
  2. ChatGPT Conversation Exporter - JSON to Markdown
  3. ---------------------------------------------------
  4.  
  5. - Reads the `conversations.json` file.
  6. - Extracts metadata (title, timestamps, model used, etc.).
  7. - Organizes messages in chronological order.
  8. - Saves each conversation as a formatted .md file.
  9.  
  10. Instructions:
  11. - Extract conversations.json from the ChatGPT Export .zip file
  12. - Run the script in the same directory:
  13.     python export_conversations.py (or: python3 export_conversations.py)
  14. - Markdown files are saved in a folder `exported_chats`
  15.  
  16. """
  17. import json
  18. import re
  19. import os
  20. from datetime import datetime
  21.  
  22. # Load the conversations.json file
  23. file_path = "conversations.json"  # Ensure this is the correct path
  24. with open(file_path, "r", encoding="utf-8") as file:
  25.     conversations = json.load(file)
  26.  
  27. # Function to clean filenames (remove invalid characters)
  28. def clean_filename(title):
  29.     return re.sub(r'[<>:"/\\|?*]', '_', title)  # Replaces invalid characters with "_"
  30.  
  31. # Function to convert a UNIX timestamp to a readable format (UTC)
  32. def format_timestamp(timestamp):
  33.     if isinstance(timestamp, (int, float)):
  34.         return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S UTC")
  35.     return "Unknown Time"
  36.  
  37. # Function to remove hidden Unicode artifacts (Private Use Area characters, U+E000–U+F8FF) **without breaking formatting**
  38. def sanitize_text(text):
  39.     return re.sub(r"[\uE000-\uF8FF]", "", text)  # Removes Private Use Unicode characters (PUA)
  40.  
  41. # Function to process a single conversation and convert it to Markdown format
  42. def convert_conversation_to_markdown(convo):
  43.     title = convo.get("title", "Untitled Conversation")
  44.     filename = clean_filename(title) + ".md"  # Clean title for a safe filename
  45.  
  46.     # Extract metadata
  47.     conversation_id = convo.get("id", "Unknown ID")
  48.     created_time = format_timestamp(convo.get("create_time"))
  49.     updated_time = format_timestamp(convo.get("update_time"))
  50.     model = convo.get("model", "Unknown Model")
  51.     custom_gpt = convo.get("gpt_title", None)  # Custom GPT name if used
  52.  
  53.     # Build metadata block at the top of the file
  54.     metadata_block = f"""# {title}
  55.  
  56. ---
  57. Title: {title}
  58. Conversation ID: {conversation_id}
  59. Created: {created_time}
  60. Last Updated: {updated_time}
  61. Model: {model}"""
  62.  
  63.     if custom_gpt:
  64.         metadata_block += f"\nCustom GPT: {custom_gpt}"
  65.  
  66.     metadata_block += "\n---\n\n"
  67.  
  68.     markdown_output = metadata_block
  69.  
  70.     # Extract messages in chronological order using the mapping structure
  71.     mapping = convo.get("mapping", {})
  72.  
  73.     # Find the root message (no parent)
  74.     root_id = next((key for key, val in mapping.items() if val.get("parent") is None), None)
  75.  
  76.     if root_id:
  77.         queue = [root_id]
  78.         while queue:
  79.             current_id = queue.pop(0)
  80.             node = mapping.get(current_id, {})
  81.             message_data = node.get("message", {})
  82.  
  83.             if message_data:
  84.                 role = message_data.get("author", {}).get("role", "unknown")
  85.                 timestamp = message_data.get("create_time", None)
  86.                 formatted_time = format_timestamp(timestamp)
  87.                 content_parts = message_data.get("content", {}).get("parts", [])
  88.  
  89.                 # Sanitize text **without breaking formatting**
  90.                 content = "\n".join([sanitize_text(str(part)) for part in content_parts if isinstance(part, (str, int, float))]).strip()
  91.  
  92.                 if content:  # Avoid empty messages
  93.                     if role == "user":
  94.                         # Convert User's message into block quotes
  95.                         formatted_content = "\n".join([f"> {line}" for line in content.split("\n")])
  96.                         markdown_output += f"> **User ({formatted_time}):**\n>\n{formatted_content}\n\n"
  97.                     elif role == "assistant":
  98.                         markdown_output += f"**Assistant ({formatted_time}):** {content}\n\n"
  99.  
  100.             # Add children to the queue for processing in order
  101.             queue.extend(node.get("children", []))
  102.  
  103.     markdown_output += "**────────────**\n\n"  # Separator at the end of the conversation
  104.  
  105.     return filename, markdown_output
  106.  
  107. # Create an output folder for Markdown files
  108. output_folder = "exported_chats"
  109. os.makedirs(output_folder, exist_ok=True)
  110.  
  111. # Process each conversation and save it as a separate Markdown file
  112. for convo in conversations:
  113.     try:
  114.         filename, markdown_content = convert_conversation_to_markdown(convo)
  115.         md_file_path = os.path.join(output_folder, filename)
  116.  
  117.         with open(md_file_path, "w", encoding="utf-8") as md_file:
  118.             md_file.write(markdown_content)
  119.  
  120.         print(f" Saved: {md_file_path}")
  121.     except Exception as e:
  122.         print(f" Error processing conversation '{convo.get('title', 'Untitled')}': {e}")
  123.  
  124. print("\n All conversations have been processed!")
Advertisement
Add Comment
Please, Sign In to add comment