Guest User

forsen

a guest
Sep 8th, 2025
36
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 46.31 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. """
  4. Created on Sun Sep 6 2025 (v1)
  5. Updated on Sat Sep 7 2025 (v5)
  6. Updated on Sat Sep 8 2025 (v7.1)
  7.  
  8. @author: https://chatgpt.com/ # (free: auto)
  9. @author: https://gemini.google.com/ # (free: 2.5 PRO, 2.5 Flash)
  10. """
  11.  
  12. """
  13.  
  14. /chat
  15. TWITCH CHAT DOWNLOADER: https://www.twitchchatdownloader.com/
  16.  
  17. /emote
  18. TWITCH GLOBAL EMOTES: https://twitchemotes.com/
  19. TWITCH FORSEN EMOTES: https://twitchemotes.com/channels/22484632
  20. TWITCH SHARED EMOTES: https://twitch-tools.rootonline.de/emotes_search.php
  21. BTTV GLOBAL EMOTES: https://betterttv.com/emotes/global
  22. BTTV FORSEN EMOTES: https://betterttv.com/users/555943515393e61c772ee968
  23. BTTV SHARED EMOTES: https://betterttv.com/emotes/shared
  24. EMOJI TO IMAGE: https://jpeg-optimizer.com/emoji/
  25.  
  26. /user
  27. TWITCH USER IMAGE CDN: https://static-cdn.jtvnw.net/jtv_user_pictures/{ID}-profile_image-{DIM}.png
  28.    
  29. """
  30.  
  31. import math
  32. import re
  33. from collections import Counter
  34. from pathlib import Path
  35. from typing import List, Dict, Callable, Any, Tuple
  36.  
  37. import matplotlib.pyplot as plt
  38. import numpy as np
  39. import pandas as pd
  40. import requests
  41. import unicodedata
  42. from PIL import Image, ImageDraw, ImageFont, ImageSequence
  43. from bs4 import BeautifulSoup
  44. from matplotlib.figure import Figure
  45. from matplotlib.offsetbox import AnnotationBbox, OffsetImage
  46. from matplotlib.ticker import FuncFormatter
  47.  
  48. #from matplotlib import rcParams
  49. #rcParams['font.family'] = 'Segoe UI Emoji'
  50.  
# --- INPUT --- # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
CSV_NAME = "forsen_chat_08_09_2025" # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# --- Configuration ---
# All paths below are resolved relative to this script's location.
BASE_DIR = Path(__file__).parent
EMOTE_FOLDER = BASE_DIR / "emote"  # Emote images, downloaded manually
USER_FOLDER = BASE_DIR / "user"    # Profile images, downloaded automatically
OUTPUT_FOLDER = BASE_DIR / "chat_statistics_7"  # Destination for generated plots
FILE_PATH = BASE_DIR / ("chat/forsen/" + CSV_NAME + ".csv")

# **NEW**: Specify emotes/users for lifecycle plots.
# Will plot some combination of top10's if an empty list is given.
# NOTE(review): LIFECYCLE_USERS defaults to None while LIFECYCLE_EMOTES
# defaults to [] — confirm downstream code treats None the same as [].
LIFECYCLE_EMOTES = [] # e.g., ["LULE"]
LIFECYCLE_USERS = None # e.g., ["forsen"]

# --- FILTER ---
# Lowercased tokens excluded from word/emote statistics: stop words plus
# recurring channel boilerplate (bot announcements, links, commands).
NOT_EMOTES = ["", " ", "on", "the", "you", "to", "a", "for", "in", "is",\
              "chat", "it", "and", "i", "this", ".", "by", "so", "stream",\
              "forsen", "discord", "sure", "stay", "other", "following",\
              "typing", "subscribers", "instructions.", "forsenboys", "pleb",\
              "zone?", "free", "!discord", "https://twitter.com/forsen",\
              "up-to-date", "information:", "make", "follow", "@forsen",\
              "join", "!join", "looking", "no", "yes", "of", "game",\
              "twitter", "ta", "go", "let's"]

# --- Constants ---
EXPECTED_COLS = {"time", "user_name", "user_color", "message"}  # required CSV columns
IMAGE_EXTENSIONS = [".gif", ".jpg", ".png", ".webp"]  # cache lookup order for images
MAX_IMAGE_SIZE = 32  # max height (px) of composited emote images
IMAGE_PADDING_PIXELS = 6  # gap (px) between pasted emotes / bar ends and images
PLOT_WIDTH_EXTENSION_PIXELS = 79  # extra x-axis room (px) for images beside bars
  83.  
  84. # --- Helper Functions ---
  85.  
  86. def config_plot_figure(fig, ax):
  87.     ax.spines['top'].set_visible(False)
  88.     ax.spines['right'].set_visible(False)
  89.     ax.spines['bottom'].set_visible(True)  # keep x-axis line
  90.     ax.spines['left'].set_visible(True)    # keep y-axis line
  91.  
  92. def strip_word(word: str) -> str:
  93.     if word[0] == "@":
  94.         word = word[1:]
  95.     if word[-1] in [","]:
  96.         word = word[0:-1]
  97.     return word
  98.  
  99. def comma_format(x: float, pos: int) -> str:
  100.     """Formats a number with commas for plot axes."""
  101.     return f"{int(x):,}"
  102.  
  103. def percent_format(x: float, pos: int) -> str:
  104.     """Formats a number as a percentage string for plot axes."""
  105.     return f"{int(x)}%"
  106.  
  107. def clean_text(text: Any) -> str:
  108.     """Normalizes and cleans text content."""
  109.     if pd.isna(text):
  110.         return ""
  111.     return unicodedata.normalize("NFKC", str(text)).strip()
  112.  
  113. # --- Image Loading and Fetching ---
  114.  
  115. def _load_image(filepath: Path) -> Image.Image | None:
  116.     """Loads an image file, handling animated GIFs."""
  117.     try:
  118.         img = Image.open(filepath)
  119.         if filepath.suffix == ".gif":
  120.             # Use the middle frame of a GIF for a static representation
  121.             frames = [frame.copy() for frame in ImageSequence.Iterator(img)]
  122.             img = frames[len(frames)//2]
  123.         return img.convert("RGBA")
  124.     except Exception as e:
  125.         print(f"Warning: Could not load image {filepath}. Reason: {e}")
  126.         return None
  127.  
  128. def load_emote_image(emote_name: str) -> Image.Image | None:
  129.     """Loads a pre-downloaded emote image from the emote folder."""
  130.     for ext in IMAGE_EXTENSIONS:
  131.         filepath = EMOTE_FOLDER / f"{emote_name}{ext}"
  132.         if filepath.exists():
  133.             return _load_image(filepath)
  134.     return None
  135.  
  136. def create_combo_image(combo_string: str, drop_duplicates: bool = True) -> Image.Image | None:
  137.     """
  138.    Creates a single composite image from one or more emote images.
  139.    """
  140.     emote_names = combo_string.split()
  141.     if not emote_names:
  142.         return None
  143.    
  144.     # DEBUG
  145.     # print("[DEBUG] create_combo_image:")
  146.     # print(emote_names)
  147.  
  148.     # Handle special character emotes
  149.     name_map = {
  150.     "🔔": "emoji_bell",
  151.     "🔇": "emoji_no-sound",
  152.     "🔴": "emoji_red-circle",
  153.     "🟠": "emoji_orange-circle",
  154.     "🟡": "emoji_yellow-circle",
  155.     "🟢": "emoji_green-circle",
  156.     "🔵": "emoji_blue-circle",
  157.     "🟣": "emoji_purple-circle",
  158.     "🟤": "emoji_brown-circle",
  159.     "⚫": "emoji_black-circle",
  160.     "⚪": "emoji_white-circle",
  161.     "💯": "emoji_100%",
  162.     "100%": "emoji_100%"
  163.     }
  164.     emote_names = [name_map.get(name, name) for name in emote_names]
  165.  
  166.     images = [load_emote_image(name) for name in emote_names if load_emote_image(name) is not None]
  167.     if drop_duplicates:
  168.         unique_images = []
  169.         for image in images:
  170.             if image not in unique_images:
  171.                 unique_images.append(image)
  172.         images = unique_images
  173.     if not images:
  174.         return None
  175.  
  176.     # If there's only one image, return it directly
  177.     if len(images) == 1:
  178.         return images[0]
  179.  
  180.     # Calculate dimensions for the composite image
  181.     max_h = max(img.height for img in images)
  182.     total_w = sum(img.width for img in images) + IMAGE_PADDING_PIXELS * (len(images) - 1)
  183.  
  184.     # Create a new blank image
  185.     combo_img = Image.new('RGBA', (total_w, max_h), (0, 0, 0, 0))
  186.  
  187.     # Paste images with padding
  188.     x_offset = 0
  189.     for img in images:
  190.         y_offset = int((max_h - img.height) / 2)
  191.         combo_img.paste(img, (x_offset, y_offset))
  192.         x_offset += img.width + IMAGE_PADDING_PIXELS
  193.  
  194.     # Resize the final composite image
  195.     combo_img.thumbnail((MAX_IMAGE_SIZE * len(images), MAX_IMAGE_SIZE), Image.Resampling.LANCZOS)
  196.     return combo_img
  197.  
  198. def fetch_user_profile_image(username: str, save_path: Path) -> bool:
  199.     """
  200.    Scrapes a Twitch user's page for their profile image URL and downloads it.
  201.    """
  202.     if save_path.exists():
  203.         return True
  204.  
  205.     match = re.search(r"\((.*?)\)", username)
  206.     sanitized_username = match.group(1).strip() if match else username.strip()
  207.  
  208.     try:
  209.         url = f"https://www.twitch.tv/{sanitized_username}"
  210.         response = requests.get(url, timeout=10)
  211.         response.raise_for_status()
  212.  
  213.         soup = BeautifulSoup(response.text, "html.parser")
  214.         img_url = None
  215.         for meta_tag in soup.find_all("meta"):
  216.             content = meta_tag.get("content", "")
  217.             if "static-cdn.jtvnw.net" in content and "-profile_image-" in content:
  218.                 img_url = content
  219.                 break
  220.        
  221.         if not img_url:
  222.             print(f"Warning: Could not find profile image URL for {username}.")
  223.             return False
  224.  
  225.         img_data = requests.get(img_url, timeout=10).content
  226.         save_path.parent.mkdir(parents=True, exist_ok=True)
  227.         with open(save_path, "wb") as f:
  228.             f.write(img_data)
  229.         return True
  230.  
  231.     except requests.RequestException as e:
  232.         print(f"Error fetching data for {username}: {e}")
  233.         return False
  234.     except Exception as e:
  235.         print(f"An unexpected error occurred for {username}: {e}")
  236.         return False
  237.  
  238. def load_user_image(username: str) -> Image.Image | None:
  239.     """
  240.    Loads a user's profile image. Checks local cache first, then fetches
  241.    from Twitch if not found.
  242.    """
  243.  
  244.     for ext in IMAGE_EXTENSIONS:
  245.         filepath = USER_FOLDER / f"{username}{ext}"
  246.         if filepath.exists():
  247.             return _load_image(filepath)
  248.    
  249.     save_path = USER_FOLDER / f"{username}.png"
  250.     if fetch_user_profile_image(username, save_path):
  251.         return _load_image(save_path)
  252.        
  253.     return None
  254.  
  255. # --- Data Loading and Analysis ---
  256.  
  257. def load_and_clean_data(file_path: Path) -> pd.DataFrame:
  258.     """Loads chat data from a CSV and performs initial cleaning."""
  259.     df = pd.read_csv(file_path, encoding="utf-8", quotechar='"', on_bad_lines="skip")
  260.     if not EXPECTED_COLS.issubset(df.columns):
  261.         raise ValueError(f"CSV missing one of the expected columns: {EXPECTED_COLS}")
  262.     df["message"] = df["message"].map(clean_text)
  263.     return df
  264.  
  265. def analyze_top_words_by_freq(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, int]]:
  266.     """Calculates the most frequent words by total occurrences."""
  267.     # UPDATED: Added a condition to ignore words in NOT_EMOTES
  268.     words = [
  269.         word for msg in df["message"]
  270.         for word in msg.split()
  271.         if word.lower() not in NOT_EMOTES and len(word.lower())>1
  272.     ]
  273.     return Counter(words).most_common(top_n)
  274.  
  275. def analyze_popular_words(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, int]]:
  276.     """
  277.    Finds the most popular words, measured by the number of unique messages they appear in.
  278.    This is calculated over all words in the chat log for accuracy.
  279.    """
  280.     # Create a Series of words, keeping the original message index
  281.     words_series = df['message'].str.split().explode().dropna()
  282.  
  283.     # Convert the Series to a DataFrame, naming the new word column 'word'
  284.     words_df = words_series.reset_index(name='word')
  285.  
  286.     # UPDATED: Filter out any words that are in the NOT_EMOTES set
  287.     words_df = words_df[~words_df['word'].str.lower().isin(NOT_EMOTES)]
  288.     words_df = words_df[words_df['word'].str.len() > 1]
  289.  
  290.     # Get unique (message index, word) pairs
  291.     unique_word_per_message = words_df.drop_duplicates(subset=['index', 'word'])
  292.  
  293.     # Count how many unique messages each word appeared in
  294.     message_counts = unique_word_per_message['word'].value_counts()
  295.     return list(message_counts.head(top_n).items())
  296.  
  297. def analyze_consistent_words(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, float]]:
  298.     """
  299.    Finds the most consistent words, measured by the percentage of total stream minutes
  300.    in which they appeared at least once. This is calculated over all words for accuracy.
  301.    """
  302.     if 'time' not in df.columns or df['time'].max() == 0:
  303.         return []
  304.  
  305.     total_stream_seconds = df['time'].max()
  306.     total_minutes = math.ceil(total_stream_seconds / 60)
  307.     if total_minutes == 0:
  308.         return []
  309.  
  310.     # Create a working copy with a 'minute' column
  311.     work_df = df[['time', 'message']].copy()
  312.     work_df['minute'] = work_df['time'] // 60
  313.  
  314.     # Explode messages into words, keeping the 'minute' for each word
  315.     words_in_minutes = work_df.drop(columns=['time'])
  316.     words_in_minutes['word'] = words_in_minutes['message'].str.split()
  317.     words_in_minutes = words_in_minutes.explode('word').dropna(subset=['word'])
  318.  
  319.     # UPDATED: Filter out any words that are in the NOT_EMOTES set
  320.     words_in_minutes = words_in_minutes[~words_in_minutes['word'].str.lower().isin(NOT_EMOTES)]
  321.     words_in_minutes = words_in_minutes[words_in_minutes['word'].str.len() > 1]
  322.  
  323.     # Find unique (word, minute) pairs
  324.     unique_word_minute_pairs = words_in_minutes[['word', 'minute']].drop_duplicates()
  325.     # Count the number of unique minutes each word appeared in
  326.     minute_counts = unique_word_minute_pairs['word'].value_counts()
  327.  
  328.     # Calculate presence percentage
  329.     presence_percentages = (minute_counts / total_minutes) * 100
  330.    
  331.     ### print(sorted(presence_percentages.items(), key=lambda x: x[1], reverse=True)[:20]) # debug
  332.    
  333.     # Get the top N and format for output
  334.     top_consistent = presence_percentages.head(top_n)
  335.     return sorted(top_consistent.items(), key=lambda x: x[1], reverse=True)
  336.  
  337. def analyze_emotes_by_users(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, int]]:
  338.     """
  339.    Finds the top emotes based on the number of distinct users who used them.
  340.    """
  341.     # Use a dictionary where keys are emotes and values are sets of user names
  342.     emote_to_users = {}
  343.  
  344.     # Iterate through each message
  345.     for _, row in df.iterrows():
  346.         user = row['user_name']
  347.         # Split message into unique words to count a user only once per message for an emote
  348.         words = set(str(row['message']).split())
  349.        
  350.         for word in words:
  351.             # Basic filtering for non-emotes
  352.             if word.lower() in NOT_EMOTES or len(word) <= 1:
  353.                 continue
  354.            
  355.             # Initialize the set if the emote is new
  356.             if word not in emote_to_users:
  357.                 emote_to_users[word] = set()
  358.            
  359.             # Add the user to the set for that emote
  360.             emote_to_users[word].add(user)
  361.  
  362.     # Count the number of unique users for each emote
  363.     emote_distinct_user_counts = {
  364.         emote: len(users) for emote, users in emote_to_users.items()
  365.     }
  366.  
  367.     # Sort the emotes by the distinct user count in descending order
  368.     sorted_emotes = sorted(
  369.         emote_distinct_user_counts.items(),
  370.         key=lambda item: item[1],
  371.         reverse=True
  372.     )
  373.    
  374.     return sorted_emotes[:top_n]
  375.  
  376. def analyze_user_activity(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
  377.     """Identifies the most active chatters and returns a DataFrame."""
  378.     user_counts = df['user_name'].value_counts()
  379.     top_chatters_df = user_counts.head(top_n).to_frame(name='message_count')
  380.     top_chatters_df.index.name = 'user_name'
  381.     return top_chatters_df
  382.  
  383. def analyze_consistent_users(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, float]]:
  384.     """
  385.    Finds the most consistent chatters, measured by the percentage of total stream minutes
  386.    in which they sent at least one message. Calculated over all users for accuracy.
  387.    """
  388.     if 'time' not in df.columns or df['time'].max() == 0:
  389.         return []
  390.  
  391.     total_stream_seconds = df['time'].max()
  392.     total_minutes = math.ceil(total_stream_seconds / 60)
  393.     if total_minutes == 0:
  394.         return []
  395.  
  396.     # Create a working copy with a 'minute' column
  397.     work_df = df[['time', 'user_name']].copy()
  398.     work_df['minute'] = work_df['time'] // 60
  399.  
  400.     # Find unique (user, minute) pairs
  401.     unique_user_minute_pairs = work_df[['user_name', 'minute']].drop_duplicates()
  402.  
  403.     # Count the number of unique minutes each user was active in
  404.     minute_counts = unique_user_minute_pairs['user_name'].value_counts()
  405.  
  406.     # Calculate presence percentage
  407.     presence_percentages = (minute_counts / total_minutes) * 100
  408.  
  409.     # Get the top N and format for output
  410.     top_consistent = presence_percentages.head(top_n)
  411.     return sorted(top_consistent.items(), key=lambda x: x[1], reverse=True)
  412.  
  413. def analyze_original_chatters(df: pd.DataFrame, top_n: int = 10) -> Tuple[pd.Series, pd.DataFrame]:
  414.     """Identifies users who post the most unique (single-occurrence) messages."""
  415.     message_counts = df['message'].value_counts()
  416.     unique_messages = message_counts[message_counts == 1].index
  417.     unique_df = df[df['message'].isin(unique_messages)]
  418.     top_original_chatters = unique_df['user_name'].value_counts().head(top_n)
  419.     return top_original_chatters, unique_df
  420.  
  421. def analyze_message_bins(user_msg_counts: pd.Series) -> Dict[str, int]:
  422.     """Groups users into bins based on how many messages they've sent."""
  423.     counts_by_num_messages = user_msg_counts.value_counts().sort_index()
  424.     return {
  425.         "1": counts_by_num_messages.get(1, 0),
  426.         "2–10": counts_by_num_messages[(counts_by_num_messages.index >= 2) & (counts_by_num_messages.index <= 10)].sum(),
  427.         "11–100": counts_by_num_messages[(counts_by_num_messages.index >= 11) & (counts_by_num_messages.index <= 100)].sum(),
  428.         "101–1000": counts_by_num_messages[(counts_by_num_messages.index >= 101) & (counts_by_num_messages.index <= 1000)].sum(),
  429.         "1000+": counts_by_num_messages[counts_by_num_messages.index > 1000].sum()
  430.     }
  431.  
  432. def merge_overlapping_chains(pairs_with_counts, conservative_merge: bool = False):
  433.     """
  434.    Merge overlapping sequences in a list of (sequence, count)
  435.    Example input: [("x y", 5), ("y z", 3), ("a b", 2)]
  436.    Output: [("x y z", 3), ("a b", 2)]
  437.    Excludes last item if it is the same as first.
  438.    """
  439.     # Convert sequences to lists
  440.     seqs = [(seq.split(), count) for seq, count in pairs_with_counts]
  441.     merged = []
  442.  
  443.     while seqs:
  444.         seq, count = seqs.pop(0)
  445.         i = 0
  446.         while i < len(seqs):
  447.             other_seq, other_count = seqs[i]
  448.  
  449.             # Check if sequences overlap (end of seq == start of other_seq)
  450.             if seq[-1] == other_seq[0]:
  451.                 seq = seq + other_seq[1:]
  452.                 if conservative_merge:
  453.                     count = min(count, other_count) # conservative count
  454.                 else:
  455.                     count = max(count, other_count) # optimistic count
  456.                 # else: average count ?
  457.  
  458.                 seqs.pop(i)
  459.                 i = 0  # restart loop
  460.             elif other_seq[-1] == seq[0]:
  461.                 seq = other_seq[:-1] + seq
  462.                 count = min(count, other_count)
  463.                 seqs.pop(i)
  464.                 i = 0
  465.             else:
  466.                 i += 1
  467.  
  468.         # Exclude last item if same as first
  469.         if len(seq) > 1 and seq[-1] == seq[0]:
  470.             seq = seq[:-1]
  471.  
  472.         merged.append((" ".join(seq), count))
  473.    
  474.     return merged
  475.  
  476. from typing import List, Tuple, Set
  477.  
def prune_top_n_subsets(combos: List[Tuple[List[str], int]], top_n: int = 10) -> List[Tuple[List[str], int]]:
    """
    Eliminates cyclical subsets from a list of combinations iteratively.

    This function prunes a list of (list_of_strings, integer_value) tuples.
    It identifies pairs where one list is a "cyclical subset" of another
    (i.e., its elements are a subset of the other's, order-agnostic).

    The pruning logic is applied iteratively within a "top_n" window:
    1. The list is sorted by the integer value in descending order.
    2. The top N items are compared against each other.
    3. For any subset/superset pair, the one with the higher value is kept.
    4. If values are equal, the one with the longer list is kept.
    5. Inferior items are removed from the full list.
    6. This process repeats until a pass over the top N items removes nothing.

    Args:
        combos: A list of tuples, where each tuple contains a list of strings
                and an associated integer value.
        top_n: The size of the window at the top of the sorted list to
               perform comparisons within.

    Returns:
        A pruned and sorted list of tuples.

    NOTE(review): in practice the first tuple element may be a space-joined
    string (see analyze_top_emote_combos) rather than a list; both code
    paths below tolerate either, but confirm which shape callers pass.
    """
    # Use a mutable copy to work with
    pruned_combos = list(combos)

    while True:
        # 1. Sort the list by value (desc) and then length (desc) as a tie-breaker.
        # This sort is crucial for the "top_n" window logic and ensures stable processing.
        pruned_combos.sort(key=lambda x: (x[1], len(x[0])), reverse=True)

        # Optimization: if the list is smaller than the window, no need for a window
        window_size = min(top_n, len(pruned_combos))
        if window_size < 2:
            break

        window = pruned_combos[:window_size]

        # Pre-calculate sets for efficient comparison
        # (the trailing enumerate index in each tuple is unpacked but unused below)
        window_sets = [(frozenset(item[0]), item[1], i) for i, item in enumerate(window)]
        # NOTE(review): str(item[0]) is the repr of the combo entry, so the
        # substring check below compares reprs — confirm that repr-level
        # containment is the intended "sub-combo" relation.
        window_strs = [(str(item[0]), item[1], i) for i, item in enumerate(window)]

        indices_to_remove = set()

        # 2. Compare every item in the window against every other item
        for i in range(window_size):
            set_i, val_i, _ = window_sets[i]
            len_i = len(set_i)
            str_i, sal_i, _ = window_strs[i]
            sen_i = len(str_i)

            for j in range(window_size):
                if i == j:
                    continue

                set_j, val_j, _ = window_sets[j]
                len_j = len(set_j)
                str_j, sal_j, _ = window_strs[j]
                sen_j = len(str_j)

                # Check for subset relationship
                if set_i.issubset(set_j):
                    # i is a subset of j. Determine which one to remove.
                    # We compare them based on (value, length). The smaller one gets removed.
                    if len_i < len_j: #(val_i, len_i) < (val_j, len_j):
                        indices_to_remove.add(i)
                    if len_i == len_j and val_i < val_j:
                        indices_to_remove.add(i)
                    # Note: If (val_j, len_j) < (val_i, len_i), j would be marked
                    # for removal in its own inner loop iteration.

                # Check for substr relationship
                if str_i in str_j:
                    # i is a subset of j. Determine which one to remove.
                    # We compare them based on (value, length). The smaller one gets removed.
                    if sen_i < sen_j: #(val_i, len_i) < (val_j, len_j):
                        indices_to_remove.add(i)
                    if sen_i == sen_j and sal_i < sal_j:
                        indices_to_remove.add(i)
                    # Note: If (val_j, len_j) < (val_i, len_i), j would be marked
                    # for removal in its own inner loop iteration.

        if not indices_to_remove:
            # 5. If no items were removed in this pass, the process is stable.
            break

        # 4. Rebuild the list, excluding the identified inferior items.
        # We get the actual items to remove from the original window list
        # NOTE(review): building this set requires item[0] to be hashable —
        # fine for strings, would raise for actual lists; see note above.
        items_to_remove: Set[Tuple[List[str], int]] = {window[i] for i in indices_to_remove}

        # Create a frozenset of tuples for efficient lookup
        items_to_remove_set = { (tuple(sorted(item[0])), item[1]) for item in items_to_remove }

        pruned_combos = [
            item for item in pruned_combos
            if (tuple(sorted(item[0])), item[1]) not in items_to_remove_set
        ]

    return pruned_combos
  579.  
  580. def canonical_repeating_words_segment(s: str) -> str:    
  581.     """
  582.    Trim string s from left and right until a repeating substring is found.
  583.    Trim process requires the substring of same set size as previous.
  584.    """
  585.    
  586.     def srs(s: str) -> str: # smallest_repeating_substring
  587.         n = len(s)
  588.         for i in range(1, n + 1):
  589.             # Check if prefix of length i can generate the whole string
  590.             if n % i == 0:  # substring length must divide total length
  591.                 candidate = s[:i]
  592.                 if candidate * (n // i) == s:
  593.                     return candidate
  594.         return s  # fallback (the whole string is the smallest unit)
  595.    
  596.     def tokenize(x: str) -> set[str]:
  597.         return set(srs(x).split())
  598.  
  599.     words = s.split()
  600.     best = srs(s)
  601.     best_tokens = tokenize(s)
  602.  
  603.     # Try shrinking from both sides
  604.     for direction in ("left", "right"):
  605.         for k in range(1, len(words)):
  606.             if direction == "left":
  607.                 candidate_str = " ".join(words[k:])
  608.             else:
  609.                 candidate_str = " ".join(words[:-k])
  610.  
  611.             candidate_srs = srs(candidate_str)
  612.             candidate_tokens = set(candidate_srs.split())
  613.  
  614.             # If set size is the same, update best
  615.             if len(candidate_tokens) == len(best_tokens):
  616.                 best = candidate_srs
  617.                 best_tokens = candidate_tokens
  618.             else:
  619.                 # stop shrinking further in this direction
  620.                 break
  621.  
  622.     return best
  623.  
  624. def analyze_top_emote_combos(
  625.     df: pd.DataFrame,
  626.     top_n: int = 10
  627. ) -> List[Tuple[str, int]]:
  628.     combo_counter = Counter()
  629.  
  630.     for message in df['message']:
  631.         message_sanitized = message.replace("  ", " ")
  632.         combo = canonical_repeating_words_segment(message_sanitized)
  633.         if len(set(combo.split())) > 1:
  634.             combo_counter[combo] += 1
  635.  
  636.     results = [(key, count) for key, count in combo_counter.most_common(top_n)]
  637.  
  638.     return results[:top_n]
  639.  
  640. def analyze_first_time_chatters(df: pd.DataFrame) -> pd.Series:
  641.     """Finds the timestamp of the first message for each unique user."""
  642.     return df.groupby('user_name')['time'].min()
  643.  
  644. def analyze_most_mentioned_chatters(df: pd.DataFrame, top_n: int = 10) -> List[Tuple[str, int]]:
  645.     """
  646.    Finds the most mentioned chatters in messages (words starting with @).
  647.    """
  648.     mentions = []
  649.     for msg in df["message"]:
  650.         for word in msg.split():
  651.             if word.startswith("@") and len(word) > 1:
  652.                 word = strip_word(word)
  653.                 mentions.append(word.lower())  # lowercase for consistency
  654.     return Counter(mentions).most_common(top_n)
  655.  
  656. def analyze_emote_lifecycle(df: pd.DataFrame, emote_name: str) -> pd.Series:
  657.     """Analyzes the usage of a specific emote over time."""
  658.     emote_usage = df['message'].str.contains(rf"\b{re.escape(emote_name)}\b", regex=True, na=False)
  659.     emote_df = df[emote_usage]
  660.     if emote_df.empty:
  661.         return pd.Series(dtype=float)
  662.     bin_size = 60
  663.     bins = np.arange(0, df['time'].max() + bin_size, bin_size)
  664.     counts, edges = np.histogram(emote_df['time'], bins=bins)
  665.     return pd.Series(counts, index=edges[:-1] / 60)
  666.  
  667. def analyze_user_lifecycle(df: pd.DataFrame, user_name: str) -> pd.Series:
  668.     """Analyzes the activity of a specific user over time."""
  669.     user_df = df[df['user_name'] == user_name]
  670.     if user_df.empty:
  671.         return pd.Series(dtype=float)
  672.     bin_size = 60
  673.     bins = np.arange(0, df['time'].max() + bin_size, bin_size)
  674.     counts, edges = np.histogram(user_df['time'], bins=bins)
  675.     return pd.Series(counts, index=edges[:-1] / 60)
  676.  
  677. # --- Plotting ---
  678.  
def add_images_to_bars(ax: plt.Axes, bars: plt.bar, labels: List[str], image_loader: Callable[[str], Image.Image | None]):
    """Adds images next to the bars of a horizontal bar chart.

    For each (bar, label) pair, image_loader(label) is called; when it
    returns an image, the image is scaled to the bar's on-screen height and
    anchored just past the bar's right end.  Bars whose loader returns None
    are left unchanged.

    NOTE(review): the `bars: plt.bar` annotation names a function, not a
    type — the actual argument is the container returned by ax.barh().
    """
    fig = ax.figure
    renderer = fig.canvas.get_renderer()
    for bar, label in zip(bars, labels):
        img = image_loader(label)
        if img is not None:
            # Scale the image so its height matches the bar's pixel height.
            bbox = bar.get_window_extent(renderer=renderer)
            bar_height_pixels = bbox.height
            img_width, img_height = img.size
            zoom = bar_height_pixels / img_height # zoom = min(MAX_IMAGE_SIZE / max(img_width, img_height), bar_height_pixels / img_height)
            offset_img = OffsetImage(img, zoom=zoom)
            # Convert IMAGE_PADDING_PIXELS into data units so the gap between
            # the bar end and the image stays constant on screen.
            inv = ax.transData.inverted()
            px_offset = inv.transform((IMAGE_PADDING_PIXELS, 0))[0] - inv.transform((0, 0))[0]
            # Anchor the image's left-center at the bar's right end plus padding.
            ab = AnnotationBbox(offset_img, (bar.get_width() + px_offset, bar.get_y() + bar.get_height() / 2), frameon=False, box_alignment=(0, 0.5))
            ax.add_artist(ab)
  695.  
  696. def extend_plot_width(ax: plt.Axes, extra_pixels: int):
  697.     """Extends the x-axis limit to make space for images and labels."""
  698.     fig = ax.figure
  699.     renderer = fig.canvas.get_renderer()
  700.     xlim = ax.get_xlim()
  701.     bbox = ax.get_window_extent(renderer=renderer)
  702.     ratio = (xlim[1] - xlim[0]) / bbox.width
  703.     ax.set_xlim(xlim[0], xlim[1] + extra_pixels * ratio)
  704.  
  705. import textwrap
  706. import matplotlib.pyplot as plt
  707. from typing import List, Tuple, Callable
  708. from PIL import Image
  709.  
  710. def create_horizontal_bar_chart(
  711.     data: List[Tuple[str, int]] | pd.Series,
  712.     title: str,
  713.     color: str,
  714.     image_loader: Callable[[str], Image.Image | None] = None,
  715.     max_label_width: int = 30,  # default chars per line
  716.     label_fontsize: int = 10
  717. ) -> plt.Figure:
  718.     """Generic function to create a styled horizontal bar chart with adaptive label wrapping."""
  719.  
  720.     if isinstance(data, pd.Series):
  721.         labels, counts = data.index, data.values
  722.     else:
  723.         labels, counts = zip(*data)
  724.  
  725.     wrapped_labels = []
  726.     line_counts = []
  727.     for label in labels:
  728.         wrapped = textwrap.wrap(label, width=max_label_width)
  729.         # If label is more than 3 lines, allow wider wrapping
  730.         if len(wrapped) > 3:
  731.             wrapped = textwrap.wrap(label, width=int(max_label_width * 1.3))
  732.         wrapped_labels.append("\n".join(wrapped))
  733.         line_counts.append(len(wrapped))
  734.  
  735.     fig, ax = plt.subplots(figsize=(10, 6))
  736.     bars = ax.barh(wrapped_labels, counts, color=color)
  737.     ax.set_title(title)
  738.     ax.invert_yaxis()
  739.     ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
  740.  
  741.     if image_loader:
  742.         extend_plot_width(ax, PLOT_WIDTH_EXTENSION_PIXELS)
  743.         add_images_to_bars(ax, bars, labels, image_loader)
  744.  
  745.     inv = ax.transData.inverted()
  746.     text_offset_x = inv.transform((5, 0))[0] - inv.transform((0, 0))[0]
  747.     for bar in bars:
  748.         ax.text(
  749.             text_offset_x,
  750.             bar.get_y() + bar.get_height() / 2,
  751.             f"{int(bar.get_width()):,}",
  752.             va='center',
  753.             ha='left',
  754.             fontweight='bold'
  755.         )
  756.  
  757.     # Adjust font size per-label depending on number of lines
  758.     for tick_label, n_lines in zip(ax.get_yticklabels(), line_counts):
  759.         if n_lines > 3:
  760.             tick_label.set_fontsize(label_fontsize - 2)  # shrink a bit
  761.         else:
  762.             tick_label.set_fontsize(label_fontsize)
  763.  
  764.     plt.tight_layout()
  765.     config_plot_figure(fig, ax)
  766.  
  767.     return fig
  768.  
  769. def create_presence_bar_chart(data: List[Tuple[str, float]], title: str, color: str, image_loader: Callable[[str], Image.Image | None] = None) -> Figure:
  770.     """Creates a horizontal bar chart for presence percentage data."""
  771.     if not data:
  772.         fig, ax = plt.subplots(figsize=(10, 6))
  773.         ax.set_title(title); ax.text(0.5, 0.5, "No data.", ha='center', va='center'); ax.set_xlim(0, 100); ax.set_yticks([]); return fig
  774.  
  775.     labels, percentages = zip(*data)
  776.     fig, ax = plt.subplots(figsize=(10, 6))
  777.     bars = ax.barh(labels, percentages, color=color)
  778.     ax.set_title(title); ax.invert_yaxis(); ax.xaxis.set_major_formatter(FuncFormatter(percent_format)); ax.set_xlim(0, 105)
  779.  
  780.     if image_loader:
  781.         extend_plot_width(ax, PLOT_WIDTH_EXTENSION_PIXELS)
  782.         add_images_to_bars(ax, bars, labels, image_loader)
  783.  
  784.     inv = ax.transData.inverted()
  785.     text_offset_x = inv.transform((5, 0))[0] - inv.transform((0, 0))[0]
  786.     for bar in bars:
  787.         ax.text(text_offset_x, bar.get_y() + bar.get_height() / 2, f"{bar.get_width():.1f}%", va='center', ha='left', fontweight='bold')
  788.  
  789.     plt.tight_layout()
  790.  
  791.     config_plot_figure(fig, ax)
  792.  
  793.     return fig
  794.  
  795. def plot_user_bins(bins_data: Dict[str, int]) -> Figure:
  796.     """Plots the number of users by message count bins."""
  797.     fig, ax = plt.subplots(figsize=(10, 6))
  798.     ax.bar(bins_data.keys(), bins_data.values(), color='mediumpurple')
  799.     ax.set_title("Number of Chatters (Bins by Total Messages Sent)")
  800.     for x, y in bins_data.items():
  801.         ax.text(x, y + 0.5, f"{y:,}", ha='center', va='bottom', fontweight='bold')
  802.     plt.tight_layout()
  803.  
  804.     config_plot_figure(fig, ax)
  805.  
  806.     return fig
  807.  
  808. def plot_messages_over_time(df: pd.DataFrame) -> Figure:
  809.     """Plots the volume of messages over time."""
  810.     time_seconds = df['time']
  811.     bins = np.arange(0, time_seconds.max() + 60, 60)
  812.     counts, edges = np.histogram(time_seconds, bins=bins)
  813.     minutes = edges[:-1] / 60
  814.    
  815.     fig, ax = plt.subplots(figsize=(10, 6))
  816.     ax.fill_between(minutes, counts, color='royalblue', alpha=0.6)
  817.     ax.plot(minutes, counts, color='royalblue', alpha=0.9)
  818.     ax.set_title("Number of Messages Over Time (by Minute into Stream)")
  819.     ax.grid(True, linestyle='--', alpha=0.5)
  820.     plt.tight_layout()
  821.  
  822.     config_plot_figure(fig, ax)
  823.  
  824.     return fig
  825.  
  826. def plot_first_time_chatters(first_message_times: pd.Series) -> Figure:
  827.     """Plots when users sent their first message, styled like the messages-over-time plot."""
  828.     time_seconds = first_message_times
  829.     bins = np.arange(0, time_seconds.max() + 60, 60)  # 1-min bins
  830.     counts, edges = np.histogram(time_seconds, bins=bins)
  831.     minutes = edges[:-1] / 60
  832.  
  833.     fig, ax = plt.subplots(figsize=(10, 6))
  834.     ax.fill_between(minutes, counts, color='teal', alpha=0.6)
  835.     ax.plot(minutes, counts, color='teal', alpha=0.9)
  836.     ax.set_title("First-Time Chatters during this Stream (by Minute into Stream)")
  837.     ax.grid(True, linestyle='--', alpha=0.5)
  838.     plt.tight_layout()
  839.  
  840.     config_plot_figure(fig, ax)
  841.  
  842.     return fig
  843.  
  844. def format_minutes_to_hhmm(minutes: float) -> str:
  845.     """Convert minutes into HH:MM format (rounded)."""
  846.     total_minutes = int(round(minutes))
  847.     hours, mins = divmod(total_minutes, 60)
  848.     return f"{hours:02d}:{mins:02d}"
  849.  
  850. def add_peak_annotation(ax: plt.Axes, usage_data: pd.Series, label_name: str, color: str = "black"):
  851.     """Adds a sideways arrow pointing to the peak value with HH:MM and descriptive annotation.
  852.    
  853.    label_name = emote or username (used in the description line).
  854.    """
  855.     if usage_data.empty or usage_data.max() == 0:
  856.         return
  857.  
  858.     peak_idx = usage_data.idxmax()
  859.     peak_val = usage_data.max()
  860.     hhmm = format_minutes_to_hhmm(peak_idx)
  861.  
  862.     # Label text with time, count, and descriptive note
  863.     label_text = "?"
  864.     if "'" in label_name:
  865.         label_text = f'{hhmm} ({peak_val} msgs)\nMost "{label_name}" moment of the stream.'
  866.     else:
  867.         label_text = f"{hhmm} ({peak_val} msgs)\nMost '{label_name}' moment of the stream."
  868.  
  869.     # Positioning logic
  870.     xlim = ax.get_xlim()
  871.     midpoint = (xlim[0] + xlim[1]) / 2
  872.     x_offset = (xlim[1] - xlim[0]) * 0.08  # 8% of axis width as offset
  873.  
  874.     if peak_idx < midpoint:  
  875.         # Peak is in left half → place label to the right
  876.         xytext = (peak_idx + x_offset, peak_val)
  877.         ha = "left"
  878.     else:
  879.         # Peak is in right half → place label to the left
  880.         xytext = (peak_idx - x_offset, peak_val)
  881.         ha = "right"
  882.  
  883.     ax.annotate(
  884.         label_text,
  885.         xy=(peak_idx, peak_val),
  886.         xytext=xytext,
  887.         arrowprops=dict(
  888.             facecolor=color,
  889.             arrowstyle="->",
  890.             lw=1.2
  891.         ),
  892.         va="center",
  893.         ha=ha,
  894.         fontsize=12,
  895.         fontweight="bold",
  896.         color=color,
  897.     )
  898.  
  899.  
  900. def plot_emote_lifecycle(usage_data: pd.Series, emote: str) -> Figure:
  901.     """Plots the usage of a specific emote over time, with peak annotation."""
  902.     fig, ax = plt.subplots(figsize=(10, 6))
  903.     ax.fill_between(usage_data.index, usage_data.values, color='darkviolet', alpha=0.6)
  904.     ax.plot(usage_data.index, usage_data.values, color='darkviolet', alpha=0.9)
  905.     ax.set_title(f"Emote Lifecycle: '{emote}' containing Messages (by Minute into Stream)")
  906.     ax.grid(True, linestyle='--', alpha=0.5)
  907.  
  908.     # Annotate peak
  909.     add_peak_annotation(ax, usage_data, emote, color="darkviolet")
  910.  
  911.     plt.tight_layout()
  912.  
  913.     config_plot_figure(fig, ax)
  914.  
  915.     return fig
  916.  
  917. def plot_user_lifecycle(activity_data: pd.Series, user: str, color='darkcyan') -> Figure:
  918.     """Plots the activity of a specific user over time, with peak annotation."""
  919.     fig, ax = plt.subplots(figsize=(10, 6))
  920.     ax.fill_between(activity_data.index, activity_data.values, color=color, alpha=0.6)
  921.     ax.plot(activity_data.index, activity_data.values, color=color, alpha=0.9)
  922.     ax.set_title(f"User Lifecycle: '{user}' sent Messages (by Minute into Stream)")
  923.     ax.grid(True, linestyle='--', alpha=0.5)
  924.  
  925.     # Annotate peak
  926.     add_peak_annotation(ax, activity_data, user, color=color)
  927.  
  928.     plt.tight_layout()
  929.  
  930.     config_plot_figure(fig, ax)
  931.  
  932.     return fig
  933.  
  934. # --- Image Combination ---
  935.  
  936. def create_title_banner(text: str, width: int, height: int = 80, bg_color=(30,30,30), text_color=(255,255,255)) -> Image.Image:
  937.     """Creates a banner image with centered section title text."""
  938.     img = Image.new("RGB", (width, height), color=bg_color)
  939.     draw = ImageDraw.Draw(img)
  940.     try:
  941.         font = ImageFont.truetype("arial.ttf", 36)
  942.     except IOError:
  943.         font = ImageFont.load_default()
  944.  
  945.     # Pillow >= 10 uses textbbox
  946.     try:
  947.         bbox = draw.textbbox((0, 0), text, font=font)
  948.         text_w, text_h = bbox[2] - bbox[0], bbox[3] - bbox[1]
  949.     except AttributeError:
  950.         text_w, text_h = draw.textsize(text, font=font)
  951.  
  952.     draw.text(((width - text_w) // 2, (height - text_h) // 2), text, fill=text_color, font=font)
  953.     return img
  954.  
  955. def combine_plots(items: List[Any], output_path: Path):
  956.     """Combines matplotlib figures and section banners into one image."""
  957.     if not items:
  958.         print("No figures to combine.")
  959.         return
  960.    
  961.     images = []
  962.     # Convert figures to images first, so we know max width
  963.     fig_images = []
  964.     for item in items:
  965.         if isinstance(item, Figure):
  966.             item.canvas.draw()
  967.             img = Image.fromarray(np.array(item.canvas.buffer_rgba())[:, :, :3])
  968.             fig_images.append(img)
  969.             plt.close(item)
  970.     max_width = max(img.width for img in fig_images) if fig_images else 1000
  971.  
  972.     # Now rebuild with banners at correct width and add figures back
  973.     item_iter = iter(items)
  974.     fig_iter = iter(fig_images)
  975.     for item in item_iter:
  976.         if isinstance(item, Figure):
  977.             images.append(next(fig_iter))
  978.         elif isinstance(item, tuple) and item[1] == "banner":
  979.             banner = create_title_banner(item[0], width=max_width, height=80)
  980.             images.append(banner)
  981.  
  982.     separator = 4
  983.     total_height = sum(img.height for img in images) + separator * (len(images) - 1)
  984.  
  985.     combined = Image.new("RGB", (max_width, total_height), color="white")
  986.     draw = ImageDraw.Draw(combined)
  987.     y_offset = 0
  988.     for i, img in enumerate(images):
  989.         combined.paste(img, (0, y_offset))
  990.         y_offset += img.height
  991.         if i < len(images) - 1:
  992.             draw.rectangle([0, y_offset, max_width, y_offset + separator - 1], fill=(0,0,0))
  993.             y_offset += separator
  994.  
  995.     output_path.parent.mkdir(parents=True, exist_ok=True)
  996.     combined.save(output_path)
  997.     print(f"\nCombined statistics image saved to: {output_path}")
  998.  
  999. # --- Main Execution ---
  1000.  
  1001. import os
  1002.  
  1003. def plt_main(figure, file_name):
  1004.     plt.show()
  1005.    
  1006.     PLT_DIR = OUTPUT_FOLDER / CSV_NAME
  1007.     os.makedirs(PLT_DIR, exist_ok=True)
  1008.     figure.savefig(PLT_DIR / str(file_name))
  1009.  
def main():
    """Run the full chat analysis pipeline and generate all plots.

    Loads the CSV at FILE_PATH, produces banner-separated plot sections
    (copypastas, emotes, chatters, stream-level, lifecycle "moments"),
    shows/saves each figure via plt_main, and finally stitches everything
    into one tall PNG with combine_plots.
    """
    if not FILE_PATH.exists():
        print(f"Error: Input file not found at {FILE_PATH}"); return

    df = load_and_clean_data(FILE_PATH)
    # Mixed list: Figure objects and ("title", "banner") tuples, in the
    # top-to-bottom order they will appear in the combined image.
    figures = []
    
    
    # --- Meta Plots ---
    figures.append(("Copypastas", "banner"))
    print("0. Top Emote Combos...")
    top_emote_combos = analyze_top_emote_combos(df)
    if top_emote_combos:
        figure = create_horizontal_bar_chart(
            top_emote_combos,
            "Top 10 Emote Combos (by Messages Reducing To Them)",
            'darkviolet',
            create_combo_image # Use the new combo image function here
        )
        figures.append(figure); plt_main(figure, 0)
    

    # --- Emote/Word Summary Plots ---
    figures.append(("Emotes", "banner"))
    print("--- Generating Emote/Word Summary Plots ---")
    print("1. Top Words (by Total Occurrences)...")
    top_words_freq = analyze_top_words_by_freq(df)
    if top_words_freq:
        figure = create_horizontal_bar_chart(top_words_freq, "Top 10 Used Emotes (by Total Occurrences)", 'lightgreen', load_emote_image)
        figures.append(figure); plt_main(figure, 1)

    print("2. Top Words (by Messages)...")
    top_popular_words = analyze_popular_words(df)
    if top_popular_words:
        figure = create_horizontal_bar_chart(top_popular_words, "Top 10 Popular Emotes (by Messages Containing Them)", 'skyblue', load_emote_image)
        figures.append(figure); plt_main(figure, 2)

    print("3. Top Words (by Presence)...")
    top_consistent_words = analyze_consistent_words(df)
    if top_consistent_words:
        figure = create_presence_bar_chart(top_consistent_words, "Top 10 Consistent Emotes (% of Minutes With At Least One)", 'mediumturquoise', load_emote_image)
        figures.append(figure); plt_main(figure, 3)

    print("4. Top Words (by Spread)...")
    top_widespread_words = analyze_emotes_by_users(df)
    if top_widespread_words:
        figure = create_horizontal_bar_chart(top_widespread_words, "Top 10 Widespread Emotes (by Unique Users Using Them)", "#6495ED", load_emote_image)
        figures.append(figure); plt_main(figure, 4)


    # --- User Summary Plots ---
    figures.append(("Chatters", "banner"))
    print("\n--- Generating User Summary Plots ---")
    print("5. Top Active Chatters...")
    top_10_chatters_df = analyze_user_activity(df)
    top_10_chatters_series = top_10_chatters_df['message_count']
    figure = create_horizontal_bar_chart(top_10_chatters_series, "Top 10 Active Chatters (Total Messages)", 'lightcoral', load_user_image)
    figures.append(figure); plt_main(figure, 5)

    print("6. Top Chatters (by Presence)...")
    top_consistent_users = analyze_consistent_users(df)
    if top_consistent_users:
        figure = create_presence_bar_chart(top_consistent_users, "Top 10 Consistent Chatters (% of Minutes Chatting)", 'plum', load_user_image)
        figures.append(figure); plt_main(figure, 6)

    print("7. Top Original Chatters...")
    top_original, unique_df = analyze_original_chatters(df)
    figure = create_horizontal_bar_chart(top_original, "Top 10 Original Chatters (Unique Messages)", 'goldenrod', load_user_image)
    figures.append(figure); plt_main(figure, 7)
    
    print("8. Most Mentioned Chatters...")
    most_mentioned = analyze_most_mentioned_chatters(df)
    if most_mentioned:
        figure = create_horizontal_bar_chart(
            most_mentioned,
            "Top 10 Mentioned Chatters (by @mentions)",
            'deepskyblue',
            load_user_image
        )
        figures.append(figure)
        plt_main(figure, 8)

    # --- General Summary Plots ---
    figures.append(("Stream", "banner"))
    print("\n--- Generating General Summary Plots ---")
    print("9. Number of Chatters (by Bins)...")
    user_msg_counts = df['user_name'].value_counts()
    figure = plot_user_bins(analyze_message_bins(user_msg_counts))
    figures.append(figure); plt_main(figure, 9)

    print("10. Number of Messages (Over Time)...")
    figure = plot_messages_over_time(df)
    figures.append(figure); plt_main(figure, 10)

    # --- DISABLED ---
    #print("10. First-Time Chatter Rate...")
    #figures.append(plot_first_time_chatters(analyze_first_time_chatters(df))); plt_main(plt, 11)
    
    # --- General Moments Plots ---
    figures.append(("Moments", "banner"))


    # --- Detailed Emote Lifecycle Plots ---
    print("\n--- Generating Emote Lifecycle Plots ---")
    
    #top_word_labels = set([w for w, _ in top_words_freq]) \
    #| set([w for w, _ in top_popular_words]) \
    #| set([w for w, _ in top_consistent_words])
    #print("top emote set:", len(top_word_labels))
    
    def find_emote(combo):
        # Return the first word of the combo that has a matching image file
        # in EMOTE_FOLDER; fall back to the whole combo string otherwise.
        # NOTE(review): re-lists EMOTE_FOLDER for every word — fine for a
        # handful of combos, slow for many.
        for _ in combo[0].split(" "):
            if _ in [p.stem for p in EMOTE_FOLDER.iterdir()]:
                return _

        return combo[0]

    # PICK MOMENTS from top emote lists
    def unique_append(l, x):
        # Append x to l only if absent; report whether an append happened.
        if x not in l:
            l.append(x)
            return True
        return False
    moments_suggestion = []
    # Define the sources as lists of lists
    # (ranked candidate pools; one pick from each, two from the combos pool).
    sources = [
        [_[0] for _ in top_words_freq],
        [_[0] for _ in top_popular_words],
        [_[0] for _ in top_consistent_words],
        [_[0] for _ in top_widespread_words],
        [find_emote(combo) for combo in top_emote_combos]  # flatten emote combos
    ]
    print("moments candidates:", sources)
    for idx, source in enumerate(sources):
        appended = 0
        for item in source:
            success = unique_append(moments_suggestion, item)
            if success:
                appended += 1
                # For all sources except the last one, stop after the first append
                if idx < len(sources) - 1 and appended == 1:
                    break
                # For the last source, stop after 2 appends
                if idx == len(sources) - 1 and appended == 2:
                    break
    # NOTE(review): this rebuild reproduces the list unchanged (slice + the
    # same last two elements) — effectively a no-op kept as-is.
    moments_suggestion = [_ for _ in moments_suggestion[:-2]] + [moments_suggestion[-2]] + [moments_suggestion[-1]]
    print("moments:", moments_suggestion)
    
    # Default to showing lifecycle for top 10 most frequent words if LIFECYCLE_EMOTES is empty
    # (empty list -> auto suggestions; None -> lifecycle plots disabled).
    emotes_to_plot = LIFECYCLE_EMOTES if LIFECYCLE_EMOTES else list(moments_suggestion) if LIFECYCLE_EMOTES is not None else []
    for idx, emote in enumerate(emotes_to_plot):
        print(f"  - Plotting lifecycle for '{emote}'...")
        emote_data = analyze_emote_lifecycle(df, emote)
        if not emote_data.empty and emote_data.sum() > 0:
            figure = plot_emote_lifecycle(emote_data, emote)
            figures.append(figure); plt_main(figure, 100+idx)
        else:
            print(f"    (No usage data for '{emote}')")


    # --- Detailed User Lifecycle Plots ---
    print("\n--- Generating User Lifecycle Plots ---")
    # Default to showing lifecycle for top 10 most active users if LIFECYCLE_USERS is empty
    # (empty list -> top chatters; None -> user lifecycle plots disabled).
    top_chatter_labels = top_10_chatters_df.index.tolist()
    users_to_plot = LIFECYCLE_USERS if LIFECYCLE_USERS else (top_chatter_labels if LIFECYCLE_USERS is not None else [])
    for idx, user in enumerate(users_to_plot):
        print(f"  - Plotting lifecycle for '{user}'...")
        user_data = analyze_user_lifecycle(df, user)
        if not user_data.empty and user_data.sum() > 0:
            figure = plot_user_lifecycle(user_data, user)
            figures.append(figure); plt_main(figure, 200+idx)
        else:
            print(f"    (No activity data for '{user}')")
            
    # --- Combine All Plots ---
    base_filename = FILE_PATH.stem
    output_file = OUTPUT_FOLDER / f"{base_filename}_statistics_full.png"
    combine_plots(figures, output_file)
  1190.  
# Script entry point: run the full analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
  1193.  
  1194.  
# Pastebin page footer (scraping residue, not part of the script):
# Tags: forsen
# Advertisement / Add Comment / Please, Sign In to add comment