# Rockland County Analysis Script

import pandas as pd
import numpy as np
import io

def create_df_from_text(csv_text):
    """Parse a string holding CSV content and return it as a pandas DataFrame."""
    # Wrap the text in an in-memory file object so read_csv can consume it.
    text_buffer = io.StringIO(csv_text)
    frame = pd.read_csv(text_buffer)
    return frame

def process_election_data(df, race_name):
    """
    Reduce raw election results to one row per reporting unit with major-party totals.

    Processing steps:
    - Drops rows whose 'Reporting Unit' contains 'United States Senator'
      (case-insensitive); these are treated as summary rows.
    - Drops write-in and minor-party candidate rows (hard-coded name list).
    - Sums each remaining candidate's votes across all of their party lines
      within each reporting unit.
    - Picks the Republican / Democratic candidate as the first candidate found
      on a row whose 'Party' is exactly 'Republican' / 'Democratic'.

    Args:
        df: DataFrame with 'Reporting Unit', 'Candidate', 'Party' and 'Votes'
            columns.
        race_name: Label used as the suffix of the output vote columns and in
            the error message.

    Returns:
        A DataFrame with columns 'Reporting Unit', 'Rep_Votes_<race_name>' and
        'Dem_Votes_<race_name>', or None (after printing an error) when no
        Republican or no Democratic candidate row is present.
    """
    # Filter out summary rows like 'United States Senator'
    is_summary_row = df['Reporting Unit'].str.contains(
        'United States Senator', case=False, na=False
    )
    df = df[~is_summary_row]

    # Filter out write-in and minor-party candidates; only the two major
    # candidates are reported below.
    minor_candidates = [
        'Adjudicated Write-Ins',
        'Chase Oliver',
        'Claudia De La Cruz',
        'Cornel West',
        'Jill Stein',
        'Peter Sonski',
    ]
    df = df[~df['Candidate'].isin(minor_candidates)]

    # Aggregate votes for each candidate within each reporting unit. The same
    # candidate can appear on several party lines, so the lines are summed.
    candidate_totals = df.groupby(['Reporting Unit', 'Candidate'])['Votes'].sum()

    # Dynamically identify the main party candidates
    rep_names = df.loc[df['Party'] == 'Republican', 'Candidate'].unique()
    dem_names = df.loc[df['Party'] == 'Democratic', 'Candidate'].unique()
    if len(rep_names) == 0 or len(dem_names) == 0:
        print(f"Error: Could not identify a unique Republican or Democratic candidate for {race_name}.")
        return None
    rep_candidate_name = rep_names[0]
    dem_candidate_name = dem_names[0]

    # Turn candidates into columns. Unstacking the already-summed totals with
    # fill_value=0 avoids pivot_table's default 'mean' aggregation and keeps
    # integer vote counts as integers when a unit lacks a candidate row.
    pivoted = candidate_totals.unstack(fill_value=0).reset_index()

    # Ensure the major candidate columns exist
    if rep_candidate_name not in pivoted.columns:
        pivoted[rep_candidate_name] = 0
    if dem_candidate_name not in pivoted.columns:
        pivoted[dem_candidate_name] = 0

    # Work on an explicit copy so renaming the columns never touches `pivoted`.
    processed_df = pivoted[['Reporting Unit', rep_candidate_name, dem_candidate_name]].copy()
    processed_df.columns = ['Reporting Unit', f'Rep_Votes_{race_name}', f'Dem_Votes_{race_name}']
    return processed_df

def _safe_percentage(numerator, denominator):
    """Return 100 * numerator / denominator as a float array, with 0.0 where the denominator is 0."""
    num = np.asarray(numerator, dtype=float)
    den = np.asarray(denominator, dtype=float)
    ratio = np.zeros_like(num)
    # `where` skips zero denominators, leaving the pre-filled 0.0 in place.
    np.divide(num, den, out=ratio, where=den != 0)
    return ratio * 100


def _print_banner(title, subtitle=None):
    """Print a section header framed by two 80-character '=' rules."""
    print("\n" + "="*80)
    print(title)
    if subtitle is not None:
        print(subtitle)
    print("="*80)


def run_anomaly_analysis(pres_df, sen_df):
    """
    Performs a series of tests to find implausible Republican overperformance.

    Both inputs are reduced with process_election_data, inner-joined on
    'Reporting Unit', and restricted to units with at least 20 two-party
    presidential votes. Three reports are then printed to stdout:

    1. The 15 units with the highest Republican presidential overperformance,
       (P_R - S_R) / P_R, as a percentage.
    2. The 15 units with the largest swing between the presidential and
       senate Republican margins, in percentage points.
    3. Every unit where the Republican presidential share of the two-party
       vote exceeds 95%.

    Args:
        pres_df: Raw presidential results DataFrame.
        sen_df: Raw senate results DataFrame.

    Returns:
        None. Returns early, printing no reports, if either race could not be
        processed.
    """
    # --- 1. Data Preparation ---
    pres_data = process_election_data(pres_df, 'President')
    sen_data = process_election_data(sen_df, 'Senator')

    if pres_data is None or sen_data is None:
        return

    results = pd.merge(pres_data, sen_data, on='Reporting Unit')

    # Filter out precincts with very low total vote counts to avoid noise.
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise SettingWithCopyWarning.
    two_party_pres_votes = results['Rep_Votes_President'] + results['Dem_Votes_President']
    results = results[two_party_pres_votes >= 20].copy()

    # --- 2. Calculate Core Metrics ---
    P_R = results['Rep_Votes_President']
    P_D = results['Dem_Votes_President']
    S_R = results['Rep_Votes_Senator']
    S_D = results['Dem_Votes_Senator']

    # Republican Overperformance (the primary metric of interest)
    results['Rep_Overperformance_Votes'] = P_R - S_R
    results['Rep_Overperformance_Pct'] = _safe_percentage(P_R - S_R, P_R)

    # Margin calculations for the "Flip" test
    total_pres = P_R + P_D
    total_sen = S_R + S_D
    results['Pres_Margin_Rep_Pct'] = _safe_percentage(P_R - P_D, total_pres)
    results['Sen_Margin_Rep_Pct'] = _safe_percentage(S_R - S_D, total_sen)
    results['Margin_Swing_pp'] = results['Pres_Margin_Rep_Pct'] - results['Sen_Margin_Rep_Pct']

    # Partisan Skew test
    results['Pres_Rep_Share'] = _safe_percentage(P_R, total_pres)

    # --- 3. Display Results ---
    pd.set_option('display.width', 1000)

    _print_banner(
        "Test 1: Top 15 Precincts with the Highest Republican Presidential Overperformance",
        "This measures the % of Republican presidential voters who did not vote for the Rep. senator.",
    )
    overperformers = results.sort_values(by='Rep_Overperformance_Pct', ascending=False).head(15)
    print(overperformers[[
        'Reporting Unit', 'Rep_Votes_President', 'Dem_Votes_President', 'Rep_Votes_Senator', 'Dem_Votes_Senator', 'Rep_Overperformance_Votes', 'Rep_Overperformance_Pct'
    ]].to_string(index=False, formatters={
        'Rep_Overperformance_Pct': '{:,.1f}%'.format
    }))

    _print_banner(
        "Test 2: Top 15 Precincts with the Largest Vote Margin 'Flips'",
        "This measures the swing in the Republican margin between the Pres. and Sen. races.",
    )
    flippers = results.sort_values(by='Margin_Swing_pp', ascending=False).head(15)
    print(flippers[[
        'Reporting Unit', 'Pres_Margin_Rep_Pct', 'Sen_Margin_Rep_Pct', 'Margin_Swing_pp'
    ]].to_string(index=False, formatters={
        'Pres_Margin_Rep_Pct': '{:+.1f}%'.format,
        'Sen_Margin_Rep_Pct': '{:+.1f}%'.format,
        'Margin_Swing_pp': '{:,.1f} pp'.format,
    }))

    _print_banner(
        "Test 3: Precincts with Extreme Partisan Skew (>95% for Republican Pres. Candidate)"
    )
    skewed = results[results['Pres_Rep_Share'] > 95].sort_values(by='Pres_Rep_Share', ascending=False)
    print(skewed[[
        'Reporting Unit', 'Rep_Votes_President', 'Dem_Votes_President', 'Pres_Rep_Share'
    ]].to_string(index=False, formatters={'Pres_Rep_Share': '{:,.1f}%'.format}))


if __name__ == '__main__':
    # --- Data Provided by User ---
    # Locations of the two raw result files, relative to the working directory.
    president_results_file = "raw_results/election_results_president.csv"
    senate_results_file = "raw_results/election_results_senator.csv"

    # Read both files fully (president first, then senator) before parsing.
    raw_texts = []
    for results_path in (president_results_file, senate_results_file):
        with open(results_path, "r") as f:
            raw_texts.append(f.read())
    presidential_text, senate_text = raw_texts

    # Build one DataFrame per race from the whitespace-trimmed CSV text.
    pres_df = create_df_from_text(presidential_text.strip())
    sen_df = create_df_from_text(senate_text.strip())

    # Print the three anomaly reports.
    run_anomaly_analysis(pres_df, sen_df)