Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import io
def create_df_from_text(csv_text):
    """Parse a string of CSV-formatted text into a pandas DataFrame.

    The text is wrapped in an in-memory text buffer so that ``pd.read_csv``
    can consume it the same way it would consume an open file.
    """
    text_buffer = io.StringIO(csv_text)
    frame = pd.read_csv(text_buffer)
    return frame
def process_election_data(df, race_name):
    """Reduce raw election results to major-party vote totals per reporting unit.

    Steps:
      1. Drop rows whose 'Reporting Unit' contains 'United States Senator'
         (case-insensitive).
      2. Drop rows belonging to a fixed list of minor / write-in candidates.
      3. Identify the candidates listed under the 'Republican' and
         'Democratic' party labels.
      4. Sum 'Votes' per ('Reporting Unit', 'Candidate') -- this merges the
         votes a candidate received on several party lines -- and pivot so
         that each reporting unit is a single row.

    Args:
        df: DataFrame with 'Reporting Unit', 'Candidate', 'Party' and 'Votes'
            columns. It is not modified.
        race_name: Label appended to the output column names.

    Returns:
        A DataFrame with columns 'Reporting Unit', 'Rep_Votes_<race_name>' and
        'Dem_Votes_<race_name>', or None (after printing an error) when no
        candidate is listed under one of the two major party labels.
    """
    # Minor-party and write-in candidates that are excluded up front.
    excluded_candidates = [
        'Adjudicated Write-Ins',
        'Chase Oliver',
        'Claudia De La Cruz',
        'Cornel West',
        'Jill Stein',
        'Peter Sonski',
    ]

    # Filter out summary rows like 'United States Senator'
    is_summary_row = df['Reporting Unit'].str.contains(
        'United States Senator', case=False, na=False
    )
    df = df[~is_summary_row]
    df = df[~df['Candidate'].isin(excluded_candidates)]

    # Identify the main party candidates. If several candidates carry the same
    # party label, take the one with the most votes on that party line rather
    # than whichever row happens to come first; with a single candidate this is
    # the same answer. sort=False keeps first-appearance order, so ties resolve
    # to the earliest candidate in the data.
    major_candidates = {}
    for party in ('Republican', 'Democratic'):
        party_line_votes = (
            df[df['Party'] == party]
            .groupby('Candidate', sort=False)['Votes']
            .sum()
        )
        if party_line_votes.empty:
            print(f"Error: Could not identify a unique Republican or Democratic candidate for {race_name}.")
            return None
        major_candidates[party] = party_line_votes.idxmax()
    rep_candidate_name = major_candidates['Republican']
    dem_candidate_name = major_candidates['Democratic']

    # One row per reporting unit, one column per candidate. aggfunc='sum' is
    # what merges a candidate's votes across party lines; the default ('mean')
    # would average them instead.
    pivoted = df.pivot_table(
        index='Reporting Unit',
        columns='Candidate',
        values='Votes',
        aggfunc='sum',
        fill_value=0,
    ).reset_index()

    # Ensure the major candidate columns exist
    for candidate_name in (rep_candidate_name, dem_candidate_name):
        if candidate_name not in pivoted.columns:
            pivoted[candidate_name] = 0

    processed_df = pivoted[['Reporting Unit', rep_candidate_name, dem_candidate_name]].copy()
    processed_df.columns = ['Reporting Unit', f'Rep_Votes_{race_name}', f'Dem_Votes_{race_name}']
    return processed_df
def _safe_pct(numerator, denominator):
    """Return 100 * numerator / denominator as a float array, 0.0 where the denominator is 0."""
    num = np.asarray(numerator, dtype=float)
    den = np.asarray(denominator, dtype=float)
    quotient = np.zeros_like(num)
    # `where` skips the zero-denominator slots, which keep their initial 0.0.
    np.divide(num, den, out=quotient, where=den != 0)
    return quotient * 100


def _print_report(heading_lines, table, formatters):
    """Print a ruled heading followed by `table` rendered without its index."""
    rule = "=" * 80
    print("\n" + rule)
    for heading in heading_lines:
        print(heading)
    print(rule)
    print(table.to_string(index=False, formatters=formatters))


def run_anomaly_analysis(pres_df, sen_df, *, min_total_votes=20, top_n=15, skew_threshold=95):
    """Print three reports comparing Republican results in the two races.

    Both inputs are reduced with ``process_election_data`` and inner-joined on
    'Reporting Unit', so a unit present in only one race is dropped. Units
    whose combined major-party presidential vote is below `min_total_votes`
    are dropped as well.

    Reports printed:
      1. The `top_n` units with the highest share of Republican presidential
         votes not matched by Republican senate votes.
      2. The `top_n` units with the largest difference between the Republican
         margin in the presidential race and in the senate race.
      3. Every unit where the Republican share of the two-party presidential
         vote exceeds `skew_threshold` percent.

    Args:
        pres_df: Raw presidential results, as accepted by ``process_election_data``.
        sen_df: Raw senate results, as accepted by ``process_election_data``.
        min_total_votes: Minimum combined major-party presidential votes for a
            unit to be analysed.
        top_n: Number of rows shown in reports 1 and 2.
        skew_threshold: Republican presidential share (in percent) above which
            a unit appears in report 3.

    Returns:
        None. If either input cannot be processed the function returns without
        printing any report. Also sets the pandas option 'display.width' to 1000.
    """
    # --- 1. Data Preparation ---
    pres_data = process_election_data(pres_df, 'President')
    sen_data = process_election_data(sen_df, 'Senator')
    if pres_data is None or sen_data is None:
        return

    merged = pd.merge(pres_data, sen_data, on='Reporting Unit')
    # Filter out precincts with very low total vote counts to avoid noise.
    has_enough_votes = (
        merged['Rep_Votes_President'] + merged['Dem_Votes_President']
    ) >= min_total_votes
    # Explicit copy: columns are added below, and assigning onto a filtered
    # slice can trigger pandas' SettingWithCopyWarning.
    results = merged[has_enough_votes].copy()

    # --- 2. Calculate Core Metrics ---
    P_R = results['Rep_Votes_President']
    P_D = results['Dem_Votes_President']
    S_R = results['Rep_Votes_Senator']
    S_D = results['Dem_Votes_Senator']

    # Republican Overperformance (the primary metric of interest)
    results['Rep_Overperformance_Votes'] = P_R - S_R
    results['Rep_Overperformance_Pct'] = _safe_pct(P_R - S_R, P_R)

    # Margin calculations for the "Flip" test
    total_pres = P_R + P_D
    total_sen = S_R + S_D
    results['Pres_Margin_Rep_Pct'] = _safe_pct(P_R - P_D, total_pres)
    results['Sen_Margin_Rep_Pct'] = _safe_pct(S_R - S_D, total_sen)
    results['Margin_Swing_pp'] = results['Pres_Margin_Rep_Pct'] - results['Sen_Margin_Rep_Pct']

    # Partisan Skew test
    results['Pres_Rep_Share'] = _safe_pct(P_R, total_pres)

    # --- 3. Display Results ---
    pd.set_option('display.width', 1000)

    overperformers = results.sort_values(by='Rep_Overperformance_Pct', ascending=False).head(top_n)
    _print_report(
        [
            f"Test 1: Top {top_n} Precincts with the Highest Republican Presidential Overperformance",
            "This measures the % of Republican presidential voters who did not vote for the Rep. senator.",
        ],
        overperformers[[
            'Reporting Unit', 'Rep_Votes_President', 'Dem_Votes_President',
            'Rep_Votes_Senator', 'Dem_Votes_Senator',
            'Rep_Overperformance_Votes', 'Rep_Overperformance_Pct',
        ]],
        {'Rep_Overperformance_Pct': '{:,.1f}%'.format},
    )

    flippers = results.sort_values(by='Margin_Swing_pp', ascending=False).head(top_n)
    _print_report(
        [
            f"Test 2: Top {top_n} Precincts with the Largest Vote Margin 'Flips'",
            "This measures the swing in the Republican margin between the Pres. and Sen. races.",
        ],
        flippers[['Reporting Unit', 'Pres_Margin_Rep_Pct', 'Sen_Margin_Rep_Pct', 'Margin_Swing_pp']],
        {
            'Pres_Margin_Rep_Pct': '{:+.1f}%'.format,
            'Sen_Margin_Rep_Pct': '{:+.1f}%'.format,
            'Margin_Swing_pp': '{:,.1f} pp'.format,
        },
    )

    skewed = results[results['Pres_Rep_Share'] > skew_threshold].sort_values(
        by='Pres_Rep_Share', ascending=False
    )
    _print_report(
        [f"Test 3: Precincts with Extreme Partisan Skew (>{skew_threshold}% for Republican Pres. Candidate)"],
        skewed[['Reporting Unit', 'Rep_Votes_President', 'Dem_Votes_President', 'Pres_Rep_Share']],
        {'Pres_Rep_Share': '{:,.1f}%'.format},
    )
if __name__ == '__main__':
    # Raw CSV exports supplied by the user: presidential race first, senate second.
    csv_paths = (
        "raw_results/election_results_president.csv",
        "raw_results/election_results_senator.csv",
    )

    # Read both files completely before any parsing takes place.
    raw_texts = []
    for csv_path in csv_paths:
        with open(csv_path, "r") as handle:
            raw_texts.append(handle.read())

    # Surrounding whitespace is stripped before each text is parsed.
    presidential_frame, senate_frame = [
        create_df_from_text(text.strip()) for text in raw_texts
    ]

    # Print the anomaly reports.
    run_anomaly_analysis(presidential_frame, senate_frame)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement