PayPal Activity CSV to GnuCash-ready splits (GBP, USD, FX reference)

#!/usr/bin/env python3
r"""
Split a PayPal "Activity Download" CSV into:
  1) GBP purchases/refunds (excluding funding and FX rows)
  2) USD purchases/refunds (excluding FX rows)
  3) FX reference table (GBP->USD conversions for USD purchases)

Works with the PayPal CSV format that has columns like:
Date, Time, Time Zone, Description, Currency, Gross, Fee, Net, Balance, Transaction ID, Reference Txn ID, Name, ...

Usage (Windows):
  py -3 paypal_split_by_currency.py "C:\path\PayPal.CSV"

Outputs files next to the input CSV by default.
"""
import argparse
import csv
from collections import Counter
from pathlib import Path

EXCLUDE_GBP_DESCRIPTIONS = {
    "Bank Deposit to PP Account",
    "User Initiated Withdrawal",
    "General Currency Conversion",
}

EXCLUDE_USD_DESCRIPTIONS = {
    "General Currency Conversion",
}

USD_PAYMENT_DESCRIPTIONS = {
    "Pre-approved Payment Bill User Payment",
    "Express Checkout Payment",
    "General Payment",
}

def parse_float(s: str) -> float:
    s = (s or "").strip()
    if s == "":
        return 0.0
    return float(s)

def read_rows(csv_path: Path):
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        if not rows:
            raise SystemExit("CSV appears to be empty.")
        return reader.fieldnames, rows

def write_rows(out_path: Path, fieldnames, rows):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("csv", type=Path, help="PayPal activity CSV file")
    ap.add_argument("--outdir", type=Path, default=None, help="Output directory (default: same folder as input)")
    ap.add_argument("--prefix", type=str, default=None, help="Filename prefix (default: input filename stem)")
    args = ap.parse_args()

    csv_path: Path = args.csv
    outdir: Path = args.outdir if args.outdir else csv_path.parent
    prefix: str = args.prefix if args.prefix else csv_path.stem

    fieldnames, rows = read_rows(csv_path)

    # Basic sanity check for columns we rely on
    required = {"Currency", "Description", "Net", "Transaction ID", "Reference Txn ID", "Date", "Time"}
    missing_cols = required - set(fieldnames or [])
    if missing_cols:
        raise SystemExit(f"Missing expected columns: {', '.join(sorted(missing_cols))}")

    # Split for import
    gbp_rows = [
        r for r in rows
        if (r.get("Currency") == "GBP") and (r.get("Description") not in EXCLUDE_GBP_DESCRIPTIONS)
    ]
    usd_rows = [
        r for r in rows
        if (r.get("Currency") == "USD") and (r.get("Description") not in EXCLUDE_USD_DESCRIPTIONS)
    ]

    # Build quick lookup for FX components keyed by purchase transaction id
    # For a USD purchase with Transaction ID = PID, PayPal records:
    #   GBP "Bank Deposit to PP Account"      Reference Txn ID == PID
    #   GBP "General Currency Conversion"     Reference Txn ID == PID (Net negative)
    #   USD "General Currency Conversion"     Reference Txn ID == PID (Net positive)
    fx_lookup = {}
    for r in rows:
        ref = (r.get("Reference Txn ID") or "").strip()
        if not ref:
            continue
        fx_lookup.setdefault(ref, []).append(r)

    fx_ref_rows = []
    fx_missing = 0
    for r in rows:
        if r.get("Currency") != "USD":
            continue
        if r.get("Description") not in USD_PAYMENT_DESCRIPTIONS:
            continue

        pid = (r.get("Transaction ID") or "").strip()
        if not pid:
            continue

        bundle = fx_lookup.get(pid, [])
        gbp_deposit = next((x for x in bundle if x.get("Currency") == "GBP" and x.get("Description") == "Bank Deposit to PP Account"), None)
        gbp_conv    = next((x for x in bundle if x.get("Currency") == "GBP" and x.get("Description") == "General Currency Conversion"), None)
        usd_conv    = next((x for x in bundle if x.get("Currency") == "USD" and x.get("Description") == "General Currency Conversion"), None)

        if not (gbp_deposit and gbp_conv and usd_conv):
            fx_missing += 1
            continue

        gbp_topup = parse_float(gbp_deposit.get("Net"))
        gbp_converted = abs(parse_float(gbp_conv.get("Net")))
        usd_received = parse_float(usd_conv.get("Net"))
        usd_spent = abs(parse_float(r.get("Net")))
        rate = (usd_received / gbp_converted) if gbp_converted else 0.0

        merchant = (r.get("Name") or "").strip() or "(blank name)"
        fx_ref_rows.append({
            "Date": r.get("Date"),
            "Time": r.get("Time"),
            "Merchant": merchant,
            "GBP topup (bank->PayPal GBP)": f"{gbp_topup:.2f}",
            "GBP converted (PayPal GBP->USD)": f"{gbp_converted:.2f}",
            "USD received (PayPal USD)": f"{usd_received:.2f}",
            "USD spent at merchant": f"{usd_spent:.2f}",
            "Implied rate (USD per GBP)": f"{rate:.6f}",
            "Purchase TxID": pid,
        })

    # Output paths
    gbp_out = outdir / f"{prefix}_GBP_purchases.csv"
    usd_out = outdir / f"{prefix}_USD_purchases.csv"
    fx_out  = outdir / f"{prefix}_FX_reference.csv"

    write_rows(gbp_out, fieldnames, gbp_rows)
    write_rows(usd_out, fieldnames, usd_rows)

    fx_fields = ["Date","Time","Merchant","GBP topup (bank->PayPal GBP)","GBP converted (PayPal GBP->USD)","USD received (PayPal USD)","USD spent at merchant","Implied rate (USD per GBP)","Purchase TxID"]
    write_rows(fx_out, fx_fields, fx_ref_rows)

    # Summary
    ccy_counts = Counter(r.get("Currency") for r in rows)
    desc_counts = Counter(r.get("Description") for r in rows)
    print("Input:", csv_path)
    print("Rows:", len(rows), "| Currencies:", dict(ccy_counts))
    print("Wrote:")
    print("  ", gbp_out, f"({len(gbp_rows)} rows)")
    print("  ", usd_out, f"({len(usd_rows)} rows)")
    print("  ", fx_out,  f"({len(fx_ref_rows)} FX rows; {fx_missing} missing bundles skipped)")
    print("Top descriptions:", ", ".join([f"{k}={v}" for k,v in desc_counts.most_common(6)]))

if __name__ == "__main__":
    main()