# verifyOverlap: sample-based check for identical rows shared by the train and test CSV files.

import pandas as pd
from pathlib import Path

# Directories
train_dir = Path("/ibex/user/harbihh/r6.2")
test_dir = Path("/ibex/user/harbihh/split/r6.2")

# Files to check
files = ["http.csv", "email.csv", "file.csv", "logon.csv", "device.csv", "psychometric.csv"]

# Number of leading rows sampled from each file.
SAMPLE_ROWS = 1000


def find_overlap(df_train, df_test):
    """Return the distinct rows that occur in both frames.

    Rows are compared on the columns the two frames have in common. Each
    side is de-duplicated before merging: a plain inner merge pairs every
    duplicate on the left with every duplicate on the right, which inflates
    the number of "matching" rows.

    Args:
        df_train: Frame holding the train-side rows.
        df_test: Frame holding the test-side rows.

    Returns:
        A DataFrame with one row per distinct shared row. It is empty when
        nothing matches or when the frames share no column (a bare
        ``pd.merge`` would raise ``MergeError`` in the latter case).
    """
    shared = [col for col in df_train.columns if col in df_test.columns]
    if not shared:
        return pd.DataFrame()

    left = df_train[shared].drop_duplicates()
    right = df_test[shared].drop_duplicates()
    return pd.merge(left, right, how="inner", on=shared)


def check_file(name, train_root, test_root, nrows=SAMPLE_ROWS):
    """Compare the first ``nrows`` rows of ``name`` in both directories.

    Only the head of each file is read, so "no overlap" means no overlap
    between the two samples, not between the full files.

    Args:
        name: CSV file name expected to exist under both directories.
        train_root: Directory containing the train copy.
        test_root: Directory containing the test copy.
        nrows: Number of leading rows to read from each file.

    Returns:
        The number of distinct rows found in both samples, or ``None`` when
        the comparison was skipped because a file is missing.
    """
    print(f"\nChecking {name}...")
    train_file = Path(train_root) / name
    test_file = Path(test_root) / name

    try:
        # Read every column as text. Type inference on a small sample can
        # give the same column different dtypes in the two files (e.g.
        # int64 vs object), which makes pd.merge raise instead of compare.
        df_train = pd.read_csv(train_file, nrows=nrows, dtype=str)
        df_test = pd.read_csv(test_file, nrows=nrows, dtype=str)
    except FileNotFoundError as err:
        # Report and move on so the remaining files are still checked.
        print(f"SKIPPED: {err}")
        return None

    # Check for any exact row overlap
    overlap = find_overlap(df_train, df_test)

    if not overlap.empty:
        print(f"OVERLAP FOUND: {len(overlap)} rows match between train and test.")
    else:
        print("No overlap in sample.")
    return len(overlap)


def main():
    """Run the sample overlap check for every file listed in ``files``."""
    for name in files:
        check_file(name, train_dir, test_dir)


if __name__ == "__main__":
    main()