madoka_han

verifyOverlap

Jul 28th, 2025
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.79 KB | None | 0 0
  1. import pandas as pd
  2. from pathlib import Path
  3.  
  4. # Directories
  5. train_dir = Path("/ibex/user/harbihh/r6.2")
  6. test_dir = Path("/ibex/user/harbihh/split/r6.2")
  7.  
  8. # Files to check
  9. files = ["http.csv", "email.csv", "file.csv", "logon.csv", "device.csv", "psychometric.csv"]
  10.  
  11. for name in files:
  12.     print(f"\nChecking {name}...")
  13.     train_file = train_dir / name
  14.     test_file = test_dir / name
  15.  
  16.     # Read just 1000 sample rows from each file
  17.     df_train = pd.read_csv(train_file, nrows=1000)
  18.     df_test = pd.read_csv(test_file, nrows=1000)
  19.  
  20.     # Check for any exact row overlap
  21.     overlap = pd.merge(df_train, df_test, how='inner')
  22.  
  23.     if not overlap.empty:
  24.         print(f"OVERLAP FOUND: {len(overlap)} rows match between train and test.")
  25.     else:
  26.         print("No overlap in sample.")
  27.  
Advertisement
Add Comment
Please, Sign In to add comment