Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Spot-check the train and test splits for rows that appear in both.

For every file listed in ``files``, the first ``SAMPLE_ROWS`` rows of the
copy in ``train_dir`` are compared with the first ``SAMPLE_ROWS`` rows of the
copy in ``test_dir``, and the number of distinct rows found in both samples
is printed. Only the sampled prefixes are compared, so "No overlap in
sample." does not prove that the full files are disjoint.
"""
from pathlib import Path

import pandas as pd

# Directories
train_dir = Path("/ibex/user/harbihh/r6.2")
test_dir = Path("/ibex/user/harbihh/split/r6.2")

# Files to check
files = [
    "http.csv",
    "email.csv",
    "file.csv",
    "logon.csv",
    "device.csv",
    "psychometric.csv",
]

# Number of leading rows read from each file.
SAMPLE_ROWS = 1000


def find_overlap(df_train, df_test):
    """Return the distinct rows that occur in both frames.

    Rows are compared on the columns the two frames have in common.

    Args:
        df_train: Sample of the training data.
        df_test: Sample of the test data.

    Returns:
        A DataFrame with one row per distinct matching row, restricted to
        the shared columns. It is empty when nothing matches or when the
        frames share no columns.
    """
    shared = [col for col in df_train.columns if col in df_test.columns]
    if not shared:
        # pd.merge raises MergeError without a common column; with nothing
        # to compare on there is no overlap to report.
        return pd.DataFrame()

    # An inner merge is many-to-many: a row repeated a times on the left and
    # b times on the right would yield a * b result rows. De-duplicating both
    # sides first makes len(result) the number of distinct shared rows.
    left = df_train[shared].drop_duplicates()
    right = df_test[shared].drop_duplicates()
    return pd.merge(left, right, how="inner", on=shared)


def main():
    """Print, for each file in ``files``, whether the two samples overlap."""
    for name in files:
        print(f"\nChecking {name}...")
        train_file = train_dir / name
        test_file = test_dir / name

        try:
            # Read just SAMPLE_ROWS sample rows from each file. dtype=str
            # keeps every column textual in both samples; with independent
            # type inference a column that is empty in one sample and text
            # in the other gets different dtypes and the merge raises.
            df_train = pd.read_csv(train_file, nrows=SAMPLE_ROWS, dtype=str)
            df_test = pd.read_csv(test_file, nrows=SAMPLE_ROWS, dtype=str)
        except (FileNotFoundError, pd.errors.EmptyDataError) as err:
            # One missing or empty file should not abort the other checks.
            print(f"Skipped: {err}")
            continue

        # Check for any exact row overlap
        overlap = find_overlap(df_train, df_test)
        if overlap.empty:
            print("No overlap in sample.")
        else:
            print(
                f"OVERLAP FOUND: {len(overlap)} rows match "
                "between train and test."
            )


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment