madoka_han

largeFileSplit

Jul 28th, 2025
175
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.97 KB | None | 0 0
  1. import sys  # ./scripts/split_large_csv.py
  2. import random
  3. from pathlib import Path
  4.  
  5. def count_lines(file_path):
  6.     with open(file_path, 'r') as f:
  7.         for i, _ in enumerate(f):
  8.             pass
  9.     return i  # zero-indexed
  10.  
  11. def split_large_csv(input_file, output_test_file, test_frac=0.2, seed=42):
  12.     input_file = Path(input_file)
  13.     output_test_file = Path(output_test_file)
  14.     output_train_file = input_file  # overwrite the input with the 80%
  15.  
  16.     print(f"Counting total rows in {input_file}...")
  17.     total_rows = count_lines(input_file)  # includes header
  18.     print(f"Total lines (including header): {total_rows + 1}")
  19.  
  20.     num_data_rows = total_rows  # exclude header
  21.     num_test = int(num_data_rows * test_frac)
  22.  
  23.     print(f"Sampling {num_test} rows for test set...")
  24.     random.seed(seed)
  25.     test_indices = set(random.sample(range(num_data_rows), num_test))
  26.  
  27.     print(f"Splitting file into train and test...")
  28.     with input_file.open("r") as f_in, \
  29.          output_train_file.with_suffix(".tmp").open("w") as f_train, \
  30.          output_test_file.open("w") as f_test:
  31.  
  32.         header = f_in.readline()
  33.         f_train.write(header)
  34.         f_test.write(header)
  35.  
  36.         for idx, line in enumerate(f_in):
  37.             if idx in test_indices:
  38.                 f_test.write(line)
  39.             else:
  40.                 f_train.write(line)
  41.  
  42.     print(f"Replacing original file with train split...")
  43.     output_train_file.with_suffix(".tmp").replace(output_train_file)
  44.  
  45.     print("Done.")
  46.  
  47.  
  48. if __name__ == "__main__":
  49.     if len(sys.argv) != 3:
  50.         print("Usage: python split_large_csv.py <input_csv> <output_test_csv>")
  51.         sys.exit(1)
  52.  
  53.     input_csv = sys.argv[1]
  54.     output_test_csv = sys.argv[2]
  55.     split_large_csv(input_csv, output_test_csv)
  56.  
Advertisement
Add Comment
Please, Sign In to add comment