Advertisement
lamiastella

stratified

Apr 16th, 2019
219
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.32 KB | None | 0 0
  1. [jalal@scc1 src]$ cat create_5fold_csv.py
  2. #creating stratified 5-fold CSV files from the original CSV file
  3.  
  4. __author__ = "Sha Lai"
  5. __email__ = "lais823@bu.edu"
  6. #code editted by Mona Jalal
  7.  
  8. import pandas as pd
  9. import os
  10. from sklearn.model_selection import StratifiedKFold
  11. from random import shuffle
  12.  
  13. PATH_TO_DATASET = os.path.join("..", "dataset")
  14. df = pd.read_csv(os.path.join(PATH_TO_DATASET, "EMNLP_master.csv"))
  15.  
  16. #uncomment if you need to create 5fold only for V3relevance == 1
  17. indices = [i for i in range(df.shape[0])  if df["V3relevance"][i] == 1]
  18. shuffle(indices)
  19. df = df.iloc[indices, :]
  20.  
  21. skf = StratifiedKFold(n_splits=5)
  22.  
  23. #5folds_all present 1300 datapoints
  24. if not os.path.exists("../dataset/5folds_related"):
  25.     os.mkdir("../dataset/5folds_related")
  26. X = [i for i in range(df.shape[0])]
  27. Y = df["V3relevance"].values
  28. counter = 0
  29. for train, test in skf.split(X, Y):
  30.     dfTrain = df.iloc[train, :]
  31.     dfTest = df.iloc[test, :]
  32.     cur_path = os.path.join(PATH_TO_DATASET, "5folds_related", str(counter))
  33.     if not os.path.exists(cur_path):
  34.         os.mkdir(cur_path)
  35.     dfTrain.to_csv(os.path.join(PATH_TO_DATASET, "5folds_related", str(counter), "train.csv"), index=False)
  36.     dfTest.to_csv(os.path.join(PATH_TO_DATASET, "5folds_related", str(counter), "test.csv"), index=False)
  37.     counter += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement