Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- [jalal@scc1 src]$ cat create_5fold_csv.py
# Create stratified 5-fold train/test CSV files from the original CSV file.
__author__ = "Sha Lai"
__email__ = "lais823@bu.edu"
# Code edited by Mona Jalal.
import os
from random import shuffle

import pandas as pd
from sklearn.model_selection import StratifiedKFold

PATH_TO_DATASET = os.path.join("..", "dataset")
# Single source of truth for the output directory, so that directory creation
# and the CSV writes below always agree on the location.
FOLDS_DIR = os.path.join(PATH_TO_DATASET, "5folds_related")

df = pd.read_csv(os.path.join(PATH_TO_DATASET, "EMNLP_master.csv"))

# Keep only the rows whose V3relevance equals 1, in random order.
# The filter is positional (enumerate over the raw values), so it does not
# depend on the DataFrame having a default 0..n-1 index.
# NOTE(review): the original comment suggested this filter is optional;
# comment out the next three statements to build folds over every row.
indices = [pos for pos, value in enumerate(df["V3relevance"].values) if value == 1]
shuffle(indices)  # unseeded: fold membership differs from run to run
df = df.iloc[indices, :]

skf = StratifiedKFold(n_splits=5)

# Original note: "5folds_all present 1300 datapoints".
# exist_ok=True makes re-running the script safe and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(FOLDS_DIR, exist_ok=True)

X = list(range(df.shape[0]))
Y = df["V3relevance"].values
# NOTE(review): after the filter above every label in Y is 1, so the
# stratification has nothing to balance -- confirm this is the intended
# label column.
for counter, (train, test) in enumerate(skf.split(X, Y)):
    dfTrain = df.iloc[train, :]
    dfTest = df.iloc[test, :]
    # One sub-directory per fold: <FOLDS_DIR>/0 ... <FOLDS_DIR>/4.
    cur_path = os.path.join(FOLDS_DIR, str(counter))
    os.makedirs(cur_path, exist_ok=True)
    dfTrain.to_csv(os.path.join(cur_path, "train.csv"), index=False)
    dfTest.to_csv(os.path.join(cur_path, "test.csv"), index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement