Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Load step: read the feature set and give the leading columns fixed names.
import pandas as pd

# Upper bound on the number of CSV rows read into memory.
NROWS = 10000000

df = pd.read_csv('featureset.csv', nrows=NROWS)

# Rename only the first six columns; every column from position 6 onwards
# keeps the name it had in the CSV header.
# NOTE(review): assumes the file has at least six columns in this order -- confirm.
df.columns = ['user_id', 'session_id', 'step', 'is_val', 'is_test', 'clickout_id'] + df.columns.tolist()[6:]

# Total number of null cells in the frame. The value is not stored or printed,
# so it is only visible when this runs in an interactive session / notebook.
df.isnull().sum().sum()
def get_sort_index(row):
    """Return the shard label for one feature row.

    The label has the form ``"<prefix>_<bucket:04d>"`` where the prefix is
    chosen from the row's split flags and the bucket is ``clickout_id``
    modulo a per-split bucket count:

    * ``is_val == False`` and ``is_test == False`` -> ``"01_train"``, 25 buckets
    * ``is_val == True`` and ``is_test == False``  -> ``"02_val"``, 2 buckets
    * ``is_test == True`` (any ``is_val``)         -> ``"03_test"``, 4 buckets

    Args:
        row: Mapping-like object (e.g. a pandas row) providing the keys
            ``"is_val"``, ``"is_test"`` and ``"clickout_id"``.

    Returns:
        The label string, or ``None`` when the flags match none of the
        cases above (for example when a flag is missing/NaN).

    Raises:
        ValueError, TypeError: if ``clickout_id`` cannot be converted by
            ``int()`` for a row that matched one of the splits.
    """
    is_val = row["is_val"]
    is_test = row["is_test"]

    # Equality (not identity/truthiness) is deliberate: the flags may be
    # numpy booleans or 0/1 integers, and NaN must match neither branch.
    if (is_val == False) and (is_test == False):  # noqa: E712
        prefix, n_buckets = "01_train", 25
    elif (is_val == True) and (is_test == False):  # noqa: E712
        prefix, n_buckets = "02_val", 2
    elif is_test == True:  # noqa: E712
        prefix, n_buckets = "03_test", 4
    else:
        # Flags are neither True nor False; make the fall-through explicit.
        return None

    bucket = int(row["clickout_id"]) % n_buckets
    return f"{prefix}_{bucket:04d}"
# Label every row: axis=1 makes apply call get_sort_index once per row.
df["sort_index"] = df.apply(get_sort_index, axis=1)

# Order the frame by label. The labels start with "01_", "02_", "03_", so a
# plain string sort groups the rows by those prefixes in that order.
df.sort_values('sort_index', inplace=True)

# Write the reordered frame (including the sort_index column) without the
# pandas index column.
df.to_csv('sorted_featureset.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement