Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.77 KB | None | 0 0
  1. NROWS=10000000
  2.  
  3. df = pd.read_csv('featureset.csv', nrows=NROWS)
  4. df.columns = ['user_id','session_id','step','is_val','is_test','clickout_id'] + df.columns.tolist()[6:]
  5. df.isnull().sum().sum()
  6.  
  7.  
  8. def get_sort_index(row):
  9.     if (row["is_val"] == False) and (row["is_test"] == False):
  10.         find = int(row["clickout_id"]) % 25
  11.         return f"01_train_{find:04d}"
  12.     elif (row["is_val"] == True) and (row["is_test"] == False):
  13.         find = int(row["clickout_id"]) % 2
  14.         return f"02_val_{find:04d}"
  15.     elif row["is_test"] == True:
  16.         find = int(row["clickout_id"]) % 4
  17.         return f"03_test_{find:04d}"
  18.  
  19. df["sort_index"] = df.apply(get_sort_index, axis=1)
  20. df.sort_values('sort_index', inplace=True)
  21. df.to_csv('sorted_featureset.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement