Advertisement
Guest User

Untitled

a guest
Dec 10th, 2019
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.93 KB | None | 0 0
  1. import pandas as pd
  2.  
  3. df = pd.read_csv("./train.csv", sep=',', engine='python')
  4. df = df.iloc[:5000000]
  5.  
  6. df = df.sort_values(by=['session_id', 'timestamp'])
  7.  
  8. df = df.drop_duplicates(['session_id', 'step'])
  9.  
  10. session_ids = df["session_id"].unique()
  11.  
  12. index = int(len(session_ids) * 4 / 5)
  13.  
  14. train_list_ids = session_ids[:index]
  15. test_list_ids = session_ids[index:]
  16.  
  17. train_set = df.loc[df["session_id"].isin(train_list_ids)]
  18. test_set = df.loc[df["session_id"].isin(test_list_ids)]
  19. ground_truth = test_set.copy()
  20.  
  21. test_set.loc[(~test_set.duplicated(['session_id', 'action_type'], keep='last'))
  22. & (test_set['action_type']=='clickout item'), 'reference'] = ''
  23.  
  24. train_set.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\train.csv')
  25. ground_truth.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\ground truth.csv')
  26. test_set.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\test.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement