Advertisement
Guest User

Untitled

a guest
Nov 11th, 2019
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.32 KB | None | 0 0
  # Recovered from a numbered Pastebin listing; the line-number gutter is stripped.
"""Time-based train/test split of click-out events.

Reads ``myData/clicks.csv``, puts the earliest 80 % of the covered time span
into the train set and the rest into the test set, keeps every session whole,
and writes ``train.csv``, ``test.csv`` (``reference`` blanked) and
``ground_truth.csv`` (``reference`` kept) into ``myData/``.
"""
import numpy as np
import pandas as pd


def split_clicks(clicks, train_fraction=0.8):
    """Split click rows into train, test and ground-truth frames by time.

    Args:
        clicks: DataFrame with at least the columns ``timestamp``,
            ``session_id`` and ``reference``.
        train_fraction: Share of the covered time span (not of the row
            count) that goes to the train set.

    Returns:
        A ``(train_df, test_df, gt)`` tuple. ``gt`` holds the test rows with
        their original ``reference`` values; ``test_df`` holds the same rows
        with ``reference`` set to NaN.
    """
    timestamps = clicks["timestamp"]

    # min()/max() instead of first/last row: the span must not depend on the
    # rows being sorted by timestamp.
    first_timestamp = timestamps.min()
    last_timestamp = timestamps.max()
    timestamp_diff = last_timestamp - first_timestamp
    split_border = train_fraction * timestamp_diff + first_timestamp

    train_df = clicks.loc[timestamps <= split_border]
    test_df = clicks.loc[timestamps > split_border]

    # A session cut in two by the border belongs entirely to the train set.
    # Every such session is moved, not only the one in the last train row,
    # so no session_id ends up in both sets.
    straddling = test_df["session_id"].isin(train_df["session_id"])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df = pd.concat([train_df, test_df.loc[straddling]])
    test_df = test_df.loc[~straddling]

    # Ground truth keeps the references; assign() returns a new frame, so
    # blanking the test references leaves `gt` untouched.
    gt = test_df
    test_df = test_df.assign(reference=np.nan)

    return train_df, test_df, gt


def main():
    """Load the click-out rows, split them and write the three CSV files."""
    # Run these three lines only once to extract the click-out rows:
    # df = pd.read_csv("data/train.csv")
    # clicks = df[df.action_type == "clickout item"]
    # clicks.to_csv("myData/clicks.csv", index=False)

    clicks = pd.read_csv("myData/clicks.csv")
    train_df, test_df, gt = split_clicks(clicks)
    del clicks  # drop the full frame before writing the outputs

    # save data frames
    train_df.to_csv("myData/train.csv", index=False)
    test_df.to_csv("myData/test.csv", index=False)
    gt.to_csv("myData/ground_truth.csv", index=False)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement