Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- # run this three lines only once
- # df = pd.read_csv("data/train.csv")
- # clicks = df[df.action_type == "clickout item"]
- # clicks.to_csv("myData/clicks.csv", index=False)
- clicks = pd.read_csv("myData/clicks.csv")
- # calculate timestamp diff between first and last row
- first_timestamp = clicks.head(n=1)["timestamp"].item()
- last_timestamp = clicks.tail(n=1)["timestamp"].item()
- timestamp_diff = last_timestamp - first_timestamp
- # split data into train and test
- split_border = 0.8 * timestamp_diff + first_timestamp
- train_df = clicks.loc[clicks['timestamp'] <= split_border]
- test_df = clicks.loc[clicks['timestamp'] > split_border]
- del clicks
- # copy splitted session part from test set to train set
- train_last_session_id = train_df.tail(n=1)["session_id"].item()
- train_last_session_rows = test_df[test_df.session_id == train_last_session_id]
- train_df = train_df.append(train_last_session_rows)
- # remove splitted session part from test set
- test_df = test_df[test_df.session_id != train_last_session_id]
- # create ground truth, remove reference from test set
- gt = test_df
- test_df = test_df.assign(reference=np.nan)
- # save data frames
- train_df.to_csv("myData/train.csv", index=False)
- test_df.to_csv("myData/test.csv", index=False)
- gt.to_csv("myData/ground_truth.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement