Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Split the first 5,000,000 rows of ./train.csv into train / test sets at the
# session level, blank the reference of each test session's last
# 'clickout item' row, and write train, ground-truth and test tables to CSV.
import pandas as pd

# nrows stops parsing once the rows that are actually kept have been read,
# instead of loading the entire file and slicing it afterwards.
df = pd.read_csv("./train.csv", sep=',', engine='python', nrows=5000000)
df = df.sort_values(by=['session_id', 'timestamp'])
df = df.drop_duplicates(['session_id', 'step'])

# The first 4/5 of the unique session ids (in their order of appearance in the
# sorted frame) go to train, the remainder to test, so that a session is never
# divided between the two sets.
session_ids = df["session_id"].unique()
index = int(len(session_ids) * 4 / 5)
train_list_ids = session_ids[:index]
test_list_ids = session_ids[index:]

train_set = df.loc[df["session_id"].isin(train_list_ids)]
# Explicit copy: test_set is modified in place below, and assigning into a
# frame derived from df would otherwise raise SettingWithCopyWarning.
test_set = df.loc[df["session_id"].isin(test_list_ids)].copy()

# Snapshot of the test rows taken before any reference is blanked.
ground_truth = test_set.copy()

# duplicated(..., keep='last') flags every row except the last one of each
# (session_id, action_type) pair; negating it and requiring the action type to
# be 'clickout item' selects the last clickout row of every test session.
is_last_of_action = ~test_set.duplicated(['session_id', 'action_type'], keep='last')
is_clickout = test_set['action_type'] == 'clickout item'
test_set.loc[is_last_of_action & is_clickout, 'reference'] = ''

train_set.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\train.csv')
ground_truth.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\ground truth.csv')
test_set.to_csv('C:\\Users\\Marcin\\PycharmProjects\\system-reko\\dane\\test.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement