Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Split a session log into training/testing sets and hide last clickouts.

The script reads the first rows of ``train.csv``, removes duplicated
``(session_id, step)`` rows, splits the rows roughly 80/20 without cutting a
session in half, and writes three CSV files:

* the training set,
* the testing set in which the ``reference`` of every session-ending
  clickout has been blanked,
* the untouched testing set (the ground truth for those blanked references).
"""
import pandas as pd

CLICKOUT_ACTION = "clickout item"
TRAIN_FRACTION = 0.8  # initial share of rows assigned to the training set

# Raw strings: the original literals contained the invalid escape ``\d``.
REFERENCE_CSV = r'C:\Users\Karol\PycharmProjects\Systemy_Rekomendacyjne\data\newsmall\reference.csv'
TRAINING_CSV = r'C:\Users\Karol\PycharmProjects\Systemy_Rekomendacyjne\data\newsmall\train.csv'
TESTING_CSV = r'C:\Users\Karol\PycharmProjects\Systemy_Rekomendacyjne\data\newsmall\test.csv'


def split_by_session(data, train_fraction=TRAIN_FRACTION):
    """Split *data* into ``(training, testing)`` without splitting a session.

    The cut starts at ``int(len(data) * train_fraction)`` and is moved forward
    while the first testing row still has the same ``session_id`` as the last
    training row. Rows are assumed to be ordered so that each session is one
    contiguous run.

    Either returned frame may be empty (for example when the boundary session
    extends to the end of *data*); both are independent copies.
    """
    total = len(data)
    cut = int(total * train_fraction)
    if 0 < cut < total:
        sessions = data["session_id"].tolist()
        boundary_session = sessions[cut - 1]
        # Push the rest of the boundary session into the training set.
        while cut < total and sessions[cut] == boundary_session:
            cut += 1
    return data.iloc[:cut].copy(), data.iloc[cut:].copy()


def find_last_clickout_indices(frame):
    """Return index labels of rows that end a session with a clickout.

    A row ends a session when it is the last row of *frame* or the following
    row has a different ``session_id``. The same rule is applied to every
    session, including the final one.
    """
    sessions = frame["session_id"].tolist()
    actions = frame["action_type"].tolist()
    last_position = len(sessions) - 1
    return [
        label
        for position, label in enumerate(frame.index.tolist())
        if actions[position] == CLICKOUT_ACTION
        and (
            position == last_position
            or sessions[position + 1] != sessions[position]
        )
    ]


def hide_references(frame, row_labels):
    """Return a copy of *frame* with ``reference`` set to "" at *row_labels*.

    *frame* itself is not modified. Index labels are assumed to be unique, so
    each label addresses exactly one row.
    """
    hidden = frame.copy()
    if row_labels:
        if pd.api.types.is_numeric_dtype(hidden["reference"]):
            # An empty string cannot be stored in a numeric column.
            hidden["reference"] = hidden["reference"].astype(object)
        hidden.loc[row_labels, "reference"] = ""
    return hidden


def main(
    input_csv="train.csv",
    training_csv=TRAINING_CSV,
    testing_csv=TESTING_CSV,
    reference_csv=REFERENCE_CSV,
    nrows=1000,
):
    """Read *input_csv*, build the three data sets and write them to disk.

    Only the first *nrows* rows of *input_csv* are read. The row index is
    written as the first column of every output file.
    """
    pd.set_option('display.max_columns', None)  # show all columns
    pd.set_option('display.max_rows', None)  # show all rows

    data = pd.read_csv(input_csv, delimiter=',', nrows=nrows, engine="python")
    data = data.drop_duplicates(['session_id', 'step'])
    print(data.shape[0])

    training, testing = split_by_session(data)
    print("Training set elements: " + str(training.shape[0]))
    print("Test set elements: " + str(testing.shape[0]))
    if not training.empty:
        print(f"Last session in training set: {training['session_id'].iloc[-1]}")
    if not testing.empty:
        print(f"First session in testing set: {testing['session_id'].iloc[0]}")

    # Keep the untouched testing rows as ground truth before hiding anything.
    testing_label = testing.copy()
    testing = hide_references(testing, find_last_clickout_indices(testing))

    training.to_csv(training_csv)
    testing.to_csv(testing_csv)
    testing_label.to_csv(reference_csv)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement