caparol6991

Untitled

Nov 18th, 2019
158
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2.  
  3. pd.set_option('display.max_columns', None) #show all columns
  4. pd.set_option('display.max_rows', None) #show all rows
  5. data = pd.read_csv("train.csv", delimiter = ',', nrows=1000,engine="python") #read data
  6. number_of_rows = data.shape[0] #get number of rows
  7.  
  8. split_rows = int(number_of_rows * 0.8) #split the data initially 80/20
  9.  
  10. #print(data.tail())
  11.  
  12. training = data.iloc[:split_rows,:].copy() #create training set
  13. testing = data.iloc[split_rows:,:].copy() #create test set
  14.  
  15. #get the last session id of training set and first session id of testing set
  16. last_session_training = training.tail(1)["session_id"].values[0]
  17. first_session_testing = testing.head(1)["session_id"].values[0]
  18.  
  19. #check if the last session  from training set is not in the test set
  20. while last_session_training == first_session_testing:
  21.  
  22.     training = training.append(testing.head(1)) #append the first test row at the end of training set
  23.     testing = testing.iloc[1:,] #remove first row of testing set
  24.     first_session_testing = testing.head(1)["session_id"].values[0] #get current first session id of testing set
  25.  
  26.  
  27. print("Training set elements: " + str(training.shape[0]))
  28. print("Test set elements: " + str(testing.shape[0]))
  29.  
  30. print("Last session in training set: " + last_session_training)
  31. print("First session in testing set: " + first_session_testing)
  32.  
  33. training.groupby(["user_id","session_id"])
  34.  
  35. #fresh testing set
  36. testing_label = testing.copy()
  37.  
  38. last_clickout_list = [] #list of last click out actions in given session
  39. current_session = testing.head(1)["session_id"].values[0]
  40.  
  41. clickout_index = -1
  42. last_action = ""
  43. session_history = []
  44. duplicate_index = []
  45.  
  46. for index, row in testing.iterrows():
  47.     if row["session_id"] != current_session: #check if new session started
  48.         current_session = row["session_id"] #set new session
  49.         if clickout_index != -1 and last_action == "clickout item": #if sessions las action was clickout
  50.             last_clickout_list.append(clickout_index) #add index of row with last clickout action
  51.             clickout_index = -1 #clear last clickout action index
  52.  
  53.     if row["action_type"] == "clickout item": #check if current action is a clickout
  54.         clickout_index = index
  55.  
  56.     last_action = row["action_type"]
  57.  
  58. #check clickout for last session
  59. if clickout_index != -1:
  60.     last_clickout_list.append(clickout_index)
  61.  
  62. #print(last_clickout_list)
  63.  
  64. #set empty reference for all last clickouts
  65. for index in last_clickout_list:
  66.     testing.at[index,'reference'] = ""
  67.  
  68. df_test = testing[['session_id','action_type','reference']]
  69. print(df_test)
  70.  
  71. reference_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\reference.csv'
  72. training_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\train.csv'
  73. testing_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\test.csv'
  74.  
  75. training.to_csv(training_csv)
  76. testing.to_csv(testing_csv)
  77. testing_label.to_csv(reference_csv)
RAW Paste Data