caparol6991

Untitled

Jan 13th, 2020
186
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2.  
  3. pd.set_option('display.max_columns', None) #show all columns
  4. pd.set_option('display.max_rows', None) #show all columns
  5.  
  6. data = pd.read_csv("train.csv", delimiter = ',', nrows=1000, engine="python") #read data
  7. data = data.drop_duplicates(['session_id','step'])
  8.  
  9. number_of_rows = data.shape[0] #get number of rows
  10.  
  11. print(number_of_rows)
  12.  
  13. split_rows = int(number_of_rows * 0.8) #split the data initially 80/20
  14.  
  15. #print(data.tail())
  16.  
  17. training = data.iloc[:split_rows,:].copy() #create training set
  18. testing = data.iloc[split_rows:,:].copy() #create test set
  19.  
  20. #get the last session id of training set and first session id of testing set
  21. last_session_training = training.tail(1)["session_id"].values[0]
  22. first_session_testing = testing.head(1)["session_id"].values[0]
  23.  
  24. #check if the last session from training set is not in the test set
  25. while last_session_training == first_session_testing:
  26.  
  27. training = training.append(testing.head(1)) #append the first test row at the end of training set
  28. testing = testing.iloc[1:,] #remove first row of testing set
  29. first_session_testing = testing.head(1)["session_id"].values[0] #get current first session id of testing set
  30.  
  31.  
  32. print("Training set elements: " + str(training.shape[0]))
  33. print("Test set elements: " + str(testing.shape[0]))
  34.  
  35. print("Last session in training set: " + last_session_training)
  36. print("First session in testing set: " + first_session_testing)
  37.  
  38. training.groupby(["user_id","session_id"])
  39.  
  40. #fresh testing set
  41. testing_label = testing.copy()
  42.  
  43. last_clickout_list = [] #list of last click out actions in given session
  44. current_session = testing.head(1)["session_id"].values[0]
  45.  
  46. clickout_index = -1
  47. last_action = ""
  48.  
  49.  
  50. for index, row in testing.iterrows():
  51. if row["session_id"] != current_session: #check if new session started
  52. current_session = row["session_id"] #set new session
  53. if clickout_index != -1 and last_action == "clickout item": #if sessions las action was clickout
  54. last_clickout_list.append(clickout_index) #add index of row with last clickout action
  55. clickout_index = -1 #clear last clickout action index
  56.  
  57. if row["action_type"] == "clickout item": #check if current action is a clickout
  58. clickout_index = index
  59.  
  60. last_action = row["action_type"]
  61.  
  62. #check clickout for last session
  63. if clickout_index != -1:
  64. last_clickout_list.append(clickout_index)
  65.  
  66. #print(last_clickout_list)
  67.  
  68. #set empty reference for all last clickouts
  69. for index in last_clickout_list:
  70. testing.at[index,'reference'] = ""
  71.  
  72. df_test = testing[['session_id','action_type','reference']]
  73. #print(df_test)
  74.  
  75. reference_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\newsmall\\reference.csv'
  76. training_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\newsmall\\train.csv'
  77. testing_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\newsmall\\test.csv'
  78.  
  79. training.to_csv(training_csv)
  80. testing.to_csv(testing_csv)
  81. testing_label.to_csv(reference_csv)
RAW Paste Data