caparol6991

dddd

Nov 24th, 2019
145
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2.  
  3. pd.set_option('display.max_columns', None) #show all columns
  4. pd.set_option('display.max_rows', None) #show all columns
  5. data = pd.read_csv("train.csv", delimiter = ',', nrows=1000,engine="python") #read data
  6. number_of_rows = data.shape[0] #get number of rows
  7.  
  8. split_rows = int(number_of_rows * 0.8) #split the data initially 80/20
  9.  
  10. #print(data.tail())
  11.  
  12. training = data.iloc[:split_rows,:].copy() #create training set
  13. testing = data.iloc[split_rows:,:].copy() #create test set
  14.  
  15. #get the last session id of training set and first session id of testing set
  16. last_session_training = training.tail(1)["session_id"].values[0]
  17. first_session_testing = testing.head(1)["session_id"].values[0]
  18.  
  19. #check if the last session  from training set is not in the test set
  20. while last_session_training == first_session_testing:
  21.  
  22.     training = training.append(testing.head(1)) #append the first test row at the end of training set
  23.     testing = testing.iloc[1:,] #remove first row of testing set
  24.     first_session_testing = testing.head(1)["session_id"].values[0] #get current first session id of testing set
  25.  
  26.  
  27. print("Training set elements: " + str(training.shape[0]))
  28. print("Test set elements: " + str(testing.shape[0]))
  29.  
  30. print("Last session in training set: " + last_session_training)
  31. print("First session in testing set: " + first_session_testing)
  32.  
  33. training.groupby(["user_id","session_id"])
  34.  
  35. #fresh testing set
  36. testing_label = testing.copy()
  37.  
  38. last_clickout_list = [] #list of last click out actions in given session
  39. current_session = testing.head(1)["session_id"].values[0]
  40.  
  41. clickout_index = -1
  42. last_action = ""
  43. session_history = []
  44.  
  45. for index, row in testing.iterrows():
  46.     if row["session_id"] != current_session: #check if new session started
  47.         current_session = row["session_id"] #set new session
  48.         if clickout_index != -1 and last_action == "clickout item": #if sessions las action was clickout
  49.             last_clickout_list.append(clickout_index) #add index of row with last clickout action
  50.             clickout_index = -1 #clear last clickout action index
  51.  
  52.     if row["action_type"] == "clickout item": #check if current action is a clickout
  53.         clickout_index = index
  54.  
  55.     last_action = row["action_type"]
  56.  
  57. #check clickout for last session
  58. if clickout_index != -1:
  59.     last_clickout_list.append(clickout_index)
  60.  
  61. #print(last_clickout_list)
  62.  
  63. #set empty reference for all last clickouts
  64. for index in last_clickout_list:
  65.     testing.at[index,'reference'] = ""
  66.  
  67. df_test = testing[['session_id','action_type','reference']]
  68. print(df_test)
  69.  
  70. reference_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\reference.csv'
  71. training_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\train.csv'
  72. testing_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\test.csv'
  73.  
  74. training.to_csv(training_csv)
  75. testing.to_csv(testing_csv)
  76. testing_label.to_csv(reference_csv)
RAW Paste Data