caparol6991

Untitled

Nov 11th, 2019
155
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2.  
  3. pd.set_option('display.max_columns', None) #show all columns
  4. pd.set_option('display.max_rows', None) #show all columns
  5. data = pd.read_csv("train.csv", delimiter = ',',engine="python") #read data
  6. number_of_rows = data.shape[0] #get number of rows
  7.  
  8. split_rows = int(number_of_rows * 0.8) #split the data initially 80/20
  9.  
  10. #print(data.tail())
  11.  
  12. training = data.iloc[:split_rows,:].copy() #create training set
  13. testing = data.iloc[split_rows:,:].copy() #create test set
  14.  
  15. data = ""
  16.  
  17. #get the last session id of traning set and first session id of testing set
  18. last_session_training = training.tail(1)["session_id"].values[0]
  19. first_session_testing = testing.head(1)["session_id"].values[0]
  20.  
  21. #check if the last session  from training set is not in the test set
  22. while last_session_training == first_session_testing:
  23.  
  24.     training = training.append(testing.head(1)) #append the first test row at the end of training set
  25.     testing = testing.iloc[1:,] #remove first row of testing set
  26.     first_session_testing = testing.head(1)["session_id"].values[0] #get current first session id of testing set
  27.  
  28.  
  29. print("Training set elements: " + str(training.shape[0]))
  30. print("Test set elements: " + str(testing.shape[0]))
  31.  
  32. print("Last session in training set: " + last_session_training)
  33. print("First session in testing set: " + first_session_testing)
  34.  
  35. #fresh tesing set
  36. testing_label = testing.copy()
  37.  
  38. last_clickout_list = [] #list of last click out actions in given session
  39. current_session = testing.head(1)["session_id"].values[0]
  40.  
  41. clickout_index = -1
  42. last_action = ""
  43. session_history = []
  44.  
  45. for index, row in testing.iterrows():
  46.     if row["session_id"] != current_session: #check if new session started
  47.         if row["session_id"] in session_history: #check if session is not a duplicate
  48.             print("duplicate!" + row["session_id"])
  49.             continue #skip to next iteration
  50.         current_session = row["session_id"] #set new session
  51.         session_history.append(row["session_id"]) #add session id to history of sessions
  52.         if clickout_index != -1 and last_action == "clickout item": #if sessions las action was clickout
  53.             last_clickout_list.append(clickout_index) #add index of row with last clickout action
  54.             clickout_index = -1 #clear last clickout action index
  55.  
  56.     if row["action_type"] == "clickout item": #check if current action is a clickout
  57.         clickout_index = index
  58.     last_action = row["action_type"]
  59.  
  60. #check clickout for last session
  61. if clickout_index != -1:
  62.     last_clickout_list.append(clickout_index)
  63.  
  64. #print(last_clickout_list)
  65.  
  66. #set empty reference for all last clickouts
  67. for index in last_clickout_list:
  68.     testing.at[index,'reference'] = ""
  69.  
  70.  
  71. submission_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\submission_popular.csv'
  72. reference_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\reference.csv'
  73. training_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\train.csv'
  74. testing_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\test.csv'
  75. data_path = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\'
  76.  
  77.  
  78. training.to_csv(training_csv)
  79. testing.to_csv(testing_csv)
  80. testing_label.to_csv(reference_csv)
RAW Paste Data