caparol6991

Untitled

Nov 11th, 2019
158
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2.  
  3. pd.set_option('display.max_columns', None) #show all columns
  4. pd.set_option('display.max_rows', None) #show all columns
  5. data = pd.read_csv("train.csv", nrows=2000000, delimiter = ',') #read data
  6. number_of_rows = data.shape[0] #get number of rows
  7.  
  8. split_rows = int(number_of_rows * 0.8) #split the data initially 80/20
  9.  
  10. #print(data.tail())
  11.  
  12. training = data.iloc[:split_rows,:].copy() #create training set
  13. testing = data.iloc[split_rows:,:].copy() #create test set
  14.  
  15.  
  16. #get the last session id of traning set and first session id of testing set
  17. last_session_training = training.tail(1)["session_id"].values[0]
  18. first_session_testing = testing.head(1)["session_id"].values[0]
  19.  
  20. #check if the last session from training set is not in the test set
  21. while last_session_training == first_session_testing:
  22.  
  23. training = training.append(testing.head(1)) #append the first test row at the end of training set
  24. testing = testing.iloc[1:,] #remove first row of testing set
  25. first_session_testing = testing.head(1)["session_id"].values[0] #get current first session id of testing set
  26.  
  27.  
  28. print("Training set elements: " + str(training.shape[0]))
  29. print("Test set elements: " + str(testing.shape[0]))
  30.  
  31. print("Last session in training set: " + last_session_training)
  32. print("First session in testing set: " + first_session_testing)
  33.  
  34. #fresh tesing set
  35. testing_label = testing.copy()
  36.  
  37. last_clickout_list = [] #list of last click out actions in given session
  38. current_session = testing.head(1)["session_id"].values[0]
  39.  
  40. clickout_index = -1
  41. i = -1
  42. last_action = ""
  43.  
  44. for index, row in testing.iterrows():
  45. if row["session_id"] != current_session: #check if new session started
  46. current_session = row["session_id"] #set new session
  47. if clickout_index != -1 and last_action == "clickout item": #if session had an clickout action
  48. last_clickout_list.append(clickout_index) #add index of row with last clickout action
  49. clickout_index = -1 #clear last clickout action index
  50.  
  51. if(row["action_type"] == "clickout item"): #check if current action is a clickout
  52. clickout_index = index
  53. i+=1
  54. last_action = row["action_type"]
  55.  
  56. #check clickout for last session
  57. if clickout_index != -1:
  58. last_clickout_list.append(clickout_index)
  59.  
  60. print(last_clickout_list)
  61.  
  62. #set empty reference for all last clickouts
  63. for index in last_clickout_list:
  64. testing.at[index,'reference'] = ""
  65.  
  66.  
  67. submission_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\submission_popular.csv'
  68. reference_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\reference.csv'
  69. training_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\train.csv'
  70. testing_csv = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\test.csv'
  71. data_path = 'C:\\Users\\Karol\\PycharmProjects\\Systemy_Rekomendacyjne\data\\'
  72.  
  73.  
  74. training.to_csv(training_csv)
  75. testing.to_csv(testing_csv)
  76. testing_label.to_csv(reference_csv)
RAW Paste Data