CAROJASQ

4 predictors gpu log

Oct 23rd, 2018
49
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.81 KB | None | 0 0
  1. Running calculations on GPU
  2. Doing regressions for 1 predictors (54) regressions
  3. Number of possible combinations are 54, batch size is 2447000
  4. Generating 2447000 combs for this batch
  5. Processing from 0 to 54 regressions in this batch
  6. For this batch 0 models are invalid
  7. Doing regressions for 2 predictors (1431) regressions
  8. Number of possible combinations are 1431, batch size is 1819000
  9. Generating 1819000 combs for this batch
  10. Processing from 0 to 1431 regressions in this batch
  11. For this batch 0 models are invalid
  12. Doing regressions for 3 predictors (24804) regressions
  13. Number of possible combinations are 24804, batch size is 1441000
  14. Generating 1441000 combs for this batch
  15. Processing from 0 to 24804 regressions in this batch
  16. For this batch 0 models are invalid
  17. Doing regressions for 4 predictors (316251) regressions
  18. Number of possible combinations are 316251, batch size is 1190000
  19. Generating 1190000 combs for this batch
  20. Processing from 0 to 316251 regressions in this batch
  21. For this batch 0 models are invalid
  22. 342540 Regressions has been done, tt 5.53639101982, te: 3.53138804436
  23. Using GPU to do regressions took 12.2080521584
  24. Timer unit: 1e-06 s
  25.  
  26. Total time: 11.8288 s
  27. File: /home/nvera/Cris/HMMMR/src/batched_regression.py
  28. Function: find_best_models_gpu at line 182
  29.  
  30. Line # Hits Time Per Hit % Time Line Contents
  31. ==============================================================
  32. 182 def find_best_models_gpu(file_name='../TestData/Y=2X1+3X2+4X3+5_with_shitty.csv', min_predictors=1, max_predictors=4, metric=None, window=None, handle=None, max_batch_size=None, **kwargs):
  33. 183 """
  34. 184
  35. 185 :param file_name: File name containing data, the format is the following
  36. 186 Columns: Contains data for N-2 predictors, 1 column full of 1s and 1 column with outcome data
  37. 187 Columns 1 to N-2 contains predictors data
  38. 188 The N-1 column is always full of 1s (due the constant on the model)
  39. 189 The N column contains Y data
  40. 190 Rows: The first row contains the name of the predictor
  41. 191 The next rows contains the observations (They need to be real values, no empty/nans are allowed
  42. 192 :param max_predictors: Max numbers of predictors to test in the regression. Should b N-2 at max
  43. 193 :return: Ordered array (by RMSE) of tuples containing (predictors_combination, RMSE)
  44. 194 """
  45. 195 1 2.0 2.0 0.0 tt = te = 0 # total time
  46. 196 1 145105.0 145105.0 1.2 handle = handle if handle else cublas.cublasCreate()
  47. 197 1 12392.0 12392.0 0.1 XY = np.loadtxt(open(file_name, "rb"), delimiter=",", skiprows=1, dtype=np.float32)
  48. 198 1 102.0 102.0 0.0 X = np.delete(XY, XY.shape[1] - 1, 1)
  49. 199 1 3.0 3.0 0.0 Y = XY[:, -1]
  50. 200 1 2.0 2.0 0.0 combs_rmse = None
  51. 201 1 1.0 1.0 0.0 done_regressions = 0
  52. 202 1 15.0 15.0 0.0 with open(file_name, 'rb') as f:
  53. 203 1 50.0 50.0 0.0 col_names = np.array(f.readline().strip().split(','))
  54. 204 5 11.0 2.2 0.0 for n_predictors in range(min_predictors, max_predictors+1):
  55. 205 4 1306.0 326.5 0.0 _print_memory_usage("Initial State: ")
  56. 206 4 1214.0 303.5 0.0 max_batch_size = _get_max_batch_size(n_predictors+1, Y.size)
  57. 207 4 74.0 18.5 0.0 iterator = get_combinatorial_iterator(X, n_predictors)
  58. 208 4 14.0 3.5 0.0 index_combinations = get_column_index_combinations(iterator, X, max_batch_size=max_batch_size) # n predictors - 1 constant
  59. 209 4 85.0 21.2 0.0 s_i = ncr(X.shape[1]-1, n_predictors) # Number of possible combinations
  60. 210 4 154.0 38.5 0.0 print "Doing regressions for {} predictors ({}) regressions".format(n_predictors, s_i)
  61. 211 4 50.0 12.5 0.0 print "Number of possible combinations are {}, batch size is {}".format(s_i, max_batch_size)
  62. 212 4 5.0 1.2 0.0 i = 0
  63. 213 8 628230.0 78528.8 5.3 for current_combinations in index_combinations:
  64. 214 4 124.0 31.0 0.0 print "Processing from {} to {} regressions in this batch".format(i, i + len(current_combinations))
  65. 215 4 11.0 2.8 0.0 ss = time()
  66. 216 4 1968057.0 492014.2 16.6 Xs = get_X_matrices_from_combinations(X, current_combinations)
  67. 217 4 1482041.0 370510.2 12.5 XTs = get_Xt_matrices_from_combinations(X.T, current_combinations)
  68. 218 4 81206.0 20301.5 0.7 YsObs = get_Ys_matrices(Y, len(current_combinations))
  69. 219 4 45.0 11.2 0.0 te += time() - ss
  70. 220 4 5.0 1.2 0.0 ss = time()
  71. 221 4 5536311.0 1384077.8 46.8 regression_results = massive_multilineal_regresion(Xs, XTs, YsObs, handle=handle)
  72. 222 4 48.0 12.0 0.0 tt += time() - ss
  73. 223 4 198952.0 49738.0 1.7 regression_results['predictors_combinations'] = np.array(current_combinations, dtype=np.int32)
  74. 224 # If the matrix had not inverse then the model is invalid
  75. 225 4 1808.0 452.0 0.0 invalid_models = np.where(regression_results['inv_results'].get() != 0)[0]
  76. 226 4 214.0 53.5 0.0 print "For this batch {} models are invalid".format(len(invalid_models))
  77. 227 # Cleaning invalid model results
  78. 228 4 10277.0 2569.2 0.1 regression_results['predictors_combinations'] = np.delete(regression_results['predictors_combinations'], invalid_models, 0)
  79. 229 4 9933.0 2483.2 0.1 regression_results['beta_coefficients'] = np.delete(regression_results['beta_coefficients'], invalid_models, 0)
  80. 230 4 1238.0 309.5 0.0 regression_results['rmse'] = np.delete(regression_results['rmse'], invalid_models, 0)
  81. 231 4 119287.0 29821.8 1.0 regression_results['ys_sim'] = np.delete(regression_results['ys_sim'], invalid_models, 0)
  82. 232 4 117693.0 29423.2 1.0 regression_results['ys_obs'] = np.delete(regression_results['ys_obs'], invalid_models, 0)
  83. 233 342544 791033.0 2.3 6.7 combinations_cols_names = np.array([col_names[x] for x in regression_results['predictors_combinations']])
  84. 234 4 12.0 3.0 0.0 if combs_rmse is None:
  85. 235 1 166.0 166.0 0.0 combs_rmse = np.array(list(zip(combinations_cols_names, regression_results['rmse'])))
  86. 236 else:
  87. 237 3 406968.0 135656.0 3.4 combs_rmse = np.vstack((combs_rmse, np.array(list(zip(combinations_cols_names, regression_results['rmse'])))))
  88. 238 4 15.0 3.8 0.0 i += len(current_combinations)
  89. 239 4 23.0 5.8 0.0 done_regressions += len(current_combinations)
  90. 240 1 54.0 54.0 0.0 print "{} Regressions has been done, tt {}, te: {}".format(done_regressions, tt, te)
  91. 241 1 314484.0 314484.0 2.7 ordered_combs = combs_rmse[combs_rmse[:, 1].argsort()]
  92. 242 1 3.0 3.0 0.0 return ordered_combs
  93.  
  94. Total time: 0 s
  95. File: /home/nvera/Cris/HMMMR/src/numpy_multiple_regression.py
  96. Function: find_best_models_cpu at line 78
  97.  
  98. Line # Hits Time Per Hit % Time Line Contents
  99. ==============================================================
  100. 78 def find_best_models_cpu(file_name='../TestData/Y=2X1+3X2+4X3+5_with_shitty.csv', min_predictors=1, max_predictors=4, handle=None, **kwargs):
  101. 79 """
  102. 80
  103. 81 :param file_name: File name containing data, the format is the following
  104. 82 Columns: Contains data for N-2 predictors, 1 column full of 1s and 1 column with outcome data
  105. 83 Columns 1 to N-2 contains predictors data
  106. 84 The N-1 column is always full of 1s (due the constant on the model)
  107. 85 The N column contains Y data
  108. 86 Rows: The first row contains the name of the predictor
  109. 87 The next rows contains the observations (They need to be real values, no empty/nans are allowed
  110. 88 :param max_predictors: Max numbers of predictors to test in the regression. Should b N-2 at max
  111. 89 :return: Ordered array (by RMSE) of tuples containing (predictors_combination, RMSE)
  112. 90 """
  113. 91 XY = np.loadtxt(open(file_name, "rb"), delimiter=",", skiprows=1, dtype=np.float32)
  114. 92 X = np.delete(XY, XY.shape[1] - 1, 1)
  115. 93 Y = XY[:, -1]
  116. 94 combs_rmse = None
  117. 95 done_regressions = 0
  118. 96 invalid_regressions = 0
  119. 97 with open(file_name, 'rb') as f:
  120. 98 col_names = np.array(f.readline().strip().split(','))
  121. 99 for n_predictors in range(min_predictors, max_predictors+1):
  122. 100 index_combinations = get_column_index_combinations(X, n_predictors) # n predictors - 1 constant
  123. 101 s_i = ncr(X.shape[1]-1, n_predictors) # Number of possible combinations
  124. 102 print "Doing regressions for {} predictors ({}) regressions".format(n_predictors, s_i)
  125. 103 for comb in index_combinations:
  126. 104 try:
  127. 105 X1, X1t = get_X_Xt_matrix(X, comb)
  128. 106 regression = numpy_regression(X1, X1t, Y)
  129. 107 combinations_cols_names = np.array([col_names[x] for x in comb])
  130. 108 result = np.array([[combinations_cols_names, regression['metric']]])
  131. 109
  132. 110 if combs_rmse is None:
  133. 111 combs_rmse = result
  134. 112 else:
  135. 113 combs_rmse = np.vstack([combs_rmse, result])
  136. 114 except:
  137. 115 invalid_regressions += 1
  138. 116 done_regressions += s_i
  139. 117 print "{} Regressions has been done, {} invalid".format(done_regressions, invalid_regressions)
  140. 118 ordered_combs = combs_rmse[combs_rmse[:, 1].argsort()]
  141. 119 return ordered_combs
  142.  
  143. Total time: 43.2653 s
  144. File: massive_multilinear_regresions.py
  145. Function: perform_regressions at line 53
  146.  
  147. Line # Hits Time Per Hit % Time Line Contents
  148. ==============================================================
  149. 53 @do_profile(follow=[find_best_models_gpu, find_best_models_cpu])
  150. 54 def perform_regressions():
  151. 55 1 2.0 2.0 0.0 start_time = time()
  152. 56 1 3256.0 3256.0 0.0 input_file, window, max_predictors, min_predictors, metric, output_file, device, max_batch_size = parse_arguments()
  153. 57 1 1.0 1.0 0.0 if device == "gpu":
  154. 58 1 45.0 45.0 0.0 print "Running calculations on GPU"
  155. 59 1 12204723.0 12204723.0 28.2 ordered_combs = find_best_models_gpu(file_name=input_file, min_predictors=min_predictors, max_predictors=max_predictors, metric=metric, window=window, max_batch_size=max_batch_size)
  156. 60 1 54.0 54.0 0.0 print "Using GPU to do regressions took {}".format(time() - start_time)
  157. 61 elif device == "cpu":
  158. 62 ordered_combs = find_best_models_cpu(file_name=input_file, min_predictors=min_predictors, max_predictors=max_predictors, metric=metric, window=window, max_batch_size=max_batch_size)
  159. 63 1 21154.0 21154.0 0.0 df = pd.DataFrame(ordered_combs)
  160. 64 1 31036091.0 31036091.0 71.7 df.to_csv("/tmp/{}".format(output_file))
  161.  
  162. python massive_multilinear_regresions.py -i -mp 4 -np 1 -d gpu 42,06s user 10,32s system 95% cpu 54,905 total
Add Comment
Please, Sign In to add comment