Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Running calculations on GPU
- Doing regressions for 1 predictors (54) regressions
- Number of possible combinations are 54, batch size is 2447000
- Generating 2447000 combs for this batch
- Processing from 0 to 54 regressions in this batch
- For this batch 0 models are invalid
- Doing regressions for 2 predictors (1431) regressions
- Number of possible combinations are 1431, batch size is 1819000
- Generating 1819000 combs for this batch
- Processing from 0 to 1431 regressions in this batch
- For this batch 0 models are invalid
- Doing regressions for 3 predictors (24804) regressions
- Number of possible combinations are 24804, batch size is 1441000
- Generating 1441000 combs for this batch
- Processing from 0 to 24804 regressions in this batch
- For this batch 0 models are invalid
- 26289 Regressions has been done, tt 1.91709589958, te: 0.336488008499
- Using GPU to do regressions took 2.74310398102
- Timer unit: 1e-06 s
- Total time: 2.6961 s
- File: /home/nvera/Cris/HMMMR/src/batched_regression.py
- Function: find_best_models_gpu at line 182
- Line # Hits Time Per Hit % Time Line Contents
- ==============================================================
- 182 def find_best_models_gpu(file_name='../TestData/Y=2X1+3X2+4X3+5_with_shitty.csv', min_predictors=1, max_predictors=4, metric=None, window=None, handle=None, max_batch_size=None, **kwargs):
- 183 """
- 184
- 185 :param file_name: File name containing data, the format is the following
- 186 Columns: Contains data for N-2 predictors, 1 column full of 1s and 1 column with outcome data
- 187 Columns 1 to N-2 contains predictors data
- 188 The N-1 column is always full of 1s (due to the constant term in the model)
- 189 The N column contains Y data
- 190 Rows: The first row contains the name of the predictor
- 191 The next rows contain the observations (they need to be real values; no empty/NaN values are allowed)
- 192 :param max_predictors: Max number of predictors to test in the regression. Should be N-2 at most
- 193 :return: Ordered array (by RMSE) of tuples containing (predictors_combination, RMSE)
- 194 """
- 195 1 2.0 2.0 0.0 tt = te = 0 # total time
- 196 1 145957.0 145957.0 5.4 handle = handle if handle else cublas.cublasCreate()
- 197 1 12844.0 12844.0 0.5 XY = np.loadtxt(open(file_name, "rb"), delimiter=",", skiprows=1, dtype=np.float32)
- 198 1 120.0 120.0 0.0 X = np.delete(XY, XY.shape[1] - 1, 1)
- 199 1 3.0 3.0 0.0 Y = XY[:, -1]
- 200 1 1.0 1.0 0.0 combs_rmse = None
- 201 1 1.0 1.0 0.0 done_regressions = 0
- 202 1 15.0 15.0 0.0 with open(file_name, 'rb') as f:
- 203 1 53.0 53.0 0.0 col_names = np.array(f.readline().strip().split(','))
- 204 4 11.0 2.8 0.0 for n_predictors in range(min_predictors, max_predictors+1):
- 205 3 1020.0 340.0 0.0 _print_memory_usage("Initial State: ")
- 206 3 945.0 315.0 0.0 max_batch_size = _get_max_batch_size(n_predictors+1, Y.size)
- 207 3 65.0 21.7 0.0 iterator = get_combinatorial_iterator(X, n_predictors)
- 208 3 11.0 3.7 0.0 index_combinations = get_column_index_combinations(iterator, X, max_batch_size=max_batch_size) # n predictors - 1 constant
- 209 3 75.0 25.0 0.0 s_i = ncr(X.shape[1]-1, n_predictors) # Number of possible combinations
- 210 3 107.0 35.7 0.0 print "Doing regressions for {} predictors ({}) regressions".format(n_predictors, s_i)
- 211 3 38.0 12.7 0.0 print "Number of possible combinations are {}, batch size is {}".format(s_i, max_batch_size)
- 212 3 5.0 1.7 0.0 i = 0
- 213 6 55196.0 9199.3 2.0 for current_combinations in index_combinations:
- 214 3 67.0 22.3 0.0 print "Processing from {} to {} regressions in this batch".format(i, i + len(current_combinations))
- 215 3 8.0 2.7 0.0 ss = time()
- 216 3 181394.0 60464.7 6.7 Xs = get_X_matrices_from_combinations(X, current_combinations)
- 217 3 146398.0 48799.3 5.4 XTs = get_Xt_matrices_from_combinations(X.T, current_combinations)
- 218 3 8659.0 2886.3 0.3 YsObs = get_Ys_matrices(Y, len(current_combinations))
- 219 3 13.0 4.3 0.0 te += time() - ss
- 220 3 4.0 1.3 0.0 ss = time()
- 221 3 1917051.0 639017.0 71.1 regression_results = massive_multilineal_regresion(Xs, XTs, YsObs, handle=handle)
- 222 3 28.0 9.3 0.0 tt += time() - ss
- 223 3 18535.0 6178.3 0.7 regression_results['predictors_combinations'] = np.array(current_combinations, dtype=np.int32)
- 224 # If the matrix had not inverse then the model is invalid
- 225 3 828.0 276.0 0.0 invalid_models = np.where(regression_results['inv_results'].get() != 0)[0]
- 226 3 156.0 52.0 0.0 print "For this batch {} models are invalid".format(len(invalid_models))
- 227 # Cleaning invalid model results
- 228 3 1372.0 457.3 0.1 regression_results['predictors_combinations'] = np.delete(regression_results['predictors_combinations'], invalid_models, 0)
- 229 3 1152.0 384.0 0.0 regression_results['beta_coefficients'] = np.delete(regression_results['beta_coefficients'], invalid_models, 0)
- 230 3 326.0 108.7 0.0 regression_results['rmse'] = np.delete(regression_results['rmse'], invalid_models, 0)
- 231 3 12836.0 4278.7 0.5 regression_results['ys_sim'] = np.delete(regression_results['ys_sim'], invalid_models, 0)
- 232 3 8523.0 2841.0 0.3 regression_results['ys_obs'] = np.delete(regression_results['ys_obs'], invalid_models, 0)
- 233 26292 86269.0 3.3 3.2 combinations_cols_names = np.array([col_names[x] for x in regression_results['predictors_combinations']])
- 234 3 7.0 2.3 0.0 if combs_rmse is None:
- 235 1 154.0 154.0 0.0 combs_rmse = np.array(list(zip(combinations_cols_names, regression_results['rmse'])))
- 236 else:
- 237 2 73196.0 36598.0 2.7 combs_rmse = np.vstack((combs_rmse, np.array(list(zip(combinations_cols_names, regression_results['rmse'])))))
- 238 3 10.0 3.3 0.0 i += len(current_combinations)
- 239 3 6.0 2.0 0.0 done_regressions += len(current_combinations)
- 240 1 44.0 44.0 0.0 print "{} Regressions has been done, tt {}, te: {}".format(done_regressions, tt, te)
- 241 1 22590.0 22590.0 0.8 ordered_combs = combs_rmse[combs_rmse[:, 1].argsort()]
- 242 1 1.0 1.0 0.0 return ordered_combs
- Total time: 0 s
- File: /home/nvera/Cris/HMMMR/src/numpy_multiple_regression.py
- Function: find_best_models_cpu at line 78
- Line # Hits Time Per Hit % Time Line Contents
- ==============================================================
- 78 def find_best_models_cpu(file_name='../TestData/Y=2X1+3X2+4X3+5_with_shitty.csv', min_predictors=1, max_predictors=4, handle=None, **kwargs):
- 79 """
- 80
- 81 :param file_name: File name containing data, the format is the following
- 82 Columns: Contains data for N-2 predictors, 1 column full of 1s and 1 column with outcome data
- 83 Columns 1 to N-2 contains predictors data
- 84 The N-1 column is always full of 1s (due to the constant term in the model)
- 85 The N column contains Y data
- 86 Rows: The first row contains the name of the predictor
- 87 The next rows contain the observations (they need to be real values; no empty/NaN values are allowed)
- 88 :param max_predictors: Max number of predictors to test in the regression. Should be N-2 at most
- 89 :return: Ordered array (by RMSE) of tuples containing (predictors_combination, RMSE)
- 90 """
- 91 XY = np.loadtxt(open(file_name, "rb"), delimiter=",", skiprows=1, dtype=np.float32)
- 92 X = np.delete(XY, XY.shape[1] - 1, 1)
- 93 Y = XY[:, -1]
- 94 combs_rmse = None
- 95 done_regressions = 0
- 96 invalid_regressions = 0
- 97 with open(file_name, 'rb') as f:
- 98 col_names = np.array(f.readline().strip().split(','))
- 99 for n_predictors in range(min_predictors, max_predictors+1):
- 100 index_combinations = get_column_index_combinations(X, n_predictors) # n predictors - 1 constant
- 101 s_i = ncr(X.shape[1]-1, n_predictors) # Number of possible combinations
- 102 print "Doing regressions for {} predictors ({}) regressions".format(n_predictors, s_i)
- 103 for comb in index_combinations:
- 104 try:
- 105 X1, X1t = get_X_Xt_matrix(X, comb)
- 106 regression = numpy_regression(X1, X1t, Y)
- 107 combinations_cols_names = np.array([col_names[x] for x in comb])
- 108 result = np.array([[combinations_cols_names, regression['metric']]])
- 109
- 110 if combs_rmse is None:
- 111 combs_rmse = result
- 112 else:
- 113 combs_rmse = np.vstack([combs_rmse, result])
- 114 except:
- 115 invalid_regressions += 1
- 116 done_regressions += s_i
- 117 print "{} Regressions has been done, {} invalid".format(done_regressions, invalid_regressions)
- 118 ordered_combs = combs_rmse[combs_rmse[:, 1].argsort()]
- 119 return ordered_combs
- Total time: 5.09919 s
- File: massive_multilinear_regresions.py
- Function: perform_regressions at line 53
- Line # Hits Time Per Hit % Time Line Contents
- ==============================================================
- 53 @do_profile(follow=[find_best_models_gpu, find_best_models_cpu])
- 54 def perform_regressions():
- 55 1 3.0 3.0 0.0 start_time = time()
- 56 1 3262.0 3262.0 0.1 input_file, window, max_predictors, min_predictors, metric, output_file, device, max_batch_size = parse_arguments()
- 57 1 1.0 1.0 0.0 if device == "gpu":
- 58 1 39.0 39.0 0.0 print "Running calculations on GPU"
- 59 1 2739787.0 2739787.0 53.7 ordered_combs = find_best_models_gpu(file_name=input_file, min_predictors=min_predictors, max_predictors=max_predictors, metric=metric, window=window, max_batch_size=max_batch_size)
- 60 1 32.0 32.0 0.0 print "Using GPU to do regressions took {}".format(time() - start_time)
- 61 elif device == "cpu":
- 62 ordered_combs = find_best_models_cpu(file_name=input_file, min_predictors=min_predictors, max_predictors=max_predictors, metric=metric, window=window, max_batch_size=max_batch_size)
- 63 1 1063.0 1063.0 0.0 df = pd.DataFrame(ordered_combs)
- 64 1 2354999.0 2354999.0 46.2 df.to_csv("/tmp/{}".format(output_file))
- python massive_multilinear_regresions.py -i -mp 3 -np 1 -d gpu 6,38s user 7,71s system 84% cpu 16,650 total
Add Comment
Please, Sign In to add comment