Kernel for matrix-vector multiplication:
__global__ void gpu_compute_func_and_delta_values(double* points_d, double* indexes_d, double* vec_d, int MATRIX_SIZE, int version) {
    int x_blocks_count = (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;  // column index
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;  // row index (assumes blockDim.y == 1, gridDim.y == MATRIX_SIZE)
    int tidx = threadIdx.x;
    extern __shared__ double shared_points[];

    // Each thread loads one product of its row; out-of-range columns contribute 0.
    if (gidx < MATRIX_SIZE) {
        shared_points[tidx] = points_d[gidx] * indexes_d[gidy * MATRIX_SIZE + gidx];
    }
    else {
        shared_points[tidx] = 0.0;
    }
    __syncthreads();

    // Tree reduction in shared memory down to 32 partial sums.
    if (BLOCK_SIZE >= 1024 && tidx < 512) {
        shared_points[tidx] += shared_points[tidx + 512];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 512 && tidx < 256) {
        shared_points[tidx] += shared_points[tidx + 256];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 256 && tidx < 128) {
        shared_points[tidx] += shared_points[tidx + 128];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 128 && tidx < 64) {
        shared_points[tidx] += shared_points[tidx + 64];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 64 && tidx < 32) {
        shared_points[tidx] += shared_points[tidx + 32];
    }
    __syncthreads();

    // The last warp reduces the remaining 32 values; each block writes one
    // partial sum of its row into vec_d.
    if (tidx < 32) {
        if (version >= 7) {
            // Register-only reduction via warp shuffles.
            double sum = shared_points[tidx];
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 16);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 8);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 4);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 2);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 1);
            if (tidx == 0) {
                vec_d[gidy * x_blocks_count + blockIdx.x] = sum;
            }
        }
        else {
            // Shared-memory warp reduction: the read and the write of each step
            // are separated by __syncwarp(), so the read-modify-write cannot
            // race under independent thread scheduling (Volta and later).
            double v = shared_points[tidx];
            v += shared_points[tidx + 16]; __syncwarp(); shared_points[tidx] = v; __syncwarp();
            v += shared_points[tidx + 8];  __syncwarp(); shared_points[tidx] = v; __syncwarp();
            v += shared_points[tidx + 4];  __syncwarp(); shared_points[tidx] = v; __syncwarp();
            v += shared_points[tidx + 2];  __syncwarp(); shared_points[tidx] = v; __syncwarp();
            v += shared_points[tidx + 1];
            if (tidx == 0) {
                vec_d[gidy * x_blocks_count + blockIdx.x] = v;
            }
        }
    }
}
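The kernel relies on BLOCK_SIZE and SHAFFLE_CONST being defined elsewhere. The definitions below are assumptions consistent with how they are used: BLOCK_SIZE must equal blockDim.x (and, given the reduction's indexing, be at least 64), and SHAFFLE_CONST is the participation mask passed to __shfl_down_sync, where a full-warp mask is the usual choice. A minimal sketch:

// Assumed definitions (not shown in the original paste):
#define BLOCK_SIZE 1024           // threads per block in x; must equal blockDim.x, >= 64 here
#define SHAFFLE_CONST 0xffffffff  // full-warp mask: all 32 lanes participate in the shuffles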
Function for cuBLAS inversion:
void gpu_cublasInverse(DataInitializer* data) {
    // Batched LU factorization (batch size 1) in place on the Jacobian,
    // then out-of-place inversion from the LU factors. Both calls go to the
    // handle's stream, so the getri is ordered after the getrf.
    cublasStatus_t status_lu = cublasDgetrfBatched(data->cublasContextHandler,
        data->MATRIX_SIZE,
        data->cublas_ajacobian_d,              // device array of matrix pointers
        data->MATRIX_SIZE,                     // leading dimension
        data->cublas_pivot,
        data->cublas_info, 1);
    cublasStatus_t status_inv = cublasDgetriBatched(data->cublasContextHandler,
        data->MATRIX_SIZE,
        (const double**)data->cublas_ajacobian_d,
        data->MATRIX_SIZE,
        data->cublas_pivot,
        data->cublas_ainverse_jacobian_d,      // receives the inverse
        data->MATRIX_SIZE,
        data->cublas_info, 1);
}
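Neither returned status is inspected, and the info array is never read back. A minimal sketch of what checking could look like just before the closing brace of gpu_cublasInverse (assuming cublas_info holds a single int in device memory, as the batched getrf/getri API expects, and that <cstdio> is included):

    // Sketch of error checking (assumed; not in the original paste).
    if (status_lu != CUBLAS_STATUS_SUCCESS || status_inv != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS batched getrf/getri call failed\n");
    }
    int info_h = 0;  // info for the single batch entry
    cudaMemcpy(&info_h, data->cublas_info, sizeof(int), cudaMemcpyDeviceToHost);
    if (info_h != 0) {
        // A nonzero info from getrf/getri flags a zero pivot: singular Jacobian.
        fprintf(stderr, "LU factorization/inversion failed, info = %d\n", info_h);
    }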
How I launch it (the ###loop markers delimit the body of the iteration loop):
###loop
#ifdef INTERMEDIATE_RESULTS
auto start = std::chrono::high_resolution_clock::now();
#endif
// Step 1: partial row sums of A*x; each block writes one partial per row.
gpu_compute_func_and_delta_values<<<gridDim, blockDim, blockDim.x * sizeof(double)>>>(
    data->points_d, data->indexes_d, data->intermediate_funcs_value_d, data->MATRIX_SIZE, version);
cudaDeviceSynchronize();
cudaMemcpy(data->intermediate_funcs_value_h, data->intermediate_funcs_value_d,
    x_blocks_count * data->MATRIX_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
// Finish each row sum on the host and subtract b: F(x) = A*x - b.
for (int i = 0; i < data->MATRIX_SIZE; i++) {
    data->funcs_value_h[i] = -data->vector_b_h[i];
    for (int j = 0; j < x_blocks_count; j++) {
        data->funcs_value_h[i] += data->intermediate_funcs_value_h[i * x_blocks_count + j];
    }
}
cudaMemcpy(data->funcs_value_d, data->funcs_value_h, data->MATRIX_SIZE * sizeof(double), cudaMemcpyHostToDevice);
#ifdef INTERMEDIATE_RESULTS
auto end = std::chrono::high_resolution_clock::now();
data->intermediate_results[1] = std::chrono::duration<double>(end - start).count();
start = std::chrono::high_resolution_clock::now();
#endif
// Step 2: invert the Jacobian with cuBLAS.
gpu_cublasInverse(data);
#ifdef INTERMEDIATE_RESULTS
end = std::chrono::high_resolution_clock::now();
data->intermediate_results[2] = std::chrono::duration<double>(end - start).count();
start = std::chrono::high_resolution_clock::now();
#endif
// Step 3: the same kernel computes partials of J^-1 * F(x); the host then
// negates the accumulated sums to get the update delta = -J^-1 * F(x).
gpu_compute_func_and_delta_values<<<gridDim, blockDim, blockDim.x * sizeof(double)>>>(
    data->funcs_value_d, data->inverse_jacobian_d, data->delta_d, data->MATRIX_SIZE, version);
cudaDeviceSynchronize();
cudaMemcpy(data->delta_h, data->delta_d, x_blocks_count * data->MATRIX_SIZE * sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 0; i < data->MATRIX_SIZE; i++) {
    delta[i] = 0;
    for (int j = 0; j < x_blocks_count; j++) {
        delta[i] -= data->delta_h[i * x_blocks_count + j];
    }
}
#ifdef INTERMEDIATE_RESULTS
end = std::chrono::high_resolution_clock::now();
data->intermediate_results[3] = std::chrono::duration<double>(end - start).count();
start = std::chrono::high_resolution_clock::now();
#endif
###loop
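The paste does not show how blockDim, gridDim, or x_blocks_count are initialized. A setup consistent with the kernel's indexing (gidx sweeps columns in BLOCK_SIZE-wide tiles, gidy selects the row, and each block stores one partial per row at gidy * x_blocks_count + blockIdx.x) would look like this sketch:

// Assumed host-side setup; names mirror the code above.
int x_blocks_count = (data->MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blockDim(BLOCK_SIZE, 1);                      // 1D block over columns
dim3 gridDim(x_blocks_count, data->MATRIX_SIZE);   // column tiles x matrix rows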