Method for mat-vec multiplication:

__global__ void gpu_compute_func_and_delta_values(double* points_d, double* indexes_d, double* vec_d, int MATRIX_SIZE, int version) {
    int x_blocks_count = (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int gidy = blockDim.y * blockIdx.y + threadIdx.y;   // row index; the launch must keep it < MATRIX_SIZE
    int tidx = threadIdx.x;
    extern __shared__ double shared_points[];
    // Each thread loads one element-wise product of the row with the vector.
    if (gidx < MATRIX_SIZE) {
        shared_points[tidx] = points_d[gidx] * indexes_d[gidy * MATRIX_SIZE + gidx];
    }
    else {
        shared_points[tidx] = 0.0;
    }
    __syncthreads();
    // Tree reduction in shared memory down to the first warp.
    if (BLOCK_SIZE >= 1024 && tidx < 512) {
        shared_points[tidx] += shared_points[tidx + 512];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 512 && tidx < 256) {
        shared_points[tidx] += shared_points[tidx + 256];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 256 && tidx < 128) {
        shared_points[tidx] += shared_points[tidx + 128];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 128 && tidx < 64) {
        shared_points[tidx] += shared_points[tidx + 64];
    }
    __syncthreads();
    if (BLOCK_SIZE >= 64 && tidx < 32) {
        shared_points[tidx] += shared_points[tidx + 32];
    }
    __syncthreads();
    if (tidx < 32) {
        if (version >= 7) {
            // Warp-shuffle reduction; SHAFFLE_CONST should be the full-warp mask 0xffffffff.
            double sum = shared_points[tidx];
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 16);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 8);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 4);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 2);
            sum += __shfl_down_sync(SHAFFLE_CONST, sum, 1);
            if (tidx == 0) {
                vec_d[gidy * x_blocks_count + blockIdx.x] = sum;
            }
        }
        else {
            // Shared-memory warp reduction. Read, sync, write back, sync: the original
            // one-line "shared_points[tidx] += shared_points[tidx + N]; __syncwarp();"
            // races a lane's read against a neighbouring lane's write.
            double sum = shared_points[tidx];
            sum += shared_points[tidx + 16]; __syncwarp();
            shared_points[tidx] = sum;       __syncwarp();
            sum += shared_points[tidx + 8];  __syncwarp();
            shared_points[tidx] = sum;       __syncwarp();
            sum += shared_points[tidx + 4];  __syncwarp();
            shared_points[tidx] = sum;       __syncwarp();
            sum += shared_points[tidx + 2];  __syncwarp();
            shared_points[tidx] = sum;       __syncwarp();
            sum += shared_points[tidx + 1];
            if (tidx == 0) {
                vec_d[gidy * x_blocks_count + blockIdx.x] = sum;
            }
        }
    }
}
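
The kernel assumes blockDim.x == BLOCK_SIZE, that BLOCK_SIZE is a multiple of 32, and a 2D grid: blocks along x cover one row in BLOCK_SIZE-wide chunks, and gridDim.y supplies one row index per block (so gidy never needs its own bounds check). A minimal launch sketch under those assumptions; the buffer names mirror the kernel parameters, and vec_d must hold x_blocks_count partial sums per row:

int x_blocks_count = (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blockDim(BLOCK_SIZE, 1);
dim3 gridDim(x_blocks_count, MATRIX_SIZE);
// One shared double per thread, as the kernel's reduction expects.
gpu_compute_func_and_delta_values<<<gridDim, blockDim, BLOCK_SIZE * sizeof(double)>>>(
    points_d, indexes_d, vec_d, MATRIX_SIZE, version);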

Method for cuBLAS inversion:

void gpu_cublasInverse(DataInitializer* data) {
    // LU-factorize the Jacobian in place (batch of one)...
    cublasStatus_t status_lu = cublasDgetrfBatched(data->cublasContextHandler,
        data->MATRIX_SIZE,
        data->cublas_ajacobian_d,
        data->MATRIX_SIZE,
        data->cublas_pivot,
        data->cublas_info, 1);
    // ...then compute the inverse from the LU factors into a separate array.
    cublasStatus_t status_inv = cublasDgetriBatched(data->cublasContextHandler,
        data->MATRIX_SIZE,
        (const double**)data->cublas_ajacobian_d,
        data->MATRIX_SIZE,
        data->cublas_pivot,
        data->cublas_ainverse_jacobian_d,
        data->MATRIX_SIZE,
        data->cublas_info, 1);
}
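
Both cuBLAS calls return a status that this function currently ignores, and getrf additionally reports singularity through cublas_info. The batched entry points also expect a device-side array of matrix pointers even for a batch of one. A hedged sketch of how those buffers might be prepared and checked (jacobian_d, info_h, and the surrounding setup are assumptions, not taken from the paste):

double* jacobian_d;           // the actual MATRIX_SIZE x MATRIX_SIZE matrix (hypothetical name)
double** cublas_ajacobian_d;  // device array holding a single pointer to it
int *cublas_pivot, *cublas_info;

cudaMalloc(&jacobian_d, MATRIX_SIZE * MATRIX_SIZE * sizeof(double));
cudaMalloc(&cublas_ajacobian_d, sizeof(double*));
cudaMemcpy(cublas_ajacobian_d, &jacobian_d, sizeof(double*), cudaMemcpyHostToDevice);
cudaMalloc(&cublas_pivot, MATRIX_SIZE * sizeof(int));
cudaMalloc(&cublas_info, sizeof(int));

// After the calls: each status should equal CUBLAS_STATUS_SUCCESS, and a
// nonzero info value from getrf means the matrix was singular at that pivot.
int info_h;
cudaMemcpy(&info_h, cublas_info, sizeof(int), cudaMemcpyDeviceToHost);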

How I launch it:

### loop start

#ifdef INTERMEDIATE_RESULTS
        auto start = std::chrono::high_resolution_clock::now();
#endif
        // Partial row sums of indexes_d * points_d, one value per block per row.
        gpu_compute_func_and_delta_values<<<gridDim, blockDim, blockDim.x * sizeof(double)>>>(
            data->points_d, data->indexes_d, data->intermediate_funcs_value_d, data->MATRIX_SIZE, version);
        cudaDeviceSynchronize();
        cudaMemcpy(data->intermediate_funcs_value_h, data->intermediate_funcs_value_d, x_blocks_count * data->MATRIX_SIZE * sizeof(double), cudaMemcpyDeviceToHost);

        // Finish the reduction on the host: funcs_value = A*x - b, row by row.
        for (int i = 0; i < data->MATRIX_SIZE; i++) {
            data->funcs_value_h[i] = -data->vector_b_h[i];
            for (int j = 0; j < x_blocks_count; j++) {
                data->funcs_value_h[i] += data->intermediate_funcs_value_h[i * x_blocks_count + j];
            }
        }

        cudaMemcpy(data->funcs_value_d, data->funcs_value_h, data->MATRIX_SIZE * sizeof(double), cudaMemcpyHostToDevice);

#ifdef INTERMEDIATE_RESULTS
        auto end = std::chrono::high_resolution_clock::now();
        data->intermediate_results[1] = std::chrono::duration<double>(end - start).count();
        start = std::chrono::high_resolution_clock::now();
#endif
        gpu_cublasInverse(data);
#ifdef INTERMEDIATE_RESULTS
        end = std::chrono::high_resolution_clock::now();
        data->intermediate_results[2] = std::chrono::duration<double>(end - start).count();
        start = std::chrono::high_resolution_clock::now();
#endif

        // Multiply the inverse Jacobian by funcs_value; again one partial sum per block per row.
        gpu_compute_func_and_delta_values<<<gridDim, blockDim, blockDim.x * sizeof(double)>>>(
            data->funcs_value_d, data->inverse_jacobian_d, data->delta_d, data->MATRIX_SIZE, version);
        cudaDeviceSynchronize();

        cudaMemcpy(data->delta_h, data->delta_d, x_blocks_count * data->MATRIX_SIZE * sizeof(double), cudaMemcpyDeviceToHost);

        // Finish the reduction on the host; the sign flip gives delta = -J^{-1} * F(x).
        for (int i = 0; i < data->MATRIX_SIZE; i++) {
            delta[i] = 0;
            for (int j = 0; j < x_blocks_count; j++) {
                delta[i] -= data->delta_h[i * x_blocks_count + j];
            }
        }

#ifdef INTERMEDIATE_RESULTS
        end = std::chrono::high_resolution_clock::now();
        data->intermediate_results[3] = std::chrono::duration<double>(end - start).count();
        start = std::chrono::high_resolution_clock::now();
#endif

### loop end
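
The loop markers bracket one iteration; the step that applies delta and the stopping test are outside the excerpt. A hypothetical skeleton of the surrounding loop, for orientation only (points_h, MAX_ITER, and TOLERANCE are assumed names, not from the paste):

for (int iter = 0; iter < MAX_ITER; ++iter) {
    // ... the body shown between the loop markers above ...

    double max_step = 0.0;
    for (int i = 0; i < data->MATRIX_SIZE; i++) {
        points_h[i] += delta[i];                      // x_{k+1} = x_k + delta
        max_step = fmax(max_step, fabs(delta[i]));
    }
    cudaMemcpy(data->points_d, points_h, data->MATRIX_SIZE * sizeof(double), cudaMemcpyHostToDevice);
    if (max_step < TOLERANCE) break;                  // converged
}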
  136.  
  137.  
  138.  