
CUDA array reduction optimisation

#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <cuda.h>

#include "device_launch_parameters.h"

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>

#include <random>
#include <iostream>
#include <chrono>

#define CUDA_CHECK {\
    cudaError_t cu_error = cudaGetLastError();                                  \
    if (cu_error != cudaSuccess) {                                              \
      std::cout << "Cuda error: " << cudaGetErrorString(cu_error) << std::endl; \
    }                                                                           \
  }
struct custom_functor {
    float factor;
    custom_functor(float _factor) {
      factor = _factor;
    }
    __host__ __device__ int operator()(float &x) const {
        return (int) floor(x / factor);
    }
};

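// Variant 1: "custom reduce". Every thread owns an Na-wide slice of d_temp_a,
// builds a private histogram there, and the slices are then summed into d_a.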
__global__ void custom_reduce_kernel(float *d_x, float *d_a, float *d_temp_a, int N, int Na, float da)
{
  // Global thread ID and grid stride
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  float ix;

  // Phase 1: accumulate into this thread's private copy of the histogram
  for(int x = index; x < N; x += stride) {
      ix = floor(d_x[x] / da);

      d_temp_a[((int)ix) + Na * index] += 0.5f;
      d_temp_a[((int)ix + 1) + Na * index] += 0.5f;
  }
  // Note: __syncthreads() only synchronizes threads within one block, not the
  // whole grid, so phase 2 can read partial results written by other blocks.
  // A fully correct two-phase reduction needs a grid-wide barrier, e.g. two
  // separate kernel launches.
  __syncthreads();

  // Phase 2: sum the per-thread copies into the final array
  for(int l = index; l < Na; l += stride) {
      for(int m = 0; m < stride; m += 1) {
          d_a[l] += d_temp_a[l + Na * m];
      }
  }
  __syncthreads();
}

void test_custom_reduce(float *d_x, float *d_a, float *h_a, int N, int Na, float da)
{
  int blockSize = 512; // Number of threads in each thread block
  int gridSize = (int)ceil((float) N / blockSize); // Number of thread blocks in grid

  // Create temp d_a array (one Na-wide slice per thread). cudaMalloc does not
  // zero memory, so both accumulation buffers must be cleared first.
  float *d_temp_a;
  cudaMalloc((void **) &d_temp_a, Na * blockSize * gridSize * sizeof(float));
  CUDA_CHECK;
  cudaMemset(d_temp_a, 0, Na * blockSize * gridSize * sizeof(float));
  cudaMemset(d_a, 0, Na * sizeof(float));

  custom_reduce_kernel<<<gridSize,blockSize>>>(d_x, d_a, d_temp_a, N, Na, da);
  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(d_temp_a);
}

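// Variant 2: Thrust. Each sample is mapped to its bin index, the indices are
// sorted and compressed with reduce_by_key into (bin, count) pairs, and a
// small kernel then scatters count * 0.5 into d_a[bin] and d_a[bin + 1] with
// atomics.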
__global__ void thrust_reduce_kernel(float *d_a, int* d_a_idxs, int* d_a_cnts, int N, int Na, int n_entries)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= n_entries)
    return;

  int a_idx = d_a_idxs[index];
  int a_cnt = d_a_cnts[index];

  // Guard against out-of-range bin indices (both a_idx and a_idx + 1 must fit)
  if (a_idx < 0 || (a_idx + 1) >= Na)
  {
    printf("Should not happen according to you!\n");
    return;
  }

  atomicAdd(&d_a[a_idx], a_cnt * 0.5f);
  atomicAdd(&d_a[a_idx+1], a_cnt * 0.5f);
}

void test_thrust_reduce(float *d_x, float *d_a, float *h_a, int N, int Na, float da)
{
  int *d_xi, *d_ones;
  int *d_a_cnt_keys, *d_a_cnt_vals;

  cudaMalloc((void**) &d_xi, N * sizeof(int));
  cudaMalloc((void**) &d_ones, N * sizeof(int));

  cudaMalloc((void**) &d_a_cnt_keys, Na * sizeof(int));
  cudaMalloc((void**) &d_a_cnt_vals, Na * sizeof(int));
  CUDA_CHECK;

  thrust::device_ptr<float> dt_x(d_x);
  thrust::device_ptr<float> dt_a(d_a);
  thrust::device_ptr<int> dt_xi(d_xi);
  thrust::device_ptr<int> dt_ones(d_ones);
  thrust::device_ptr<int> dt_a_cnt_keys(d_a_cnt_keys);
  thrust::device_ptr<int> dt_a_cnt_vals(d_a_cnt_vals);

  custom_functor f(da);
  thrust::fill(thrust::device, dt_a, dt_a + Na, 0.0f);
  thrust::fill(thrust::device, dt_ones, dt_ones + N, 1);
  thrust::fill(thrust::device, dt_a_cnt_keys, dt_a_cnt_keys + Na, -1);
  thrust::fill(thrust::device, dt_a_cnt_vals, dt_a_cnt_vals + Na, 0);

  // Map samples to bin indices, then sort so equal bins are adjacent
  thrust::transform(thrust::device, dt_x, dt_x + N, dt_xi, f);
  thrust::sort(thrust::device, dt_xi, dt_xi + N);

  // Compress the sorted indices into (bin index, count) pairs
  thrust::pair<thrust::device_ptr<int>,thrust::device_ptr<int>> new_end;
  new_end = thrust::reduce_by_key(thrust::device, dt_xi, dt_xi + N, dt_ones,
                                  dt_a_cnt_keys, dt_a_cnt_vals);

  int n_entries = new_end.first - dt_a_cnt_keys;

  dim3 dimBlock(256);
  dim3 dimGrid((n_entries + dimBlock.x - 1) / dimBlock.x);
  thrust_reduce_kernel<<<dimGrid, dimBlock>>>(d_a, d_a_cnt_keys, d_a_cnt_vals, N, Na, n_entries);
  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(d_xi);
  cudaFree(d_ones);
  cudaFree(d_a_cnt_keys);
  cudaFree(d_a_cnt_vals);
}

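// Variant 3: one atomicAdd per sample, no temporary storage.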
__global__ void simple_atomicAdd_kernel(const float *d_x, float *d_a, float da, int N, int Na)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= N)
    return;

  int a_idx = floor(d_x[index] / da); // in principle i < size(a)

  atomicAdd(&d_a[a_idx], 0.5f);
  atomicAdd(&d_a[a_idx+1], 0.5f);
}

void test_simple_atomicAdd(float *d_x, float *d_a, float *h_a, int N, int Na, float da)
{
  cudaMemset(d_a, 0, Na * sizeof(float));

  dim3 dimBlock(256);
  dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);
  simple_atomicAdd_kernel<<<dimGrid, dimBlock>>>(d_x, d_a, da, N, Na);
  cudaMemcpy(h_a, d_a, Na * sizeof(float), cudaMemcpyDeviceToHost);
}

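// CPU reference implementation used to check the GPU results.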
void reference(float* h_x, float* h_a, int N, float da)
{
  for(int j = 0; j < N; j++) {
    float i = floor(h_x[j] / da); // in principle i < size(a)

    h_a[(int)i] += 0.5;
    h_a[(int)i+1] += 0.5; // I simplify the problem
  }
}

int main(int argc, char **argv)
{
  float da = 0.1f;
  int N = 100000;
  int Na = 4096;

  float L = 50; // box size
  float dxMesh = L / Na; // cell size

  float *h_x = (float *)malloc(N * sizeof(float));

  float *h_a1 = (float *)malloc(Na * sizeof(float));
  float *h_a2 = (float *)malloc(Na * sizeof(float));
  float *h_a3 = (float *)malloc(Na * sizeof(float));
  // calloc so the reference accumulation starts from zero
  float *h_a_reference = (float *)calloc(Na, sizeof(float));

  /* Initialize random seed: */
  std::default_random_engine generator;
  std::uniform_real_distribution<float> generate_unif_dist(0.0, 1.0);

  // h_x random initialisation
  for (int x = 0; x < N; x++) {
      float random = generate_unif_dist(generator);
      h_x[x] = random * L;
  }

  float *d_x, *d_a;
  cudaMalloc((void**) &d_x, N * sizeof(float));
  cudaMalloc((void**) &d_a, Na * sizeof(float));

  cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();

  std::chrono::steady_clock::time_point current_bi = std::chrono::steady_clock::now();
  reference(h_x, h_a_reference, N, da);
  std::chrono::steady_clock::time_point current_ai = std::chrono::steady_clock::now();
  float time_ref = std::chrono::duration_cast<std::chrono::microseconds>(current_ai - current_bi).count();

  current_bi = std::chrono::steady_clock::now();
  test_custom_reduce(d_x, d_a, h_a1, N, Na, da);
  current_ai = std::chrono::steady_clock::now();
  float time1 = std::chrono::duration_cast<std::chrono::microseconds>(current_ai - current_bi).count();

  current_bi = std::chrono::steady_clock::now();
  test_thrust_reduce(d_x, d_a, h_a2, N, Na, da);
  current_ai = std::chrono::steady_clock::now();
  float time2 = std::chrono::duration_cast<std::chrono::microseconds>(current_ai - current_bi).count();

  current_bi = std::chrono::steady_clock::now();
  test_simple_atomicAdd(d_x, d_a, h_a3, N, Na, da);
  current_ai = std::chrono::steady_clock::now();
  float time3 = std::chrono::duration_cast<std::chrono::microseconds>(current_ai - current_bi).count();

  // Compare each GPU result against the CPU reference
  for (int i = 0; i < Na; i++)
  {
    if (fabs(h_a_reference[i]-h_a1[i]) > 0.0001)
      std::cout << "Error 1: " << i << " - " << h_a_reference[i] << " != " << h_a1[i] << std::endl;

    if (fabs(h_a_reference[i]-h_a2[i]) > 0.0001)
      std::cout << "Error 2: " << i << " - " << h_a_reference[i] << " != " << h_a2[i] << std::endl;

    if (fabs(h_a_reference[i]-h_a3[i]) > 0.0001)
      std::cout << "Error 3: " << i << " - " << h_a_reference[i] << " != " << h_a3[i] << std::endl;
  }

  // Durations are measured in microseconds
  std::cout << "Times: " << std::endl;
  std::cout << "- CPU Reference:         " << time_ref << " us" << std::endl;
  std::cout << "- CUDA Custom reduce:    " << time1 << " us" << std::endl;
  std::cout << "- CUDA Thrust reduce:    " << time2 << " us" << std::endl;
  std::cout << "- CUDA Simple atomicAdd: " << time3 << " us" << std::endl;

  free(h_x);
  free(h_a1);
  free(h_a2);
  free(h_a3);
  free(h_a_reference);

  cudaFree(d_x);
  cudaFree(d_a);

  return 0;
}
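// Build sketch (the source file name is an assumption, not part of the
// original paste):
//   nvcc -O3 -o reduce_bench cuda_array_reduction.cu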