SHARE
TWEET

Untitled

a guest Jun 18th, 2019 56 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.  
  #include "cuda_runtime.h"
  #include "device_launch_parameters.h"

  #include <stdio.h>
  #include <stdlib.h>
  6.  
  /*
   * Block-level sum reduction.
   *
   * Block b adds up the 2 * blockDim.x consecutive elements of `dane` that
   * start at index 2 * b * blockDim.x (elements at or beyond `rozmiar_danych`
   * count as zero) and stores the total in wyniki[b].
   *
   * Launch requirements:
   *   - 1-D grid and 1-D block, blockDim.x <= 1024 (any size, not only powers
   *     of two);
   *   - gridDim.x >= ceil(rozmiar_danych / (2 * blockDim.x)) to cover all input;
   *   - 4 KiB of static shared memory per block, no dynamic shared memory.
   *
   * `dane` is only read. `wyniki` must hold at least gridDim.x floats and must
   * not overlap `dane` when more than one block is launched: blocks are not
   * ordered with respect to each other, so one block could overwrite an input
   * element that another block has not read yet.
   */
  __global__ void block_sum_rw(float *dane, float *wyniki, const size_t rozmiar_danych)
  {
      enum { MAX_BLOCK_THREADS = 1024 };   // hardware limit on threads per block
      const unsigned int n = 2;            // input elements consumed per thread

      __shared__ float partial[MAX_BLOCK_THREADS];

      const unsigned int tid = threadIdx.x;

      // Every block owns n * blockDim.x inputs, so consecutive blocks must be
      // spaced by that amount. The product is formed in size_t to avoid 32-bit
      // overflow on large arrays.
      const size_t block_base = (size_t)n * blockIdx.x * blockDim.x;

      // Stage 1: each thread accumulates its n strided inputs in a register.
      // Adjacent threads touch adjacent addresses, so the loads coalesce.
      float acc = 0.0f;
      for (unsigned int j = 0; j < n; ++j)
      {
          const size_t idx = block_base + (size_t)j * blockDim.x + tid;
          if (idx < rozmiar_danych)
              acc += dane[idx];
      }
      partial[tid] = acc;
      __syncthreads();   // all per-thread sums must be visible before the tree

      // Stage 2: tree reduction in shared memory. `active` is the number of
      // live partial sums; each step folds the upper part onto the lower one.
      // Rounding the half up keeps the middle element when `active` is odd.
      for (unsigned int active = blockDim.x; active > 1; )
      {
          const unsigned int half = (active + 1) / 2;
          if (tid + half < active)
              partial[tid] += partial[tid + half];

          // Barrier sits outside the branch so every thread reaches it.
          __syncthreads();
          active = half;
      }

      if (tid == 0)
      {
          wyniki[blockIdx.x] = partial[0];
      }
  }
  30.  
  31.  
  // Prints a diagnostic and terminates the program when a CUDA runtime call
  // (or a kernel launch, via cudaGetLastError) reports an error.
  static void check_cuda(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess) {
          fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /*
   * Sums an array of 2^28 ones on the GPU with repeated launches of
   * block_sum_rw and prints the result.
   *
   * Each launch is expected to turn `count` inputs into
   * ceil(count / (2 * block_size)) partial sums (one per block). The partial
   * sums are reduced again until a single value remains. Two scratch buffers
   * are used alternately so that no launch reads and writes the same buffer.
   *
   * Returns 0 on success, 1 on failure (CUDA failures terminate via exit()).
   */
  int main()
  {
      const int block_size = 512;
      const size_t arraySize = (size_t)1 << 28;
      // Number of input elements one block consumes (two per thread).
      const size_t per_block = 2 * (size_t)block_size;

      float *dane = (float*)malloc(arraySize * sizeof(float));
      if (dane == NULL) {
          fprintf(stderr, "malloc failed!\n");
          return 1;
      }
      for (size_t i = 0; i < arraySize; i++)
          dane[i] = 1.0f;

      size_t count = arraySize;                               // live input elements
      size_t grid = (count + per_block - 1) / per_block;      // ceil-div
      printf("grid: %d, threads: %d\n", (int)grid, block_size);

      // The first stage produces `grid` partial sums and every later stage
      // produces fewer, so `grid` floats are enough for both scratch buffers.
      const size_t partial_bytes = grid * sizeof(float);

      float *dane_cuda = NULL, *wyniki_cuda = NULL, *zapas_cuda = NULL;
      check_cuda(cudaMalloc((void **)&dane_cuda, arraySize * sizeof(float)), "cudaMalloc (input)");
      check_cuda(cudaMalloc((void **)&wyniki_cuda, partial_bytes), "cudaMalloc (partials A)");
      check_cuda(cudaMalloc((void **)&zapas_cuda, partial_bytes), "cudaMalloc (partials B)");

      // Start from zeroed scratch buffers so unused tail slots never hold garbage.
      check_cuda(cudaMemset(wyniki_cuda, 0, partial_bytes), "cudaMemset (partials A)");
      check_cuda(cudaMemset(zapas_cuda, 0, partial_bytes), "cudaMemset (partials B)");

      check_cuda(cudaMemcpy(dane_cuda, dane, arraySize * sizeof(float), cudaMemcpyHostToDevice),
                 "cudaMemcpy (host to device)");

      float *in = dane_cuda;
      float *out = wyniki_cuda;
      float *spare = zapas_cuda;
      do {
          grid = (count + per_block - 1) / per_block;
          block_sum_rw << <(unsigned int)grid, block_size >> > (in, out, count);
          check_cuda(cudaGetLastError(), "block_sum_rw launch");

          // The outputs of this stage are the inputs of the next one.
          count = grid;
          in = out;
          out = spare;
          spare = in;
      } while (count > 1);

      // `in` now points at the buffer whose first element is the total.
      // The blocking copy also waits for the kernels and surfaces their errors.
      float wynik = 0.0f;
      check_cuda(cudaMemcpy(&wynik, in, sizeof(float), cudaMemcpyDeviceToHost),
                 "cudaMemcpy (device to host)");
      printf("sum: %.1f (expected %.1f)\n", wynik, (float)arraySize);

      check_cuda(cudaFree(dane_cuda), "cudaFree (input)");
      check_cuda(cudaFree(wyniki_cuda), "cudaFree (partials A)");
      check_cuda(cudaFree(zapas_cuda), "cudaFree (partials B)");
      free(dane);

      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      cudaError_t cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceReset failed!");
          return 1;
      }
      return 0;
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top