Advertisement
Guest User

Untitled

a guest
Jun 18th, 2019
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.25 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4.  
  5. #include <stdio.h>
  6.  
  // Per-block sum reduction.
  //
  // Each block sums a contiguous slice of 2 * blockDim.x floats of `dane`,
  // starting at element 2 * blockIdx.x * blockDim.x, and stores that partial sum
  // in wyniki[blockIdx.x]. Elements at or beyond `rozmiar_danych` are treated as
  // zero, so the last block may cover a partial slice.
  //
  // Parameters:
  //   dane           - device pointer to the input values (read only; the
  //                    input is NOT modified by this kernel).
  //   wyniki         - device pointer receiving one partial sum per block; must
  //                    hold at least gridDim.x floats.
  //   rozmiar_danych - number of valid elements in `dane`.
  //
  // Launch requirements:
  //   * 1-D grid and 1-D block; blockDim.x may be any value from 1 to 1024
  //     (it does not have to be a power of two).
  //   * gridDim.x * 2 * blockDim.x must be >= rozmiar_danych for every input
  //     element to be included in some partial sum.
  //   * Uses 4 KB of static shared memory; no dynamic shared memory is needed.
  //   * `dane` and `wyniki` must not overlap when gridDim.x > 1: one block
  //     could overwrite an element another block has not read yet.
  __global__ void block_sum_rw(float *dane, float *wyniki, const size_t rozmiar_danych)
  {
      // Number of input elements loaded by each thread.
      const unsigned int n = 2;

      // One slot per thread; 1024 is the hardware limit on threads per block.
      __shared__ float partial[1024];

      const unsigned int tid = threadIdx.x;

      // First element of this block's slice. Computed in size_t so the product
      // cannot wrap around for large grids.
      const size_t block_start = (size_t)n * blockIdx.x * blockDim.x;

      // Load phase: every thread adds up its n elements (consecutive threads
      // read consecutive addresses), skipping anything past the end of the data.
      float sum = 0.0f;
      for (unsigned int j = 0; j < n; ++j) {
          const size_t idx = block_start + (size_t)j * blockDim.x + tid;
          if (idx < rozmiar_danych)
              sum += dane[idx];
      }
      partial[tid] = sum;

      // All per-thread sums must be visible before the tree reduction starts.
      __syncthreads();

      // Tree reduction in shared memory. `active` is the number of slots that
      // still hold live values; each step folds the upper part onto the lower
      // part. Rounding the half up keeps the middle slot of an odd-sized range.
      for (unsigned int active = blockDim.x; active > 1;) {
          const unsigned int half = (active + 1) / 2;
          if (tid + half < active)
              partial[tid] += partial[tid + half];

          // Outside the `if` so that every thread of the block reaches it.
          __syncthreads();
          active = half;
      }

      if (tid == 0)
          wyniki[blockIdx.x] = partial[0];
  }
  30.  
  31.  
  // Reports a failed CUDA runtime call on stderr.
  // Returns 0 when `status` is cudaSuccess and 1 otherwise, so calls can be
  // chained with || to stop at the first failure.
  static int cuda_failed(cudaError_t status, const char *what)
  {
      if (status == cudaSuccess)
          return 0;
      fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
      return 1;
  }

  // Sums `count` floats from the host array `dane` on the GPU by launching
  // block_sum_rw repeatedly until a single value is left.
  //
  // Every pass hands two elements to each thread, so a pass over `remaining`
  // values produces ceil(remaining / (2 * threads)) partial sums. Passes
  // alternate between two device buffers so that a launch never reads from the
  // buffer it writes to.
  //
  //   dane       - host pointer to the input values.
  //   count      - number of input values.
  //   block_size - maximum threads per block (1..1024).
  //   suma       - receives the total on success.
  //
  // Returns 0 on success, 1 if any CUDA call failed (already reported on stderr).
  static int sum_on_device(const float *dane, size_t count, int block_size, float *suma)
  {
      const size_t per_block = 2 * (size_t)block_size;
      // Buffer sizes: number of partial sums after the first and second pass.
      // Later passes never produce more than these, so two buffers suffice.
      const size_t first_pass = (count + per_block - 1) / per_block;
      const size_t second_pass = (first_pass + per_block - 1) / per_block;
      float *dane_cuda = NULL;
      float *wyniki_cuda[2] = { NULL, NULL };
      float *src;
      size_t remaining = count;
      int target = 0;
      int failed;

      if (count == 0) {
          // Nothing to launch; the sum of no elements is zero.
          *suma = 0.0f;
          return 0;
      }

      failed = cuda_failed(cudaMalloc((void **)&dane_cuda, count * sizeof(float)),
                           "cudaMalloc (input)")
            || cuda_failed(cudaMalloc((void **)&wyniki_cuda[0], first_pass * sizeof(float)),
                           "cudaMalloc (partial sums 0)")
            || cuda_failed(cudaMalloc((void **)&wyniki_cuda[1], second_pass * sizeof(float)),
                           "cudaMalloc (partial sums 1)")
            || cuda_failed(cudaMemcpy(dane_cuda, dane, count * sizeof(float),
                                      cudaMemcpyHostToDevice),
                           "cudaMemcpy (input)");

      src = dane_cuda;
      while (!failed) {
          // Full-size blocks while there is enough data; otherwise one block
          // with just enough threads to cover the remaining values.
          const int threads = remaining > per_block ? block_size
                                                    : (int)((remaining + 1) / 2);
          const int grid = (int)((remaining + 2 * (size_t)threads - 1)
                                 / (2 * (size_t)threads));

          // Report the launch configuration of the first pass only.
          if (src == dane_cuda)
              printf("grid: %d, threads: %d\n", grid, threads);

          block_sum_rw<<<grid, threads>>>(src, wyniki_cuda[target], remaining);
          // Launch-configuration errors are only visible through cudaGetLastError.
          failed = cuda_failed(cudaGetLastError(), "block_sum_rw launch");

          // The partial sums just written are the input of the next pass.
          src = wyniki_cuda[target];
          remaining = (size_t)grid;
          target ^= 1;
          if (remaining == 1)
              break;
      }

      // The blocking copy also waits for the kernels and surfaces any error
      // that occurred while they were running.
      if (!failed)
          failed = cuda_failed(cudaMemcpy(suma, src, sizeof(float), cudaMemcpyDeviceToHost),
                               "cudaMemcpy (result)");

      // cudaFree accepts NULL, so this is safe after a partial allocation failure.
      cudaFree(dane_cuda);
      cudaFree(wyniki_cuda[0]);
      cudaFree(wyniki_cuda[1]);
      return failed;
  }

  // Fills an array of 2^28 floats with ones, sums it on the GPU and prints the
  // result. Returns 0 on success and 1 on any allocation or CUDA failure.
  int main()
  {
      const int block_size = 512;
      const int arraySize = 1 << 28;
      float *dane;
      float wynik = 0.0f;
      int failed;
      cudaError_t cudaStatus;

      dane = (float *)malloc((size_t)arraySize * sizeof(float));
      if (dane == NULL) {
          fprintf(stderr, "malloc failed!");
          return 1;
      }
      for (int i = 0; i < arraySize; i++)
          dane[i] = 1.0f;

      failed = sum_on_device(dane, (size_t)arraySize, block_size, &wynik);
      free(dane);

      // Every input element is 1, so the expected total equals the element count.
      if (!failed)
          printf("sum: %.0f (expected %d)\n", wynik, arraySize);

      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceReset failed!");
          return 1;
      }
      return failed;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement