Advertisement
Guest User

Untitled

a guest
Aug 21st, 2014
301
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.10 KB | None | 0 0
  1. $ cat t544.cu
  #include <stdio.h>
  #include <stdlib.h>
  // Smaller of two values. Both arguments and the whole expansion are
  // parenthesized so operator precedence at the call site cannot change the
  // meaning. Each argument may be evaluated twice, so pass side-effect-free
  // expressions only.
  #define imin(a,b) ((a) < (b) ? (a) : (b))

  // Evaluates a CUDA runtime call once; if it did not return cudaSuccess,
  // prints the error string with the file and line of the call site to stderr
  // and terminates the process. Wrapped in do/while(0) so it behaves as a
  // single statement.
  #define HANDLE_ERROR(x)                                                    \
      do {                                                                   \
          cudaError_t handle_error_status_ = (x);                            \
          if (handle_error_status_ != cudaSuccess) {                         \
              fprintf( stderr, "%s in %s at line %d\n",                      \
                       cudaGetErrorString( handle_error_status_ ),           \
                       __FILE__, __LINE__ );                                 \
              exit( EXIT_FAILURE );                                          \
          }                                                                  \
      } while (0)

  // Number of elements in each input vector.
  const int N = 33 * 1024;
  // Threads per block; also the size of the kernel's shared-memory cache.
  const int threadsPerBlock = 256;
  // Blocks per grid: enough to cover N with one element per thread, capped
  // at 32 (the kernel loops over any remaining elements).
  const int blocksPerGrid =
      imin( 32, (N + threadsPerBlock - 1) / threadsPerBlock );
  10.  
  11.  
  // Computes per-block partial sums of the dot product of a and b.
  //
  // Launch layout: 1D grid of 1D blocks. Each thread accumulates products for
  // indices tid, tid + blockDim.x*gridDim.x, ... below N (grid-stride loop),
  // then the block reduces its per-thread sums in shared memory.
  //
  // Parameters:
  //   a, b - input vectors; indices 0..N-1 are read.
  //   c    - output; element blockIdx.x receives this block's partial sum,
  //          so it needs at least gridDim.x elements.
  //
  // Preconditions: blockDim.x must not exceed threadsPerBlock, because the
  // static shared cache holds exactly threadsPerBlock floats. blockDim.x does
  // not need to be a power of two.
  __global__ void dot( float *a, float *b, float *c ) {
      __shared__ float cache[threadsPerBlock];
      const int cacheIndex = threadIdx.x;

      // Per-thread accumulation over a grid-stride walk of the inputs.
      float temp = 0.0f;
      for (int tid = threadIdx.x + blockIdx.x * blockDim.x;
           tid < N;
           tid += blockDim.x * gridDim.x) {
          temp += a[tid] * b[tid];
      }

      // Publish this thread's sum, then wait until every thread in the
      // block has written its slot before anyone reads a neighbour's.
      cache[cacheIndex] = temp;
      __syncthreads();

      // Tree reduction over the first `active` cache slots. Each round folds
      // the upper part onto the lower part; rounding the split point up means
      // an odd `active` leaves its middle slot in place for the next round
      // instead of dropping it. For power-of-two block sizes the pairs are
      // (k, k + active/2), i.e. plain halving.
      // The loop bounds depend only on blockDim.x, so every thread in the
      // block reaches each __syncthreads().
      for (int active = blockDim.x; active > 1; ) {
          const int half = (active + 1) / 2;
          if (cacheIndex + half < active)
              cache[cacheIndex] += cache[cacheIndex + half];
          __syncthreads();
          active = half;
      }

      // Thread 0 holds the block total.
      if (cacheIndex == 0)
          c[blockIdx.x] = cache[0];
  }
  42.  
  43.  
  // Host driver: fills two N-element vectors (a[i] = i, b[i] = 2*i), runs the
  // dot kernel to get one partial sum per block, adds the partial sums on the
  // CPU and prints the result next to the closed-form value
  // 2 * sum of k^2 for k = 0..N-1.
  // Returns 0 on success, 1 if a host allocation fails.
  int main( void ) {
      float *a, *b, c, *partial_c;
      float *dev_a, *dev_b, *dev_partial_c;

      // allocate memory on the cpu side
      a = (float*)malloc( N*sizeof(float) );
      b = (float*)malloc( N*sizeof(float) );
      partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
      if (a == NULL || b == NULL || partial_c == NULL) {
          fprintf( stderr, "host memory allocation failed\n" );
          // free(NULL) is a no-op, so releasing all three is safe
          free( a );
          free( b );
          free( partial_c );
          return 1;
      }

      // allocate the memory on the GPU
      HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                                N*sizeof(float) ) );
      HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                                N*sizeof(float) ) );
      HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                                blocksPerGrid*sizeof(float) ) );

      // fill in the host memory with data
      for (int i=0; i<N; i++) {
          a[i] = i;
          b[i] = i*2;
      }

      // copy the arrays 'a' and 'b' to the GPU
      HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                                cudaMemcpyHostToDevice ) );
      HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                                cudaMemcpyHostToDevice ) );

      dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                              dev_partial_c );
      // a kernel launch returns no status itself; query launch errors
      // (e.g. an invalid configuration) explicitly
      HANDLE_ERROR( cudaGetLastError() );

      // copy the per-block partial sums back from the GPU to the CPU
      HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                                blocksPerGrid*sizeof(float),
                                cudaMemcpyDeviceToHost ) );

      // finish up on the CPU side
      c = 0;
      for (int i=0; i<blocksPerGrid; i++) {
          c += partial_c[i];
      }

      // closed form for 0^2 + 1^2 + ... + x^2
      #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
      printf( "Does GPU value %.6g = %.6g?\n", c,
              2 * sum_squares( (float)(N - 1) ) );

      // free memory on the gpu side
      HANDLE_ERROR( cudaFree( dev_a ) );
      HANDLE_ERROR( cudaFree( dev_b ) );
      HANDLE_ERROR( cudaFree( dev_partial_c ) );

      // free memory on the cpu side
      free( a );
      free( b );
      free( partial_c );

      return 0;
  }
  101.  
  102. $ nvcc -arch=sm_20 -o t544 t544.cu
  103. $ cuda-memcheck ./t544
  104. ========= CUDA-MEMCHECK
  105. Does GPU value 2.57236e+13 = 2.57236e+13?
  106. ========= ERROR SUMMARY: 0 errors
  107. $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement