#include <stdio.h>
#include <assert.h>

#define BLOCK_SIZE 16
#define N 1618 // available to all threads on the device

// kernel
// optimized (shared-memory) version of adjacent difference
__global__ void adj_diff_naive(int *result, int *input)
{
    // shorthand for threadIdx.x
    int tx = threadIdx.x;

    // allocate a __shared__ array, one element per thread
    __shared__ int s_data[BLOCK_SIZE];

    // each thread reads one element into s_data
    unsigned int i = blockDim.x * blockIdx.x + tx;
    s_data[tx] = input[i];

    // avoid a race condition: ensure all loads complete
    // before continuing
    __syncthreads();

    // every thread except the first in its block can take its left
    // neighbour directly from shared memory
    if (tx > 0)
    {
        result[i] = s_data[tx] - s_data[tx - 1];
    }
    else if (i > 0)
    {
        // the first thread of a block: its left neighbour lives in the
        // previous block, so read it from global memory instead
        result[i] = s_data[tx] - input[i - 1];
    }
}

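// None of the CUDA runtime calls below have their return values checked.
// A minimal sketch of an error-checking helper, assuming the caller wants
// to abort on failure (checkCuda is an arbitrary helper name, not a CUDA
// API; cudaError_t, cudaSuccess and cudaGetErrorString are standard
// runtime symbols):
#include <stdlib.h>

void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
// Example use: checkCuda(cudaMalloc((void **)&d_a, memSize), "cudaMalloc d_a");
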
int main(int argc, char **argv)
{
    // pointer to host memory
    int *h_a;

    // size of array
    //int dimA = 256 * 1024; // 256K elements (1MB total)
    int dimA = BLOCK_SIZE;

    // pointers to device memory
    int *d_b, *d_a;

    // define block size; it must equal BLOCK_SIZE because the kernel's
    // shared array holds exactly one element per thread in the block
    int numThreadsPerBlock = BLOCK_SIZE;

    // compute the number of blocks needed based on
    // array size and desired block size
    int numBlocks = dimA / numThreadsPerBlock;

    // allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *)malloc(memSize);
    cudaMalloc((void **)&d_a, memSize); // allocate d_a
    cudaMalloc((void **)&d_b, memSize); // allocate d_b

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Signature: cudaMemcpy(void *dst, const void *src, size_t nbytes, enum cudaMemcpyKind direction);
    //
    // copy the host array to the device array ==> cudaMemcpyHostToDevice
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    // s_data is declared statically in the kernel, so no dynamic
    // shared-memory size needs to be passed at launch
    adj_diff_naive<<<dimGrid, dimBlock>>>(d_b, d_a);
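
    // A <<<...>>> launch does not itself return a status. One minimal way
    // to surface a bad launch configuration, using only the standard CUDA
    // runtime calls cudaGetLastError and cudaGetErrorString, is:
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchErr));
        return 1;
    }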

    // block until the device has completed
    cudaDeviceSynchronize();

    // device to host copy ==> cudaMemcpyDeviceToHost
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // verify the data returned to the host is correct: with input[i] = i,
    // every adjacent difference is 1 (element 0 is never written by the
    // kernel, so start at i = 1)
    for (int i = 1; i < dimA; i++)
    {
        assert(h_a[i] == 1);
        //printf("%d ", h_a[i]);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // if the program makes it this far, then the results are correct
    printf("Correct!\n");

    return 0;
}
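
// Build-and-run sketch (the file name adj_diff.cu is assumed; any .cu
// name works with nvcc):
//   nvcc -o adj_diff adj_diff.cu
//   ./adj_diff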