Advertisement
Guest User

Untitled

a guest
Jun 11th, 2013
550
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.53 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. // For the CUDA runtime API routines (prefixed with "cuda", e.g. cudaMalloc)
  4. #include <cuda_runtime.h>
  /**
   * CUDA kernel: block-wise sum reduction in shared memory.
   *
   * Each block loads up to blockDim.x consecutive elements of A into shared
   * memory, reduces them to one value and stores that partial sum in
   * B[blockIdx.x]. The caller has to add up the gridDim.x partial sums to
   * obtain the total.
   *
   * Launch requirements:
   *   - 1D grid of 1D blocks; any block size works (a power of two is not
   *     required).
   *   - Dynamic shared memory of blockDim.x * sizeof(int) bytes must be passed
   *     as the third launch-configuration argument.
   *   - gridDim.x * blockDim.x should cover numElements; elements past the end
   *     of the grid are not summed.
   *   - B must have room for gridDim.x ints.
   *
   * Sums are accumulated in int, so the caller must keep each block's partial
   * sum within int range.
   *
   * @param A            input vector in device memory (numElements ints)
   * @param B            output vector in device memory (one int per block)
   * @param numElements  number of valid elements in A
   */
  __global__ void vectorSum(int *A, int *B, int numElements) {
    extern __shared__ int S[];

    int tid = threadIdx.x;
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Every thread writes its slot: out-of-range threads contribute 0 so the
    // reduction below never reads uninitialised shared memory.
    S[tid] = (i < numElements) ? A[i] : 0;

    // All barriers are outside of any data-dependent branch, so every thread
    // of the block reaches them even in a partially filled last block.
    __syncthreads();

    // Tree reduction over the first `active` slots. Rounding the half up keeps
    // the middle element when `active` is odd, which makes the loop correct
    // for block sizes that are not powers of two.
    int active = blockDim.x;
    while (active > 1) {
      int half = (active + 1) / 2;
      if (tid + half < active) {
        S[tid] += S[tid + half];
      }
      __syncthreads();
      active = half;
    }

    // Thread 0 publishes the block's partial sum.
    if (tid == 0) {
      B[blockIdx.x] = S[0];
    }
  }
  /**
   * Exit with a diagnostic if a CUDA runtime call did not succeed.
   *
   * @param err     status returned by the CUDA runtime call
   * @param action  what was being attempted, phrased to follow "Failed to "
   */
  static void checkCuda(cudaError_t err, const char *action) {
    if (err != cudaSuccess) {
      fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
  }

  /**
   * Host main routine.
   *
   * Usage: <program> <numElements>
   *
   * Fills a vector with 0, 1, ..., numElements - 1, reduces it on the GPU with
   * vectorSum (one partial sum per block), adds the per-block partial sums on
   * the host and prints the total.
   *
   * Returns 0 on success; exits with EXIT_FAILURE on bad arguments, host
   * allocation failure or any CUDA error.
   */
  int main(int argc, char *argv[]) {
    if (argc != 2) {
      fprintf(stderr, "ERROR: invalid number of arguments\n");
      fprintf(stderr, "Usage: %s <numElements>\n", argc > 0 ? argv[0] : "vectorSum");
      exit(EXIT_FAILURE);
    }

    const int threadsPerBlock = 256;

    // The kernel accumulates in int. With inputs 0..n-1 and 256 threads per
    // block the largest partial sum is below 256 * (2^23 - 1) < 2^31 as long
    // as n <= 2^23, so this cap keeps every partial sum within int range.
    const long maxElements = 1L << 23;

    // Parse strictly: reject empty, non-numeric, non-positive and oversized
    // input (strtol saturates on overflow, which the range check catches).
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0 || requested > maxElements) {
      fprintf(stderr, "ERROR: numElements must be an integer between 1 and %ld\n", maxElements);
      exit(EXIT_FAILURE);
    }
    int numElements = (int)requested;

    // Launch geometry and buffer sizes.
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    size_t inputBytes = (size_t)numElements * sizeof(int);
    size_t partialBytes = (size_t)blocksPerGrid * sizeof(int);   // one int per block
    size_t sharedBytes = (size_t)threadsPerBlock * sizeof(int);  // kernel's extern __shared__ array

    printf("[Vector summation of %d elements]\n", numElements);

    // Allocate the host input vector A and the host buffer B for partial sums.
    int *h_A = (int *)malloc(inputBytes);
    int *h_B = (int *)malloc(partialBytes);
    if (h_A == NULL || h_B == NULL) {
      fprintf(stderr, "Failed to allocate host vectors!\n");
      exit(EXIT_FAILURE);
    }

    // Initialize (and echo) the host input vector.
    for (int i = 0; i < numElements; ++i) {
      h_A[i] = i;
      printf("%d ", h_A[i]);
    }
    printf("\n");

    // Allocate the device input vector A and the device partial-sum vector B.
    int *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, inputBytes), "allocate device vector A");
    int *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_B, partialBytes), "allocate device vector B");

    // Only A needs to be uploaded; B is written entirely by the kernel.
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, inputBytes, cudaMemcpyHostToDevice),
              "copy vector A from host to device");

    // Launch the kernel. The third configuration argument sizes the kernel's
    // dynamic shared-memory array; without it the array has zero length.
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorSum<<<blocksPerGrid, threadsPerBlock, sharedBytes>>>(d_A, d_B, numElements);
    checkCuda(cudaGetLastError(), "launch vectorSum kernel");
    // Surface faults raised while the kernel was running.
    checkCuda(cudaDeviceSynchronize(), "execute vectorSum kernel");

    // Copy the per-block partial sums back to the host.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_B, d_B, partialBytes, cudaMemcpyDeviceToHost),
              "copy vector B from device to host");

    // Combine the partial sums on the host. A wider accumulator is used
    // because the total can exceed int range even when each partial fits.
    long long total = 0;
    for (int b = 0; b < blocksPerGrid; ++b) {
      total += h_B[b];
    }
    printf("Summation result: %lld\n", total);

    // Free device global memory.
    checkCuda(cudaFree(d_A), "free device vector A");
    checkCuda(cudaFree(d_B), "free device vector B");

    // Free host memory.
    free(h_A);
    free(h_B);

    // Reset the device and exit.
    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess) {
      fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement