/**
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Parallel sum reduction.
 *
 * Based on the vector addition sample that illustrates Chapter 2 of the
 * programming guide: each block sums its portion of the input vector A into
 * a single partial result C[blockIdx.x] using a shared-memory tree
 * reduction, with error checking as in the original sample.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
#include <helper_functions.h>
#include <device_functions.h>

#define PARAM 1
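// PARAM + 1 is the number of input elements each thread accumulates before
// the shared-memory reduction, so each block covers BLOCK_SIZE * (PARAM + 1)
// input elements.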
/**
 * CUDA Kernel Device code
 *
 * Sums the numElements entries of A: each thread accumulates PARAM + 1
 * strided elements into shared memory, the block then reduces them and
 * writes its partial sum to C[blockIdx.x].
 */
template <int BLOCK_SIZE>
__global__ void
vectorAdd(float *A, float *C, int numElements)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Number of blocks in the grid, recomputed from the element count
    // (ceiling division).
    int BLOCK_NUMBER = (numElements + BLOCK_SIZE * (PARAM + 1) - 1) / (BLOCK_SIZE * (PARAM + 1));

    __shared__ float As[BLOCK_SIZE * (PARAM + 1)];
    As[threadIdx.x] = 0;

    // Each thread accumulates PARAM + 1 input elements into its shared-memory
    // slot; stride = number of blocks * block size.
    for (int j = 0; j < (PARAM + 1); j++)
        if ((i + (BLOCK_NUMBER * BLOCK_SIZE * j)) < numElements)
            As[threadIdx.x] += A[i + (BLOCK_NUMBER * BLOCK_SIZE * j)];
    __syncthreads();

    // Shared-memory tree reduction; assumes blockDim.x is a power of two.
    // Only the lower half of each range adds, so no thread reads an element
    // that another thread is writing in the same step.
    for (unsigned int odstep = blockDim.x / 2; odstep >= 1; odstep = odstep / 2) {
        if (threadIdx.x < odstep && threadIdx.x + odstep < blockDim.x) {
            As[threadIdx.x] = As[threadIdx.x] + As[threadIdx.x + odstep];
        }
        __syncthreads();
    }

    // Thread 0 writes this block's partial sum.
    if (threadIdx.x == 0) {
        C[blockIdx.x] = As[0];
    }
}

/**
 * Host main routine
 */
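/*
 * Host-side flow (as implemented below): allocate and initialize the input
 * vector on the host, copy it to the device, launch the reduction kernel,
 * copy the per-block partial sums back, and verify them against a reference
 * sum computed on the CPU.
 */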
int
main(void)
{
    const int threadsPerBlock = 256;

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 500; //TODO
    size_t size = numElements * sizeof(float);
    printf("[Addition of %d elements]\n", numElements);

    // Each block produces one partial sum, so the output vector C needs one
    // float per block (ceiling division).
    const int blocksPerGrid = (numElements + threadsPerBlock * (PARAM + 1) - 1) / (threadsPerBlock * (PARAM + 1));
    size_t sizeC = blocksPerGrid * sizeof(float);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host output vector C (one partial sum per block)
    float *h_C = (float *)malloc(sizeC);

    // Verify that allocations succeeded
    if (h_A == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    float sum = 0;
    // Initialize the host input vector and accumulate the reference sum on the CPU
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        sum += h_A[i];
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C (one float per block)
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, sizeC);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vector A from host memory to device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the reduction kernel (one partial sum per block)
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<threadsPerBlock><<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the per-block partial sums from device memory back to host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Sum the per-block partial results on the host and compare them against
    // the reference sum computed on the CPU.
    float gpuSum = 0;
    for (int b = 0; b < blocksPerGrid; ++b) gpuSum += h_C[b];

    if (fabsf(sum - gpuSum) > 0.01f)
    {
        fprintf(stderr, "Result verification failed!\ngpuSum:%f\nsum:%f\n", gpuSum, sum);
        exit(EXIT_FAILURE);
    }
    printf("gpuSum:%f\nsum:%f\nTest PASSED\n", gpuSum, sum);

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_C);

    printf("Done\n");
    return 0;
}
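
// Build sketch (file and include paths are assumptions): helper_cuda.h and
// helper_functions.h ship with the CUDA Samples, so something like
//   nvcc -I<cuda-samples>/common/inc -o vectorAdd vectorAdd.cu
// should compile this file.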