// System includes
#define WIN32
#include <stdio.h>
#include <stdlib.h>   // malloc, free, exit, abs
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

// Number of elements to reduce (2^30)
const unsigned int VEC_SIZE = 1073741824;

// Sums 2*blockDim.x elements per thread block (in global memory) - version with warp divergence
__global__ void block_sum_rw(int* dane, int* wyniki) {

    // Index of the first value read, derived from the block and thread IDs (each thread sums 2 elements)
    unsigned int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    // Stride between the summed elements
    for (unsigned int odstep = 1; odstep < 2 * blockDim.x; odstep *= 2) {
        // Exclude threads in successive stages: every 2nd, 4th, 8th, ...
        if (threadIdx.x % odstep == 0)
            // Bounds check against the data size, if needed
            if (i + odstep < VEC_SIZE)
                dane[i] += dane[i + odstep];
        // Synchronize so the partial sums are ready before the next stage
        __syncthreads();
    }

    // Thread 0 writes the block's result
    if (threadIdx.x == 0) {
        wyniki[blockIdx.x] = dane[i];
    }
}

// Sums 2*blockDim.x elements per thread block (in global memory) - version without warp divergence
__global__ void block_sum_brw(int* dane, int* wyniki) {

    // Index of the first value read, derived from the block and thread IDs (each thread sums 2 elements)
    unsigned int i = blockIdx.x * 2 * blockDim.x + threadIdx.x;

    // Stride between the summed elements
    for (unsigned int odstep = blockDim.x; odstep > 0; odstep >>= 1) {
        if (threadIdx.x < odstep) {
            dane[i] += dane[i + odstep];
        }
        __syncthreads();
    }

    // Thread 0 writes the block's result
    if (threadIdx.x == 0) {
        wyniki[blockIdx.x] = dane[i];
    }
}

void constantInit(int* data, int val)
{
    for (unsigned int i = 0; i < VEC_SIZE; ++i)
    {
        data[i] = val;
    }
}

// 1 - RW version, 3 - BRW version
int reduceVector(int version, int block_size)
{
    // Calculate required grid size (in blocks)
    int grid_size = VEC_SIZE / 2 / block_size;

    // Allocate host memory
    unsigned long long vec_mem = sizeof(int) * VEC_SIZE;
    unsigned long long res_mem = sizeof(int) * grid_size;
    int* h_vec = (int*)malloc(vec_mem);
    int* h_res = (int*)malloc(res_mem);

    // Initialize host memory
    constantInit(h_vec, 1);

    // Allocate device memory
    int* d_vec;
    int* d_res;

    // Initialize device memory
    cudaError_t error;
    error = cudaMalloc((void **)&d_vec, vec_mem);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_vec returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_res, res_mem);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_res returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Copy host memory to device
    error = cudaMemcpy(d_vec, h_vec, vec_mem, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_vec,h_vec) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_res, h_res, res_mem, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_res,h_res) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");
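    // The comment above mentions a timer, but the paste never creates one.
    // A minimal sketch using CUDA events (an assumption, not part of the original code);
    // the matching stop/report calls appear right after the kernel loop below.
    cudaEvent_t ev_start, ev_stop;
    cudaEventCreate(&ev_start);
    cudaEventCreate(&ev_stop);
    cudaEventRecord(ev_start, 0);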

    // Perform the reduction with the selected CUDA kernel,
    // ping-ponging between d_vec and d_res on each pass
    int turn = 0;
    if (version == 1)
    {
        do
        {
            if (turn == 0)
                block_sum_rw <<< grid_size, block_size >>> (d_vec, d_res);
            else
                block_sum_rw <<< grid_size, block_size >>> (d_res, d_vec);
            grid_size /= 2 * block_size;
            turn = abs(turn - 1);
        } while (grid_size >= 1);
    }
    else
    {
        do
        {
            if (turn == 0)
                block_sum_brw <<< grid_size, block_size >>> (d_vec, d_res);
            else
                block_sum_brw <<< grid_size, block_size >>> (d_res, d_vec);
            grid_size /= 2 * block_size;
            turn = abs(turn - 1);
        } while (grid_size >= 1);
    }

    printf("done\n");

    cudaDeviceSynchronize();
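    // End of the timing sketch started above (again an assumption, not in the original paste):
    // record the stop event once the device has finished and report the elapsed time.
    cudaEventRecord(ev_stop, 0);
    cudaEventSynchronize(ev_stop);
    float elapsed_ms = 0.0f;
    cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop);
    printf("Kernel time: %.3f ms\n", elapsed_ms);
    cudaEventDestroy(ev_start);
    cudaEventDestroy(ev_stop);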

    // Copy result from device to host
    error = cudaMemcpy(h_vec, d_vec, vec_mem, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_vec,d_vec) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(h_res, d_res, res_mem, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_res,d_res) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // After the last pass, 'turn' tells which buffer holds the final sum
    if (turn == 1 && h_res[0] != VEC_SIZE)
    {
        printf("Error! Vector reduced to value: %d\n", h_res[0]);
        correct = false;
    }
    else if (turn == 0 && h_vec[0] != VEC_SIZE)
    {
        printf("Error! Vector reduced to value: %d\n", h_vec[0]);
        correct = false;
    }

    printf("%s\n", correct ? "OK" : "FAIL");

    // Clean up memory
    free(h_vec);
    free(h_res);
    cudaFree(d_vec);
    cudaFree(d_res);

    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}

/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Reduction Using CUDA] - Starting...\n");

    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    int devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        cudaSetDevice(devID);
    }

    cudaError_t error;
    cudaDeviceProp deviceProp;
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        exit(EXIT_SUCCESS);
    }

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
    }
    else
    {
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    // Define block size
    int block_size = 512;
    int version = 1; // 1 - RW, 3 - BRW

    printf("Block size: %d\n", block_size);

    int reduce_result = reduceVector(version, block_size);

    exit(reduce_result);
}