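// Parallel sum reduction benchmark in CUDA: seven kernels timed with CUDA events
// and checked against a sequential host reduction.
// Input: whitespace-separated integers read from "Text.txt" in the working directory.
// Assumed build command (file name is illustrative): nvcc -O2 reduction.cu -o reduction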
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda.h>
#include <stdio.h>
#include <iostream>
#include <conio.h>
#include <fstream>
#include <sstream>
#include <string>
#include <iomanip>

// Pad the input array to a whole number of blocks so every thread maps to a
// valid element (12000 is not a multiple of 24 * 512 and caused out-of-bounds
// reads). Unused tail elements stay zero and do not affect the sum.
#define THREAD_NUMBER 512
#define BLOCK_NUMBER 24
#define ARRAY_SIZE (BLOCK_NUMBER * THREAD_NUMBER)

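//
// Seven parallel-sum reduction kernels, each one step in the classic NVIDIA
// "Optimizing Parallel Reduction in CUDA" sequence: interleaved addressing
// with divergent branches, interleaved addressing with strided indices,
// sequential addressing, first add during load, unrolling of the last warp,
// complete unrolling via a template parameter, and multiple adds per thread.
// Every block writes its partial sum to outputData[blockIdx.x]; the host then
// adds the per-block partials to obtain the final result.
// Note: kernel4-kernel6 load two elements per thread, so they are launched
// with half as many blocks as the others (see main()).
//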
// kernel1: interleaved addressing with a modulo test; highly divergent
// because neighbouring threads take different branches.
__global__ void kernel1(int *inputData, int *outputData) {
    // One shared slot per thread is enough; a full ARRAY_SIZE array would
    // exhaust shared memory and cripple occupancy.
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[index];
    __syncthreads();

    // reduction: the stride doubles each pass, only threads whose index is a
    // multiple of 2*i do work
    for (int i = 1; i < blockDim.x; i *= 2) {
        if (threadIndex % (2 * i) == 0) {
            data[threadIndex] += data[threadIndex + i];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel2: interleaved addressing with a strided index instead of the modulo
// test, which removes the divergent branch (but introduces shared-memory bank
// conflicts). The original version computed 'index' and then ignored it; the
// reduction below actually uses it.
__global__ void kernel2(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[globalIndex];
    __syncthreads();

    // reduction
    for (int i = 1; i < blockDim.x; i *= 2) {
        int index = 2 * i * threadIndex;
        if (index < blockDim.x) {
            data[index] += data[index + i];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel3: sequential addressing; the stride halves each pass and active
// threads access contiguous shared-memory locations (no bank conflicts).
__global__ void kernel3(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[index];
    __syncthreads();

    // reduction
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel4: first add during the global load. Each thread sums two input
// elements, so each block covers 2 * blockDim.x elements and the kernel must
// be launched with half as many blocks (see main()).
__global__ void kernel4(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockDim.x];
    __syncthreads();

    // reduction (sequential addressing, as in kernel3)
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel5: same as kernel4 but with the last warp unrolled. The final 32
// threads run within one warp, so the __syncthreads() calls are dropped; the
// shared memory must then be accessed through a volatile pointer so the
// compiler does not cache intermediate values in registers.
// (On GPUs with independent thread scheduling, a __syncwarp() between the
// steps would also be required.)
__global__ void kernel5(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockDim.x];
    __syncthreads();

    // reduction down to 64 elements
    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    // unrolled last warp
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel6: completely unrolled reduction; blockSize is a template parameter,
// so the 'if (blockSize >= ...)' tests are resolved at compile time.
// __syncthreads() must stay outside the divergent 'if (threadIndex < ...)'
// branches - calling it inside a divergent branch is undefined behaviour.
// Launched with half as many blocks, like kernel4 and kernel5.
template<unsigned int blockSize> __global__ void kernel6(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockSize * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockSize];
    __syncthreads();

    if (blockSize >= 512) {
        if (threadIndex < 256) { data[threadIndex] += data[threadIndex + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (threadIndex < 128) { data[threadIndex] += data[threadIndex + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (threadIndex < 64) { data[threadIndex] += data[threadIndex + 64]; }
        __syncthreads();
    }

    // unrolled last warp (see kernel5)
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel7: like kernel6 but each thread first accumulates multiple pairs of
// elements with a grid-stride loop, so the kernel is correct for any input
// size n that is a multiple of 2 * blockSize; surplus blocks contribute 0.
template<unsigned int blockSize> __global__ void kernel7(int *inputData, int *outputData, unsigned int n) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    unsigned int index = blockIdx.x * (blockSize * 2) + threadIndex;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    data[threadIndex] = 0;

    while (index < n) {
        data[threadIndex] += inputData[index] + inputData[index + blockSize];
        index += gridSize;
    }
    __syncthreads();

    if (blockSize >= 512) {
        if (threadIndex < 256) { data[threadIndex] += data[threadIndex + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (threadIndex < 128) { data[threadIndex] += data[threadIndex + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (threadIndex < 64) { data[threadIndex] += data[threadIndex + 64]; }
        __syncthreads();
    }

    // unrolled last warp (see kernel5)
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

int main() {
    int a[ARRAY_SIZE] = { 0 };
    int c_1[BLOCK_NUMBER] = { 0 };
    int c_2[BLOCK_NUMBER] = { 0 };
    int c_3[BLOCK_NUMBER] = { 0 };
    int c_4[BLOCK_NUMBER] = { 0 };
    int c_5[BLOCK_NUMBER] = { 0 };
    int c_6[BLOCK_NUMBER] = { 0 };
    int c_7[BLOCK_NUMBER] = { 0 };

    std::ifstream infile("Text.txt");

    // Read at most ARRAY_SIZE integers; any unused tail of 'a' stays zero.
    int singleIn;
    int count = 0;
    while (count < ARRAY_SIZE && infile >> singleIn) {
        a[count] = singleIn;
        count++;
    }

    int *device_a = 0;
    int *device_c_1 = 0;
    int *device_c_2 = 0;
    int *device_c_3 = 0;
    int *device_c_4 = 0;
    int *device_c_5 = 0;
    int *device_c_6 = 0;
    int *device_c_7 = 0;

    cudaEvent_t start1, stop1, start2, stop2, start3, stop3, start4, stop4, start5, stop5, start6, stop6, start7, stop7;
    cudaEventCreate(&start1);
    cudaEventCreate(&stop1);
    cudaEventCreate(&start2);
    cudaEventCreate(&stop2);
    cudaEventCreate(&start3);
    cudaEventCreate(&stop3);
    cudaEventCreate(&start4);
    cudaEventCreate(&stop4);
    cudaEventCreate(&start5);
    cudaEventCreate(&stop5);
    cudaEventCreate(&start6);
    cudaEventCreate(&stop6);
    cudaEventCreate(&start7);
    cudaEventCreate(&stop7);
    cudaError_t cudaStatus;

    cudaStatus = cudaMalloc((void**)&device_a, ARRAY_SIZE * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_a) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMalloc((void**)&device_c_1, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_1) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_2, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_2) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_3, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_3) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_4, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_4) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_5, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_5) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_6, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_6) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_7, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_7) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(device_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (host -> device) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventRecord(start1);
    kernel1<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_1);
    cudaEventRecord(stop1);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel1 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_1, device_c_1, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop1);
    float millis1 = 0;
    cudaEventElapsedTime(&millis1, start1, stop1);

    // Finish the reduction on the host by adding the per-block partial sums.
    int secondAdd1 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd1 += c_1[i];
    }

    std::cout << "Kernel 1" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis1 << "\t Result: \t" << secondAdd1 << std::endl;

    cudaEventRecord(start2);
    kernel2<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_2);
    cudaEventRecord(stop2);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel2 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_2, device_c_2, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop2);
    float millis2 = 0;
    cudaEventElapsedTime(&millis2, start2, stop2);
    int secondAdd2 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd2 += c_2[i];
    }
    std::cout << "Kernel 2" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis2 << "\t Result: \t" << secondAdd2 << std::endl;

    cudaEventRecord(start3);
    kernel3<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_3);
    cudaEventRecord(stop3);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel3 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_3, device_c_3, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop3);
    float millis3 = 0;
    cudaEventElapsedTime(&millis3, start3, stop3);
    int secondAdd3 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd3 += c_3[i];
    }
    std::cout << "Kernel 3" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis3 << "\t Result: \t" << secondAdd3 << std::endl;

    // kernel4 loads two elements per thread, so it gets half as many blocks
    // and produces BLOCK_NUMBER / 2 partial sums.
    cudaEventRecord(start4);
    kernel4<<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_4);
    cudaEventRecord(stop4);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel4 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_4, device_c_4, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop4);
    float millis4 = 0;
    cudaEventElapsedTime(&millis4, start4, stop4);
    int secondAdd4 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd4 += c_4[i];
    }
    std::cout << "Kernel 4" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis4 << "\t Result: \t" << secondAdd4 << std::endl;

    cudaEventRecord(start5);
    kernel5<<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_5);
    cudaEventRecord(stop5);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel5 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_5, device_c_5, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop5);
    float millis5 = 0;
    cudaEventElapsedTime(&millis5, start5, stop5);
    int secondAdd5 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd5 += c_5[i];
    }
    std::cout << "Kernel 5" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis5 << "\t Result: \t" << secondAdd5 << std::endl;

    cudaEventRecord(start6);
    kernel6<THREAD_NUMBER><<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_6);
    cudaEventRecord(stop6);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel6 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_6, device_c_6, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop6);
    float millis6 = 0;
    cudaEventElapsedTime(&millis6, start6, stop6);
    int secondAdd6 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd6 += c_6[i];
    }
    std::cout << "Kernel 6" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis6 << "\t Result: \t" << secondAdd6 << std::endl;

    // kernel7 uses a grid-stride loop, so the full block count is fine;
    // blocks with no input simply write a partial sum of 0.
    cudaEventRecord(start7);
    kernel7<THREAD_NUMBER><<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_7, ARRAY_SIZE);
    cudaEventRecord(stop7);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel7 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_7, device_c_7, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop7);
    float millis7 = 0;
    cudaEventElapsedTime(&millis7, start7, stop7);
    int secondAdd7 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd7 += c_7[i];
    }
    std::cout << "Kernel 7" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis7 << "\t Result: \t" << secondAdd7 << std::endl;

    // Reference: sequential reduction on the host.
    int hostReductionResult = 0;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        hostReductionResult += a[i];
    }
    std::cout << "Host reduction result: " << hostReductionResult << std::endl;

    // Release all device allocations and events before exiting.
    cudaDeviceReset();

    getch();

    return 0;
}