Advertisement
Guest User

Untitled

a guest
Jan 20th, 2020
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.65 KB | None | 0 0
  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3.  
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <time.h>
  7. #include <math.h>
  8.  
  9.  
  10. // 652.052 us - naive, 1.942 ms - shared memory
  11. // cpu: 5 ms
  12. // memcpy - H to D : 10.368 us, 5 invocari, total: 86,008 kB
  13. // memcpy - D to H : 3.904 us, 2 invocari, total: 32,768 kB
  14. // total: 450.74 ms.
  15.  
  16. //#define COMP_SW__PRINT_SIGNAL
  17. //#define COMP_SW__PRINT_TEMPLATE
  18. //#define COMP_SW__PRINT_TEMPLATE_PADD
  19. //#define COMP_SW__PRINT_MEDII_PADD
  20.  
  21. #define COMP_SW__PRINT_RESULT_CPU
  22. #define COMP_SW__PRINT_RESULT_GPU
  23.  
  24.  
  25. #define DIM_SIGNAL (2559u)
  26. #define DIM_TEMPLATE (512u)
  27. #define DIM_XCORR (DIM_SIGNAL - DIM_TEMPLATE + 1)
  28.  
  29. //#if ((DIM_SIG % DIM_TEMPLATE) != (DIM_TEMPLATE - 1))
  30. //#error "Alege o alta dimensiune a semnalului / template-ului care satisface conditia."
  31. //#endif
  32.  
  33. #define NO_THREADS_BLOCK DIM_TEMPLATE
  34. #define NO_BLOCKS (DIM_SIGNAL / DIM_TEMPLATE)
  35. // NO_BLOCKS = DIM_SIG / DIM_TEMPLATE, pt DIM_SIG % DIM_TEMPLATE == (DIM_TEMPLATE - 1)
  36.  
  37. double getRandom(double min, double max)
  38. {
  39. double range = (max - min);
  40. double divv = RAND_MAX / range;
  41. double rezultat = (min + ((double)rand() / divv));
  42. return rezultat;
  43. }
  44. //_______________________________________________________________________
  45. // Calcul Template Matching with Cross-Correlation --- Secvential (CPU)
  46. //_______________________________________________________________________
  47.  
  48. void CPU_xCorr(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
  49. {
  50. double sumNumarator = 0.0;
  51. double sumNumitor_S = 0.0;
  52. double sumNumitor_T = 0.0;
  53.  
  54. for (unsigned int t = 0; t <= (dimSig - dimTem); ++t)
  55. {
  56. sumNumarator = 0.0;
  57. sumNumitor_S = 0.0;
  58. sumNumitor_T = 0.0;
  59.  
  60. for (unsigned int i = 0; i < dimTem; ++i)
  61. {
  62. double l_S = sig[i + t] - medSig;
  63. double l_T = tem[i] - medTem;
  64.  
  65. sumNumarator += l_S * l_T;
  66. sumNumitor_S += l_S * l_S;
  67. sumNumitor_T += l_T * l_T;
  68. }
  69. xcorr[t] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
  70. }
  71. }
  72. //_______________________________________________________________________
  73. // Calcul Template Matching with Cross-Correlation --- Paralel (GPU)
  74. //_______________________________________________________________________
  75.  
  76. __global__ void naive__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
  77. {
  78. double sumNumarator = 0.0;
  79. double sumNumitor_S = 0.0;
  80. double sumNumitor_T = 0.0;
  81. unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;
  82.  
  83.  
  84. for (unsigned int i = 0; i < dimTem; ++i)
  85. {
  86. double l_S = sig[thrd_offs + i] - medSig;
  87. double l_T = tem[i] - medTem;
  88.  
  89. sumNumarator += l_S * l_T;
  90. sumNumitor_S += l_S * l_S;
  91. sumNumitor_T += l_T * l_T;
  92. }
  93. xcorr[thrd_offs] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
  94. }
  95.  
  96. __global__ void shamem__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double* medSig, double* medTem)
  97. {
  98. double sumNumarator = 0.0;
  99. double sumNumitor_S = 0.0;
  100. double sumNumitor_T = 0.0;
  101.  
  102. unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;
  103. __shared__ double sham_sig[2 * DIM_TEMPLATE - 1];
  104. __shared__ double sham_tem[DIM_TEMPLATE];
  105. __shared__ double sham_xcorr[DIM_TEMPLATE]; // [(2 * DIM_TEMPLATE - 1) - DIM_TEMPLATE + 1]
  106. __shared__ double sham_medSig[DIM_TEMPLATE];
  107. __shared__ double sham_medTem[DIM_TEMPLATE];
  108.  
  109. for (unsigned char i = 0; i <= 1; ++i)
  110. {
  111. sham_sig[threadIdx.x + DIM_TEMPLATE * i] = sig[thrd_offs + DIM_TEMPLATE - 1];
  112. }
  113. sham_tem[threadIdx.x] = tem[thrd_offs];
  114. sham_medTem[threadIdx.x] = medTem[thrd_offs];
  115. sham_medSig[threadIdx.x] = medSig[thrd_offs];
  116. __syncthreads();
  117.  
  118. for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
  119. {
  120. double l_S = sham_sig[((threadIdx.x + i) % DIM_TEMPLATE) + threadIdx.x] - sham_medSig[threadIdx.x];
  121. double l_T = sham_tem[((threadIdx.x + i) % DIM_TEMPLATE)] - sham_medTem[threadIdx.x];
  122.  
  123. sumNumarator += l_S * l_T;
  124. sumNumitor_S += l_S * l_S;
  125. sumNumitor_T += l_T * l_T;
  126. }
  127. sham_xcorr[threadIdx.x] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
  128. __syncthreads();
  129.  
  130. xcorr[thrd_offs] = sham_xcorr[threadIdx.x];
  131. }
  132.  
  133. int main(void)
  134. {
  135. //_______________________________________________________________________
  136. // Calcul numar de threads necesare per block. Calcul numar de blocks.
  137. //_______________________________________________________________________
  138. dim3 dimGrid(NO_BLOCKS);
  139. dim3 dimBlock(NO_THREADS_BLOCK);
  140.  
  141. cudaError_t cudaStatus;
  142. cudaEvent_t start, stop;
  143. cudaEventCreate(&start);
  144. cudaEventCreate(&stop);
  145. clock_t cpu_start, cpu_end;
  146.  
  147. double elap;
  148.  
  149. srand(time(NULL));
  150.  
  151. float elapsedTime;
  152.  
  153. double *dev_signal, *hos_signal;
  154. double *dev_templa, *hos_templa;
  155. double *dev_xcorr, *hos_xcorr, *hos_xcorr_compare;
  156.  
  157. double hos_medTemplate[DIM_TEMPLATE * NO_BLOCKS];
  158. double hos_medSignal[DIM_TEMPLATE * NO_BLOCKS];
  159. double *dev_medSignal, *dev_medTemplate;
  160. //_______________________________________________________________________
  161. // Alocare memorie pentru vectorii: semnal, template, rezultat
  162. // Cronometrare.
  163. //_______________________________________________________________________
  164.  
  165. hos_signal = (double*)malloc(sizeof(double) * DIM_SIGNAL);
  166. hos_templa = (double*)malloc(sizeof(double) * DIM_TEMPLATE * NO_BLOCKS);
  167. hos_xcorr = (double*)malloc(sizeof(double) * DIM_XCORR);
  168. hos_xcorr_compare = (double*)malloc(sizeof(double) * DIM_XCORR);
  169.  
  170. printf("\ndim_template: %d\n", DIM_TEMPLATE);
  171. printf("\ndim_threads_per_block: %d\n", NO_THREADS_BLOCK);
  172. printf("\ndim_grid: %d\n", NO_BLOCKS);
  173. printf("\ndim_xcorr: %d\n", DIM_XCORR);
  174.  
  175. cudaStatus = cudaSetDevice(0);
  176. if (cudaStatus != cudaSuccess) {
  177. fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  178. goto Error;
  179. }
  180.  
  181. cudaStatus = cudaMalloc((void**)&dev_signal, sizeof(double) * DIM_SIGNAL);
  182. if (cudaStatus != cudaSuccess) {
  183. fprintf(stderr, "cudaMalloc failed! - device signal\n");
  184. goto Error;
  185. }
  186. cudaStatus = cudaMalloc((void**)&dev_templa, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS);
  187. if (cudaStatus != cudaSuccess) {
  188. fprintf(stderr, "cudaMalloc failed! - device template\n");
  189. goto Error;
  190. }
  191. cudaStatus = cudaMalloc((void**)&dev_xcorr, sizeof(double) * DIM_XCORR);
  192. if (cudaStatus != cudaSuccess) {
  193. fprintf(stderr, "cudaMalloc failed! - device xcorr\n");
  194. goto Error;
  195. }
  196. cudaStatus = cudaMalloc((void**)&dev_medTemplate, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS);
  197. if (cudaStatus != cudaSuccess) {
  198. fprintf(stderr, "cudaMalloc failed! - device medTemplate\n");
  199. goto Error;
  200. }
  201. cudaStatus = cudaMalloc((void**)&dev_medSignal, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS);
  202. if (cudaStatus != cudaSuccess) {
  203. fprintf(stderr, "cudaMalloc failed! - device medSignal\n");
  204. goto Error;
  205. }
  206.  
  207. //_______________________________________________________________________
  208. // Initializare semnal cu valori random intre MIN si MAX. Calcul medie.
  209. //_______________________________________________________________________
  210. const double minn = -10.0;
  211. const double maxx = 10.0;
  212.  
  213. for (unsigned int i = 0; i < DIM_TEMPLATE * NO_BLOCKS; i++)
  214. {
  215. hos_medSignal[i] = 0;
  216. hos_medTemplate[i] = 0;
  217. }
  218. for (unsigned int i = 0; i < DIM_SIGNAL; i++)
  219. {
  220. double rezultat = getRandom(minn, maxx);
  221. hos_signal[i] = rezultat;
  222. hos_medSignal[0] += hos_signal[i];
  223. }
  224. hos_medSignal[0] /= (double)DIM_SIGNAL;
  225. printf("\nmedia semnal: %f\n", hos_medSignal[0]);
  226. //_______________________________________________________________________
  227. // Initializare template cu valori in (-1,1) (normalizate). Calcul medie.
  228. //_______________________________________________________________________
  229. for (unsigned int i = 0; i < DIM_TEMPLATE; i++)
  230. {
  231. hos_templa[i] = getRandom(-1.0, 1.0);
  232. hos_medTemplate[0] += hos_templa[i];
  233. }
  234. hos_medTemplate[0] /= (double)DIM_TEMPLATE;
  235. printf("\nmedia template: %f\n", hos_medTemplate[0]);
  236.  
  237. #ifdef COMP_SW__PRINT_SIGNAL
  238. // Afisare semnal.
  239. {
  240. printf("SIGNAL: RAND(%f, %f) :\n\n", minn, maxx);
  241. for (unsigned int i = 0; i < DIM_SIGNAL; ++i)
  242. {
  243. if (i % 10 == 0) printf("\n");
  244. printf("%f ", hos_signal[i]);
  245. }
  246. printf("\n\n");
  247. }
  248. #endif
  249. #ifdef COMP_SW__PRINT_TEMPLATE
  250. // Afisare template.
  251. {
  252. printf("TEMPLATE: RAND(-1, 1) :\n\n");
  253. for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
  254. {
  255. if (i % 10 == 0) printf("\n");
  256. printf("%f ", hos_templa[i]);
  257. }
  258. printf("\n\n");
  259. }
  260. #endif
  261. // padding template
  262. for (unsigned int i = DIM_TEMPLATE; i < DIM_TEMPLATE * NO_BLOCKS; i++)
  263. {
  264. hos_templa[i] = hos_templa[i % DIM_TEMPLATE];
  265. }
  266. // padding medii
  267. for (unsigned int i = 1; i < DIM_TEMPLATE * NO_BLOCKS; i++)
  268. {
  269. hos_medSignal[i] = hos_medSignal[0];
  270. hos_medTemplate[i] = hos_medTemplate[0];
  271. }
  272. #ifdef COMP_SW__PRINT_TEMPLATE_PADD
  273. // Afisare template cu padding.
  274. {
  275. printf("TEMPLATE padded: RAND(-1, 1) :\n\n");
  276. for (unsigned int i = 0; i < DIM_TEMPLATE * NO_BLOCKS; ++i)
  277. {
  278. if (i % 10 == 0) printf("\n");
  279. printf("%f ", hos_templa[i]);
  280. }
  281. printf("\n\n");
  282. }
  283. #endif
  284. #ifdef COMP_SW__PRINT_MEDII_PADD
  285. // Afisare medii cu padding.
  286. {
  287. printf("Medii: Signal:\n\n");
  288. for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
  289. {
  290. if (i % 10 == 0) printf("\n");
  291. printf("%f ", hos_medTemplate[i]);
  292. }
  293. printf("\n\n");
  294.  
  295. printf("Medii: Template:\n\n");
  296. for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
  297. {
  298. if (i % 10 == 0) printf("\n");
  299. printf("%f ", hos_medSignal[i]);
  300. }
  301. printf("\n\n");
  302. }
  303. #endif
  304.  
  305.  
  306. //_______________________________________________________________________
  307. // Transmitere date de intrare catre device (GPU)
  308. // Cronometrare.
  309. //_______________________________________________________________________
  310. // startCPUTimer();
  311. cudaStatus = cudaMemcpy(dev_signal, hos_signal, sizeof(double) * DIM_SIGNAL, cudaMemcpyHostToDevice);
  312. if (cudaStatus != cudaSuccess) {
  313. fprintf(stderr, "cudaMemcpy failed! - to device, signal\n");
  314. goto Error;
  315. }
  316. cudaStatus = cudaMemcpy(dev_templa, hos_templa, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS, cudaMemcpyHostToDevice);
  317. if (cudaStatus != cudaSuccess) {
  318. fprintf(stderr, "cudaMemcpy failed! - to device, template\n");
  319. goto Error;
  320. }
  321. cudaStatus = cudaMemcpy(dev_xcorr, hos_xcorr, sizeof(double) * DIM_XCORR, cudaMemcpyHostToDevice);
  322. if (cudaStatus != cudaSuccess) {
  323. fprintf(stderr, "cudaMemcpy failed! - to device, xcorr\n");
  324. goto Error;
  325. }
  326. cudaStatus = cudaMemcpy(dev_medTemplate, hos_medTemplate, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS, cudaMemcpyHostToDevice);
  327. if (cudaStatus != cudaSuccess) {
  328. fprintf(stderr, "cudaMemcpy failed! - to device, medTemplate\n");
  329. goto Error;
  330. }
  331. cudaStatus = cudaMemcpy(dev_medSignal, hos_medSignal, sizeof(double) * DIM_TEMPLATE * NO_BLOCKS, cudaMemcpyHostToDevice);
  332. if (cudaStatus != cudaSuccess) {
  333. fprintf(stderr, "cudaMemcpy failed! - to device, medSignal\n");
  334. goto Error;
  335. }
  336.  
  337. printf("II transfer");
  338. //stopCPUTimer();
  339. //printf("time[TRANSFER-H-D]: %f\n\n", elapsedTime);
  340.  
  341. //_______________________________________________________________________
  342. // Pornire kernel. Cronometrare.
  343. //_______________________________________________________________________
  344.  
  345. cudaEventRecord(start, 0);
  346.  
  347. shamem__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, dev_medSignal, dev_medTemplate);
  348.  
  349. cudaError_t eroare = cudaGetLastError();
  350. printf("\nCudaLastError:\n");
  351. printf(cudaGetErrorString(eroare));
  352. printf("\n\n");
  353.  
  354. cudaEventRecord(stop, 0);
  355. cudaEventSynchronize(stop);
  356. cudaThreadSynchronize();
  357.  
  358. //_______________________________________________________________________
  359. // Calcul timp de executie kernel. Afisare rezultate.
  360. //_______________________________________________________________________
  361. cudaEventElapsedTime(&elapsedTime, start, stop);
  362. printf("time[KERNEL_SHAMEM]: %f\n\n", elapsedTime);
  363.  
  364. cudaStatus = cudaMemcpy(hos_xcorr, dev_xcorr, sizeof(double) * DIM_XCORR, cudaMemcpyDeviceToHost);
  365. if (cudaStatus != cudaSuccess) {
  366. fprintf(stderr, "cudaMemcpy failed! - to host, xcorr");
  367. goto Error;
  368. }
  369. #ifdef COMP_SW__PRINT_RESULT_GPU
  370. for (unsigned int i = 0; i < DIM_XCORR; ++i)
  371. {
  372. if (i % 10 == 0) printf("\n");
  373. printf("%f ", hos_xcorr[i]);
  374. }
  375. printf("\n_______________________________________________________________________\n\n");
  376. #endif
  377. //_______________________________________________________________________
  378. // Calcul CPU. Cronometrare.
  379. //_______________________________________________________________________
  380. cpu_start = clock();
  381. CPU_xCorr(hos_xcorr_compare, hos_signal, hos_templa, hos_medSignal[0], hos_medTemplate[0], DIM_TEMPLATE, DIM_SIGNAL);
  382. printf("\ntime[CPU]: %f\n", ((double)(clock() - cpu_start))/CLOCKS_PER_SEC);
  383.  
  384. #ifdef COMP_SW__PRINT_RESULT_CPU
  385. for (unsigned int i = 0; i < DIM_XCORR; ++i)
  386. {
  387. if (i % 10 == 0) printf("\n");
  388. printf("%f ", hos_xcorr_compare[i]);
  389. }
  390. #endif
  391.  
  392. for (unsigned int i = 0; i < DIM_XCORR; ++i)
  393. {
  394. if (hos_xcorr[i] != hos_xcorr_compare[i])
  395. {
  396. printf("\n\n\nNOT EQUAL\n\n\n");
  397. //goto Error;
  398. goto tryagain;
  399. }
  400. }
  401. tryagain:
  402. printf("trying old_ver\n\n");
  403. cudaEventRecord(start, 0);
  404. naive__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, hos_medSignal[0], hos_medTemplate[0], DIM_TEMPLATE, DIM_SIGNAL);
  405. cudaEventRecord(stop, 0);
  406. cudaEventSynchronize(stop);
  407. cudaThreadSynchronize();
  408. cudaEventElapsedTime(&elapsedTime, start, stop);
  409. printf("time[KERNEL_NAIVE]: %f\n\n", elapsedTime);
  410.  
  411. cudaStatus = cudaMemcpy(hos_xcorr, dev_xcorr, sizeof(double) * DIM_XCORR, cudaMemcpyDeviceToHost);
  412. if (cudaStatus != cudaSuccess) {
  413. fprintf(stderr, "cudaMemcpy failed! - to host, xcorr");
  414. goto Error;
  415. }
  416. for (unsigned int i = 0; i < DIM_XCORR; ++i)
  417. {
  418. if (hos_xcorr[i] != hos_xcorr_compare[i])
  419. {
  420. printf("\n\n\nNOT EQUAL\n\n\n");
  421. goto Error;
  422. }
  423. }
  424. printf("\n\n\nRESULTS EQUAL\n\n\n");
  425. goto Succes;
  426. //_______________________________________________________________________
  427. // Tratare erori. Elibereare memorie.
  428. //_______________________________________________________________________
  429. Error:
  430. printf("Eroare!\n\n");
  431. Succes:
  432. cudaFree(hos_xcorr);
  433. printf("freed hos_xcorr!\n\n");
  434. cudaFree(hos_signal);
  435. printf("freed hos_signal!\n\n");
  436. cudaFree(hos_templa);
  437. printf("freed hos_templa!\n\n");
  438. cudaFree(dev_xcorr);
  439. printf("freed dev_xcorr!\n\n");
  440. cudaFree(dev_signal);
  441. printf("freed dev_signal!\n\n");
  442. cudaFree(dev_templa);
  443. printf("freed dev_templa!\n\n");
  444. cudaFree(dev_medSignal);
  445. printf("freed dev_medSignal!\n\n");
  446. cudaFree(dev_medTemplate);
  447. printf("freed dev_medTemplate!\n\n");
  448.  
  449.  
  450. return 0;
  451. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement