Advertisement
Guest User

Untitled

a guest
Dec 13th, 2019
127
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.57 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <stdint.h>
  4. #include <string.h>
  5. #include <iostream>
  6. #include <ctype.h>
  7. #include "cuda_runtime.h"
  8. #include "device_launch_parameters.h"
  9. #include "cuda.h"
  10. #include "math.h"
  11.  
  // Integer ceiling of a/b for positive integer operands. Every argument is
  // parenthesized so that expression arguments such as CEIL(n, t+1) expand
  // with the intended precedence.
  #define CEIL(a,b) (((a)+(b)-1)/(b))
  // Exchange a and b using t as scratch storage. This expands to three
  // separate statements (trailing semicolon included), so put braces around
  // the call when it is the body of an if/else/loop.
  #define SWAP(a,b,t) (t)=(b); (b)=(a); (a)=(t);
  // Convert a byte count to whole megabytes (1024*1024 bytes); integer
  // division truncates.
  #define DATAMB(bytes) ((bytes)/1024/1024)
  // Throughput for `bytes` moved in `timems` milliseconds:
  // bytes / (timems * 2^30 / 1000), i.e. units of 2^30 bytes per second.
  #define DATABW(bytes,timems) ((float)(bytes)/((timems) * 1.024*1024.0*1024.0))
  16.  
  // Short aliases for the unsigned integer types used throughout this file.
  typedef unsigned char uch;
  typedef unsigned long ul;
  typedef unsigned int ui;

  // Host (CPU) image buffers.
  uch *TheImg;
  uch *CopyImg;
  // Device (GPU) image buffers.
  uch *GPUImg;
  uch *GPUCopyImg;
  uch *GPUResult;
  23.  
  // Properties of the BMP image currently being processed.
  struct ImgProp {
      int Hpixels;          // image width in pixels (set by ReadBMPlin)
      int Vpixels;          // image height in pixels (set by ReadBMPlin)
      int width;            // not assigned anywhere in the visible code
      int height;           // not assigned anywhere in the visible code
      uch HeaderInfo[54];   // copy of the 54-byte BMP header, reused when writing
      ul Hbytes;            // bytes per image row, rounded up to a multiple of 4
  };

  // The single file-wide image descriptor.
  ImgProp ip;

  // Shorthand accessors for the fields of `ip`.
  #define IPHB ip.Hbytes
  #define IPH ip.Hpixels
  #define IPV ip.Vpixels
  // Total image payload in bytes (padded row size times number of rows).
  #define IMAGESIZE (IPHB*IPV)
  // Total number of pixels in the image.
  #define IMAGEPIX (IPH*IPV)
  38.  
  // Rotation kernel skeleton (debug stage).
  //
  // Launch layout: 1D grid of 1D blocks. Each thread derives a (X, Y) position
  // from its block index, assuming BlkPerRow consecutive blocks cover one row.
  //
  // At this stage the kernel only computes coordinates and prints them from a
  // single thread (block 400, thread 1); it neither reads ImgSrc nor writes
  // ImgDst, and FS and RotAngle are not used yet.
  //
  //   ImgDst, ImgSrc : device image buffers (currently untouched)
  //   FS             : image size in bytes (currently unused)
  //   RotAngle       : rotation angle in degrees (currently unused)
  //   width, height  : image dimensions in pixels
  //   BlkPerRow      : number of thread blocks assigned to one image row
  __global__
  void Rotate(uch *ImgDst, uch *ImgSrc, ui FS ,int RotAngle , int width , int height, int BlkPerRow)
  {
      // A non-positive block count per row gives no valid row index and would
      // make the division below divide by zero.
      if (BlkPerRow <= 0) return;

      const ui ThrPerBlk = blockDim.x;
      const ui MYbid = blockIdx.x;
      const ui MYtid = threadIdx.x;
      const ui MYgtid = ThrPerBlk * MYbid + MYtid;

      // Row handled by this block, and this thread's offset within that row.
      const ui row = MYbid / BlkPerRow;
      const ui ThrPerRow = ThrPerBlk * BlkPerRow;
      const ui X = MYgtid - (ThrPerRow * row);
      const ui Y = row;

      // Image centre. The vertical centre comes from the height; the previous
      // code derived it from the width, which is wrong for non-square images.
      const ui cWidth = (width / 2);
      const ui cHeight = (height / 2);

      // Placeholders for the centre-relative input coordinates.
      const ul Xin = (0);
      const ul Yin = (1);

      // // Translate coordinates so the image centre is the origin
      // Xin = (X - cWidth);
      // Yin = (Y - cHeight);

      // Debug output from one arbitrary thread. The conversion specifiers match
      // the unsigned argument types (%u for ui, %lu for ul).
      if(MYbid == 400 && MYtid == 1){
          printf("X: %u - Y: %u\n",X,Y);
          printf("X: %u - cWid: %u -- Y: %u - cHei: %u\n", X,cWidth,Y, cHeight);
          printf("Xin: %lu - Yin: %lu \n", Xin, Yin);
      }
  }
  75.  
  // Read a 24-bit/pixel BMP file into a 1D linear array.
  //
  // Fills the global `ip` with the image width, height, padded row size and a
  // copy of the 54-byte header, then allocates a buffer of IMAGESIZE bytes and
  // loads the pixel data that follows the header.
  //
  //   fn      : path of the BMP file to read
  //   returns : malloc'ed buffer owned by the caller (release with free()), or
  //             NULL if the header is incomplete, the dimensions are not
  //             positive, or the buffer cannot be allocated
  //
  // Terminates the program if the file cannot be opened.
  uch *ReadBMPlin(char* fn)
  {
      FILE* f = fopen(fn, "rb");
      if (f == NULL){ printf("\n\n%s NOT FOUND\n\n", fn); exit(EXIT_FAILURE); }

      // read the 54-byte header; a shorter file cannot be a valid BMP
      uch HeaderInfo[54];
      if (fread(HeaderInfo, sizeof(uch), 54, f) != 54) {
          printf("\n\n%s is too short to contain a BMP header\n\n", fn);
          fclose(f);
          return NULL;
      }

      // extract image width and height from the header; memcpy avoids the
      // misaligned int reads of a direct pointer cast (offsets 18 and 22)
      int width, height;
      memcpy(&width, &HeaderInfo[18], sizeof(width));
      memcpy(&height, &HeaderInfo[22], sizeof(height));
      if (width <= 0 || height <= 0) {
          // A non-positive dimension would turn into a huge unsigned byte count.
          printf("\n\n%s has unsupported dimensions (%d x %d)\n\n", fn, width, height);
          fclose(f);
          return NULL;
      }
      ip.Hpixels = width;
      ip.Vpixels = height;
      // each row holds 3 bytes per pixel, rounded up to a multiple of 4 bytes
      ip.Hbytes = ((ul)width * 3 + 3) & ~(ul)3;
      // save header for re-use
      memcpy(ip.HeaderInfo, HeaderInfo, 54);

      const size_t imgBytes = (size_t)IMAGESIZE;
      printf("\n Input File name: %17s (%d x %d) File Size=%lu", fn,
             ip.Hpixels, ip.Vpixels, (ul)imgBytes);

      // allocate memory to store the main image (1 Dimensional array)
      uch *Img = (uch *)malloc(imgBytes);
      if (Img == NULL) {
          fclose(f);          // do not leak the file handle on failure
          return NULL;        // Cannot allocate memory
      }

      // read the image from disk; if the file is truncated, zero the missing
      // tail so the caller never sees uninitialized memory
      const size_t got = fread(Img, sizeof(uch), imgBytes, f);
      if (got < imgBytes) {
          fprintf(stderr, "\nWarning: %s holds only %lu of %lu image bytes; the rest is zero-filled\n",
                  fn, (ul)got, (ul)imgBytes);
          memset(Img + got, 0, imgBytes - got);
      }
      fclose(f);
      return Img;
  }
  102.  
  103.  
  // Write the 1D linear-memory stored image into file.
  //
  // Emits the 54-byte header saved in `ip.HeaderInfo` followed by IMAGESIZE
  // bytes taken from Img.
  //
  //   Img : image buffer holding at least IMAGESIZE bytes
  //   fn  : path of the BMP file to create (overwritten if it exists)
  //
  // Terminates the program if the file cannot be created or fully written.
  void WriteBMPlin(uch *Img, char* fn)
  {
      FILE* f = fopen(fn, "wb");
      if (f == NULL){ printf("\n\nFILE CREATION ERROR: %s\n\n", fn); exit(1); }

      const size_t imgBytes = (size_t)IMAGESIZE;
      // write header
      const size_t headerOut = fwrite(ip.HeaderInfo, sizeof(uch), 54, f);
      // write data
      const size_t dataOut = fwrite(Img, sizeof(uch), imgBytes, f);
      // fclose flushes buffered data, so its result is part of the write check
      const int closeErr = fclose(f);
      if (headerOut != 54 || dataOut != imgBytes || closeErr != 0) {
          printf("\n\nFILE WRITE ERROR: %s\n\n", fn);
          exit(1);
      }
      printf("\nOutput File name: %17s (%d x %d) File Size=%lu", fn, ip.Hpixels, ip.Vpixels, (ul)imgBytes);
  }
  116.  
  // Print `msg` together with the CUDA error description and terminate the
  // program when `status` is anything other than cudaSuccess.
  static void ExitOnCudaError(cudaError_t status, const char *msg)
  {
      if (status == cudaSuccess) return;
      fprintf(stderr, "%s (%s)\n", msg, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
  }

  // Copy a command-line string into a fixed-size buffer; terminate the program
  // instead of overflowing the buffer when the string does not fit.
  static void CopyArgOrExit(char *dst, size_t dstSize, const char *src, const char *what)
  {
      if (strlen(src) >= dstSize) {
          printf("\n\n%s is too long (at most %lu characters)\n\n", what, (ul)(dstSize - 1));
          exit(EXIT_FAILURE);
      }
      strcpy(dst, src);
  }

  // Program entry point.
  //
  // Usage: rotate inputBMP outputBMP RotAngle [ThrPerBlk]
  //
  // Reads the input BMP, copies it to the GPU, launches the Rotate kernel,
  // copies the result buffer back, writes it to the output BMP and prints the
  // transfer/kernel timings measured with CUDA events.
  // Returns EXIT_SUCCESS; every failure path terminates with EXIT_FAILURE.
  int main(int argc, char **argv)
  {
      float totalTime, tfrCPUtoGPU, tfrGPUtoCPU, kernelExecutionTime; // GPU code run times
      cudaError_t cudaStatus, cudaStatus2;
      cudaEvent_t time1, time2, time3, time4;
      char InputFileName[255], OutputFileName[255], ProgName[255];
      ui BlkPerRow;
      ui ThrPerBlk = 256, NumBlocks, GPUDataTransfer;
      cudaDeviceProp GPUprop;
      // 64-bit on every platform: the product of the three grid limits does not
      // fit in 32 bits.
      unsigned long long SupportedKBlocks, SupportedMBlocks;
      ui MaxThrPerBlk;
      char SupportedBlocks[100];
      int RotAngle;

      // The program name is printed in the report, so it must be initialized.
      strncpy(ProgName, (argc > 0 && argv[0] != NULL) ? argv[0] : "rotate", sizeof(ProgName) - 1);
      ProgName[sizeof(ProgName) - 1] = '\0';

      // ---- Command line ----
      if (argc == 3) {
          printf("Please Set a value for rotation");
          exit(EXIT_FAILURE);
      }
      if ((argc != 4) && (argc != 5)) {
          printf("\n\nUsage: ./rotate inputBMP outputBMP RotAngle [ThrPerBlk]");
          printf("\n\nExample: ./rotate infilename.bmp outname.bmp -75\n\n");
          printf("\n\nNothing executed ... Exiting ...\n\n");
          exit(EXIT_FAILURE);
      }
      CopyArgOrExit(InputFileName, sizeof(InputFileName), argv[1], "Input file name");
      CopyArgOrExit(OutputFileName, sizeof(OutputFileName), argv[2], "Output file name");
      RotAngle = atoi(argv[3]);
      if (argc == 5) {
          ThrPerBlk = atoi(argv[4]);
          printf("Rotate Angle: %d - ThrPerBlk: %u", RotAngle , ThrPerBlk);
      }

      if((RotAngle<-360) || (RotAngle>360)){
          printf("\nRotation angle of %d degrees is invalid ...\n",RotAngle);
          printf("\nPlease enter an angle between -360 and +360 degrees ...\n");
          printf("\n\nNothing executed ... Exiting ...\n\n");
          exit(EXIT_FAILURE);
      }

      // A negative command-line value wraps to a large unsigned number and is
      // rejected here as well.
      if ((ThrPerBlk < 32) || (ThrPerBlk > 1024)) {
          printf("Invalid ThrPerBlk option '%u'. Must be between 32 and 1024. \n", ThrPerBlk);
          exit(EXIT_FAILURE);
      }

      // ---- Host buffers ----
      // Create CPU memory to store the input and output images
      TheImg = ReadBMPlin(InputFileName); // Read the input image if memory can be allocated
      if (TheImg == NULL){
          printf("Cannot allocate memory for the input image...\n");
          exit(EXIT_FAILURE);
      }
      const size_t imgBytes = (size_t)IMAGESIZE;
      CopyImg = (uch *)malloc(imgBytes);
      if (CopyImg == NULL){
          free(TheImg);
          printf("Cannot allocate memory for the input image...\n");
          exit(EXIT_FAILURE);
      }

      // ---- Device selection ----
      // Choose which GPU to run on, change this on a multi-GPU system.
      int NumGPUs = 0;
      cudaStatus = cudaGetDeviceCount(&NumGPUs);
      if ((cudaStatus != cudaSuccess) || (NumGPUs == 0)){
          printf("\nNo CUDA Device is available\n");
          exit(EXIT_FAILURE);
      }
      ExitOnCudaError(cudaSetDevice(0), "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
      ExitOnCudaError(cudaGetDeviceProperties(&GPUprop, 0), "cudaGetDeviceProperties failed!");
      SupportedKBlocks = (unsigned long long)GPUprop.maxGridSize[0] * (unsigned long long)GPUprop.maxGridSize[1]
                       * (unsigned long long)GPUprop.maxGridSize[2] / 1024;
      SupportedMBlocks = SupportedKBlocks / 1024;
      sprintf(SupportedBlocks, "%llu %c", (SupportedMBlocks >= 5) ? SupportedMBlocks : SupportedKBlocks, (SupportedMBlocks >= 5) ? 'M' : 'K');
      MaxThrPerBlk = (ui)GPUprop.maxThreadsPerBlock;

      ExitOnCudaError(cudaEventCreate(&time1), "cudaEventCreate failed!");
      ExitOnCudaError(cudaEventCreate(&time2), "cudaEventCreate failed!");
      ExitOnCudaError(cudaEventCreate(&time3), "cudaEventCreate failed!");
      ExitOnCudaError(cudaEventCreate(&time4), "cudaEventCreate failed!");

      // ---- CPU -> GPU transfer ----
      cudaEventRecord(time1, 0); // Time stamp at the start of the GPU transfer
      // Allocate GPU buffer for the input and output images
      cudaStatus = cudaMalloc((void**)&GPUImg, imgBytes);
      cudaStatus2 = cudaMalloc((void**)&GPUCopyImg, imgBytes);
      if ((cudaStatus != cudaSuccess) || (cudaStatus2 != cudaSuccess)){
          fprintf(stderr, "cudaMalloc failed! Can't allocate GPU memory");
          exit(EXIT_FAILURE);
      }
      // Clear the output buffer: it is copied back and written to disk, and any
      // byte the kernel does not write would otherwise be uninitialized.
      ExitOnCudaError(cudaMemset(GPUCopyImg, 0, imgBytes), "cudaMemset of the output image failed!");

      // Copy input vectors from host memory to GPU buffers.
      ExitOnCudaError(cudaMemcpy(GPUImg, TheImg, imgBytes, cudaMemcpyHostToDevice), "cudaMemcpy CPU to GPU failed!");

      cudaEventRecord(time2, 0); // Time stamp after the CPU --> GPU tfr is done

      // ---- Kernel launch ----
      BlkPerRow = CEIL(IPH,ThrPerBlk);
      // NOTE(review): Rotate derives its row as blockIdx.x / BlkPerRow, which
      // matches a grid of IPV*BlkPerRow blocks (the earlier formula) rather than
      // this byte-count based grid -- confirm which layout is intended.
      //NumBlocks = IPV*BlkPerRow; //old
      NumBlocks = CEIL(imgBytes,ThrPerBlk);
      printf("\n ---!!!! %u : %u !!!---- \n" , BlkPerRow , NumBlocks);

      //360 for loop needs to go here
      Rotate <<< NumBlocks, ThrPerBlk >>> (GPUCopyImg, GPUImg, (ui)imgBytes , RotAngle , ip.Hpixels , ip.Vpixels , BlkPerRow);
      // Launch-configuration errors are only reported through cudaGetLastError.
      ExitOnCudaError(cudaGetLastError(), "Rotate kernel launch failed!");

      GPUResult = GPUCopyImg;
      GPUDataTransfer = 2*imgBytes;

      // cudaDeviceSynchronize waits for the kernel to finish, and returns
      // any errors encountered during execution.
      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "\n\ncudaDeviceSynchronize returned error code %d (%s) after launching the kernel!\n",
                  (int)cudaStatus, cudaGetErrorString(cudaStatus));
          exit(EXIT_FAILURE);
      }
      cudaEventRecord(time3, 0);

      // ---- GPU -> CPU transfer ----
      // Copy output (results) from GPU buffer to host (CPU) memory.
      ExitOnCudaError(cudaMemcpy(CopyImg, GPUResult, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy GPU to CPU failed!");
      cudaEventRecord(time4, 0);

      cudaEventSynchronize(time1);
      cudaEventSynchronize(time2);
      cudaEventSynchronize(time3);
      cudaEventSynchronize(time4);

      cudaEventElapsedTime(&totalTime, time1, time4);
      cudaEventElapsedTime(&tfrCPUtoGPU, time1, time2);
      cudaEventElapsedTime(&kernelExecutionTime, time2, time3);
      cudaEventElapsedTime(&tfrGPUtoCPU, time3, time4);

      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "\n Program failed after cudaDeviceSynchronize()!");
          free(TheImg);
          free(CopyImg);
          exit(EXIT_FAILURE);
      }

      // ---- Output and report ----
      WriteBMPlin(CopyImg, OutputFileName); // Write the result image back to disk
      printf("\n--------------------------------------------------------------------------\n");
      printf("%s ComputeCapab=%d.%d [max %s blocks; %u thr/blk] \n",
             GPUprop.name, GPUprop.major, GPUprop.minor, SupportedBlocks, MaxThrPerBlk);
      printf("--------------------------------------------------------------------------\n");
      printf("%s %s %s %u [%u BLOCKS, %u BLOCKS/ROW]\n", ProgName, InputFileName, OutputFileName, ThrPerBlk, NumBlocks, BlkPerRow);
      printf("--------------------------------------------------------------------------\n");
      printf("--------------------------------------------------------------------------\n");
      printf("CPU->GPU Transfer =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", tfrCPUtoGPU, (ul)DATAMB(imgBytes), DATABW(imgBytes, tfrCPUtoGPU));
      printf("Kernel Execution =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", kernelExecutionTime, (ul)DATAMB(GPUDataTransfer), DATABW(GPUDataTransfer, kernelExecutionTime));
      printf("GPU->CPU Transfer =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", tfrGPUtoCPU, (ul)DATAMB(imgBytes), DATABW(imgBytes, tfrGPUtoCPU));
      printf("--------------------------------------------------------------------------\n");
      printf("Total time elapsed =%7.2f ms %4lu MB ... %6.2f GB/s\n", totalTime, (ul)DATAMB((2*imgBytes+GPUDataTransfer)), DATABW((2 * imgBytes + GPUDataTransfer), totalTime));
      printf("--------------------------------------------------------------------------\n\n");

      // ---- Cleanup ----
      // Deallocate CPU, GPU memory and destroy events.
      cudaFree(GPUImg);
      cudaFree(GPUCopyImg);
      cudaEventDestroy(time1);
      cudaEventDestroy(time2);
      cudaEventDestroy(time3);
      cudaEventDestroy(time4);
      // cudaDeviceReset must be called before exiting in order for profiling and
      // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
      cudaStatus = cudaDeviceReset();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaDeviceReset failed!");
          free(TheImg);
          free(CopyImg);
          exit(EXIT_FAILURE);
      }
      free(TheImg);
      free(CopyImg);
      return(EXIT_SUCCESS);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement