Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <stdint.h>
- #include <string.h>
- #include <iostream>
- #include <ctype.h>
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include "cuda.h"
- #include "math.h"
// Integer ceiling of a/b for positive operands. Every argument is
// parenthesized so that expression arguments (e.g. CEIL(n, x+y)) expand
// with the intended precedence.
#define CEIL(a,b)            (((a)+(b)-1)/(b))
// Exchange a and b using t as scratch storage. Wrapped in do/while(0) so the
// macro acts as ONE statement (safe inside an unbraced if/else); call sites
// must end with a semicolon.
#define SWAP(a,b,t)          do { (t)=(b); (b)=(a); (a)=(t); } while (0)
// Byte count -> whole mebibytes (integer division, truncating).
#define DATAMB(bytes)        ((bytes)/1024/1024)
// Throughput from a byte count and an elapsed time in milliseconds:
// bytes / (ms * 2^30 / 1000), i.e. 2^30-byte units per second.
#define DATABW(bytes,timems) ((float)(bytes)/((timems) * 1.024*1024.0*1024.0))

typedef unsigned char uch;
typedef unsigned long ul;
typedef unsigned int  ui;

uch *TheImg, *CopyImg;                 // Where images are stored in CPU
uch *GPUImg, *GPUCopyImg, *GPUResult;  // Where images are stored in GPU

// Properties of the currently loaded BMP image (single global instance `ip`).
struct ImgProp{
    int Hpixels;         // image width in pixels, taken from header bytes 18..21
    int Vpixels;         // image height in pixels, taken from header bytes 22..25
    int width;           // NOTE(review): not written by the code in this file -- confirm intended use
    int height;          // NOTE(review): not written by the code in this file -- confirm intended use
    uch HeaderInfo[54];  // copy of the 54-byte BMP header, re-emitted on write
    ul Hbytes;           // bytes per image row, rounded up to a multiple of 4
} ip;

// Shorthand accessors for the global image properties.
#define IPHB      ip.Hbytes
#define IPH       ip.Hpixels
#define IPV       ip.Vpixels
#define IMAGESIZE (IPHB*IPV)   // pixel-data size in bytes (padded rows x rows)
#define IMAGEPIX  (IPH*IPV)    // number of pixels
// Rotate: work-in-progress image rotation kernel.
//
// Launch layout: 1D grid of 1D blocks. The index math assumes each image row
// is covered by BlkPerRow consecutive blocks, so
//   row = blockIdx.x / BlkPerRow   and   X = global thread id within that row.
//
// Parameters:
//   ImgDst, ImgSrc : destination / source image buffers (currently unused)
//   FS             : image size argument supplied by the host (currently unused)
//   RotAngle       : rotation angle supplied by the host (currently unused)
//   width, height  : image dimensions in pixels
//   BlkPerRow      : number of blocks assigned to one image row
//
// Current behavior: the kernel only computes per-thread coordinates and the
// image center, and prints them from a single thread (block 400, thread 1).
// It does NOT read ImgSrc or write ImgDst yet.
__global__
void Rotate(uch *ImgDst, uch *ImgSrc, ui FS ,int RotAngle , int width , int height, int BlkPerRow)
{
    // A non-positive BlkPerRow would make the row computation divide by zero.
    if (BlkPerRow <= 0) return;

    ui ThrPerBlk = blockDim.x;
    ui MYbid     = blockIdx.x;
    ui MYtid     = threadIdx.x;
    ui MYgtid    = ThrPerBlk * MYbid + MYtid;

    ui row       = MYbid / BlkPerRow;
    ui ThrPerRow = ThrPerBlk * BlkPerRow;

    // Column within the row and the row index itself.
    ui X = MYgtid - (ThrPerRow * row);
    ui Y = row;

    // Image center. The vertical center must come from the height
    // (it was previously computed from the width).
    ui cWidth  = (width / 2);
    ui cHeight = (height / 2);

    // Placeholder input coordinates until the transform is implemented.
    // TODO: translate to center-relative coordinates, e.g.
    //   Xin = (X - cWidth);  Yin = (Y - cHeight);
    ul Xin = (0);
    ul Yin = (1);

    // Debug output from one arbitrary thread; format specifiers match the
    // unsigned / unsigned long argument types.
    if(MYbid == 400 && MYtid == 1){
        printf("X: %u - Y: %u\n",X,Y);
        printf("X: %u - cWid: %u -- Y: %u - cHei: %u\n", X,cWidth,Y, cHeight);
        printf("Xin: %lu - Yin: %lu \n", Xin, Yin);
    }
}
// Read a 24-bit/pixel BMP file into a 1D linear array.
// Allocate memory to store the 1D image and return its pointer.
//
// Side effects: fills the global `ip` (Hpixels, Vpixels, Hbytes, HeaderInfo)
// from the 54-byte file header.
// Returns: pointer to a malloc'ed buffer of IMAGESIZE bytes (caller frees),
//          or NULL if the buffer cannot be allocated.
// Exits the program if the file cannot be opened, the header is truncated,
// the header reports non-positive dimensions, or the pixel data is shorter
// than the header implies.
uch *ReadBMPlin(char* fn)
{
    uch *Img;
    uch HeaderInfo[54];
    int width, height;

    FILE* f = fopen(fn, "rb");
    if (f == NULL){ printf("\n\n%s NOT FOUND\n\n", fn); exit(EXIT_FAILURE); }

    // read the 54-byte header; a short read means this is not a usable BMP
    if (fread(HeaderInfo, sizeof(uch), 54, f) != 54){
        printf("\n\n%s: cannot read the 54-byte BMP header\n\n", fn);
        fclose(f);
        exit(EXIT_FAILURE);
    }

    // extract image height and width from header; memcpy avoids the
    // misaligned int access a pointer cast into the byte array would perform
    memcpy(&width,  &HeaderInfo[18], sizeof(width));
    memcpy(&height, &HeaderInfo[22], sizeof(height));
    if ((width <= 0) || (height <= 0)){
        // a non-positive dimension would turn IMAGESIZE into a bogus huge value
        printf("\n\n%s: unsupported BMP dimensions (%d x %d)\n\n", fn, width, height);
        fclose(f);
        exit(EXIT_FAILURE);
    }
    ip.Hpixels = width;
    ip.Vpixels = height;
    // each row holds 3 bytes per pixel, rounded up to a multiple of 4 bytes
    ip.Hbytes  = ((ul)width * 3 + 3) & (~3UL);

    //save header for re-use
    memcpy(ip.HeaderInfo, HeaderInfo, 54);
    printf("\n Input File name: %17s (%d x %d) File Size=%lu", fn,
           ip.Hpixels, ip.Vpixels, (ul)IMAGESIZE);

    // allocate memory to store the main image (1 Dimensional array)
    Img = (uch *)malloc(IMAGESIZE);
    if (Img == NULL){
        fclose(f);          // do not leak the file handle on allocation failure
        return NULL;        // Cannot allocate memory
    }

    // read the image from disk
    if (fread(Img, sizeof(uch), IMAGESIZE, f) != (size_t)IMAGESIZE){
        printf("\n\n%s: pixel data is shorter than the header indicates\n\n", fn);
        free(Img);
        fclose(f);
        exit(EXIT_FAILURE);
    }
    fclose(f);
    return Img;
}
// Write the 1D linear-memory stored image into file.
//
// Emits the 54-byte header saved in the global `ip.HeaderInfo`, followed by
// IMAGESIZE bytes taken from Img.
// Exits the program if the file cannot be created or if any write (including
// the final flush performed by fclose) fails, so a truncated output file is
// never reported as success.
void WriteBMPlin(uch *Img, char* fn)
{
    FILE* f = fopen(fn, "wb");
    if (f == NULL){ printf("\n\nFILE CREATION ERROR: %s\n\n", fn); exit(1); }

    //write header
    if (fwrite(ip.HeaderInfo, sizeof(uch), 54, f) != 54){
        printf("\n\nFILE WRITE ERROR (header): %s\n\n", fn);
        fclose(f);
        exit(1);
    }
    //write data
    if (fwrite(Img, sizeof(uch), IMAGESIZE, f) != (size_t)IMAGESIZE){
        printf("\n\nFILE WRITE ERROR (pixel data): %s\n\n", fn);
        fclose(f);
        exit(1);
    }
    printf("\nOutput File name: %17s (%d x %d) File Size=%lu", fn, ip.Hpixels, ip.Vpixels, (ul)IMAGESIZE);

    // fclose flushes buffered data; a failure here means the file is incomplete
    if (fclose(f) != 0){
        printf("\n\nFILE WRITE ERROR (close): %s\n\n", fn);
        exit(1);
    }
}
// Program entry point.
//
// Command line: prog inputBMP outputBMP RotAngle [ThrPerBlk]
//   RotAngle  : integer angle in [-360, 360]
//   ThrPerBlk : threads per block in [32, 1024] (default 256)
//
// Flow: read the BMP into host memory, copy it to the GPU, launch the Rotate
// kernel, copy the result buffer back, write it to disk and print timing
// statistics gathered with CUDA events.
int main(int argc, char **argv)
{
    float totalTime, tfrCPUtoGPU, tfrGPUtoCPU, kernelExecutionTime; // GPU code run times
    cudaError_t cudaStatus, cudaStatus2;
    cudaEvent_t time1, time2, time3, time4;
    char InputFileName[255], OutputFileName[255], ProgName[255];
    ui BlkPerRow;
    ui ThrPerBlk = 256, NumBlocks, GPUDataTransfer;
    cudaDeviceProp GPUprop;
    // 64-bit: the product of the three grid dimensions does not fit in 32 bits
    unsigned long long SupportedKBlocks, SupportedMBlocks;
    int MaxThrPerBlk;
    char SupportedBlocks[100];
    int RotAngle = 0;

    if (argc == 3){
        printf("Please Set a value for rotation");
        exit(EXIT_FAILURE);
    }
    else if ((argc == 4) || (argc == 5)){
        if (argc == 5) ThrPerBlk = (ui)atoi(argv[4]);
        // bounded copies: arguments longer than the buffers are truncated
        // instead of overflowing them
        snprintf(ProgName, sizeof(ProgName), "%s", argv[0]);
        snprintf(InputFileName, sizeof(InputFileName), "%s", argv[1]);
        snprintf(OutputFileName, sizeof(OutputFileName), "%s", argv[2]);
        RotAngle = atoi(argv[3]);
        if (argc == 5) printf("Rotate Angle: %d - ThrPerBlk: %u", RotAngle , ThrPerBlk);
    }
    else{
        printf("\n\nUsage: ./rotate inputBMP outputBMP [RotAngle]");
        printf("\n\nExample: ./rotate infilename.bmp outname.bmp -75\n\n");
        printf("\n\nNothing executed ... Exiting ...\n\n");
        exit(EXIT_FAILURE);
    }

    if((RotAngle<-360) || (RotAngle>360)){
        printf("\nRotation angle of %d degrees is invalid ...\n",RotAngle);
        printf("\nPlease enter an angle between -360 and +360 degrees ...\n");
        printf("\n\nNothing executed ... Exiting ...\n\n");
        exit(EXIT_FAILURE);
    }
    // a negative command-line value wraps to a large unsigned number and is
    // rejected by the upper bound
    if ((ThrPerBlk < 32) || (ThrPerBlk > 1024)) {
        printf("Invalid ThrPerBlk option '%u'. Must be between 32 and 1024. \n", ThrPerBlk);
        exit(EXIT_FAILURE);
    }

    // Create CPU memory to store the input and output images
    TheImg = ReadBMPlin(InputFileName); // Read the input image if memory can be allocated
    if (TheImg == NULL){
        printf("Cannot allocate memory for the input image...\n");
        exit(EXIT_FAILURE);
    }
    CopyImg = (uch *)malloc(IMAGESIZE);
    if (CopyImg == NULL){
        free(TheImg);
        printf("Cannot allocate memory for the input image...\n");
        exit(EXIT_FAILURE);
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    int NumGPUs = 0;
    cudaStatus = cudaGetDeviceCount(&NumGPUs);
    if ((cudaStatus != cudaSuccess) || (NumGPUs == 0)){
        printf("\nNo CUDA Device is available\n");
        exit(EXIT_FAILURE);
    }
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        exit(EXIT_FAILURE);
    }
    cudaStatus = cudaGetDeviceProperties(&GPUprop, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }
    SupportedKBlocks = (unsigned long long)GPUprop.maxGridSize[0] * GPUprop.maxGridSize[1] * GPUprop.maxGridSize[2] / 1024;
    SupportedMBlocks = SupportedKBlocks / 1024;
    snprintf(SupportedBlocks, sizeof(SupportedBlocks), "%llu %c",
             (SupportedMBlocks >= 5) ? SupportedMBlocks : SupportedKBlocks,
             (SupportedMBlocks >= 5) ? 'M' : 'K');
    MaxThrPerBlk = GPUprop.maxThreadsPerBlock;

    cudaEventCreate(&time1);
    cudaEventCreate(&time2);
    cudaEventCreate(&time3);
    cudaEventCreate(&time4);

    cudaEventRecord(time1, 0); // Time stamp at the start of the GPU transfer

    // Allocate GPU buffer for the input and output images
    cudaStatus = cudaMalloc((void**)&GPUImg, IMAGESIZE);
    cudaStatus2 = cudaMalloc((void**)&GPUCopyImg, IMAGESIZE);
    if ((cudaStatus != cudaSuccess) || (cudaStatus2 != cudaSuccess)){
        fprintf(stderr, "cudaMalloc failed! Can't allocate GPU memory");
        exit(EXIT_FAILURE);
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(GPUImg, TheImg, IMAGESIZE, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy CPU to GPU failed!");
        exit(EXIT_FAILURE);
    }
    cudaEventRecord(time2, 0); // Time stamp after the CPU --> GPU tfr is done

    // Launch configuration.
    BlkPerRow = CEIL(IPH,ThrPerBlk);
    // NOTE(review): the kernel derives its row as blockIdx.x / BlkPerRow, which
    // matches a grid of IPV*BlkPerRow blocks; the grid below is sized from the
    // byte count instead. Confirm which layout is intended.
    NumBlocks = (ui)CEIL(IMAGESIZE,ThrPerBlk);
    printf("\n ---!!!! %u : %u !!!---- \n" , BlkPerRow , NumBlocks);

    // NOTE(review): Rotate does not write its destination buffer yet, so the
    // data copied back from GPUCopyImg below is whatever cudaMalloc returned.
    Rotate <<< NumBlocks, ThrPerBlk >>> (GPUCopyImg, GPUImg, (ui)IMAGESIZE , RotAngle , ip.Hpixels , ip.Vpixels , (int)BlkPerRow);
    // A bad launch configuration is only reported through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n\nKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }
    GPUResult = GPUCopyImg;
    GPUDataTransfer = (ui)(2*IMAGESIZE);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n\ncudaDeviceSynchronize returned error code %d after launching the kernel!\n", cudaStatus);
        exit(EXIT_FAILURE);
    }
    cudaEventRecord(time3, 0);

    // Copy output (results) from GPU buffer to host (CPU) memory.
    cudaStatus = cudaMemcpy(CopyImg, GPUResult, IMAGESIZE, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy GPU to CPU failed!");
        exit(EXIT_FAILURE);
    }
    cudaEventRecord(time4, 0);

    cudaEventSynchronize(time1);
    cudaEventSynchronize(time2);
    cudaEventSynchronize(time3);
    cudaEventSynchronize(time4);
    cudaEventElapsedTime(&totalTime, time1, time4);
    cudaEventElapsedTime(&tfrCPUtoGPU, time1, time2);
    cudaEventElapsedTime(&kernelExecutionTime, time2, time3);
    cudaEventElapsedTime(&tfrGPUtoCPU, time3, time4);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "\n Program failed after cudaDeviceSynchronize()!");
        free(TheImg);
        free(CopyImg);
        exit(EXIT_FAILURE);
    }

    WriteBMPlin(CopyImg, OutputFileName); // Write the result image back to disk

    // Report. Byte counts derived from IMAGESIZE are unsigned long, so they are
    // cast explicitly and printed with %lu.
    printf("\n--------------------------------------------------------------------------\n");
    printf("%s ComputeCapab=%d.%d [max %s blocks; %d thr/blk] \n",
           GPUprop.name, GPUprop.major, GPUprop.minor, SupportedBlocks, MaxThrPerBlk);
    printf("--------------------------------------------------------------------------\n");
    printf("%s %s %s %u [%u BLOCKS, %u BLOCKS/ROW]\n", ProgName, InputFileName, OutputFileName, ThrPerBlk, NumBlocks, BlkPerRow);
    printf("--------------------------------------------------------------------------\n");
    printf("--------------------------------------------------------------------------\n");
    printf("CPU->GPU Transfer =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", tfrCPUtoGPU, (unsigned long)DATAMB(IMAGESIZE), DATABW(IMAGESIZE, tfrCPUtoGPU));
    printf("Kernel Execution =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", kernelExecutionTime, (unsigned long)DATAMB(GPUDataTransfer), DATABW(GPUDataTransfer, kernelExecutionTime));
    printf("GPU->CPU Transfer =%7.2f ms ... %4lu MB ... %6.2f GB/s\n", tfrGPUtoCPU, (unsigned long)DATAMB(IMAGESIZE), DATABW(IMAGESIZE, tfrGPUtoCPU));
    printf("--------------------------------------------------------------------------\n");
    printf("Total time elapsed =%7.2f ms %4lu MB ... %6.2f GB/s\n", totalTime, (unsigned long)DATAMB((2*IMAGESIZE+GPUDataTransfer)), DATABW((2 * IMAGESIZE + GPUDataTransfer), totalTime));
    printf("--------------------------------------------------------------------------\n\n");

    // Deallocate CPU, GPU memory and destroy events.
    cudaFree(GPUImg);
    cudaFree(GPUCopyImg);
    cudaEventDestroy(time1);
    cudaEventDestroy(time2);
    cudaEventDestroy(time3);
    cudaEventDestroy(time4);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        free(TheImg);
        free(CopyImg);
        exit(EXIT_FAILURE);
    }
    free(TheImg);
    free(CopyImg);
    return(EXIT_SUCCESS);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement