Advertisement
desdemona

cuda po raz ostatni

Jun 2nd, 2015
671
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 22.97 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4. #include "helper_cuda.h"
  5.  
  6. #include <stdlib.h>
  7. #include <stdio.h>
  8. #include <memory>
  9. #include <iostream>
  10. #include <cstdio>
  11. #include <cstdlib>
  12. #include <ctime>
  13.  
  14. #include "bitmap_image.hpp"
  15. #include "csv_parser/csv_parser.hpp"
  16.  
  17. #ifndef __CUDACC__  
  18.     #define __CUDACC__
  19. #endif
  20.  
  21. const int Filter_Size = 7;
  22. const int Filter_Total_Size = 49;
  23.  
  24. __device__ __constant__ float g_filter[Filter_Total_Size];
  25. __device__ __constant__ float g_factor[2];
  26.  
  27. __device__ int buffadr(unsigned int x, unsigned int y, unsigned int imageWidth, unsigned int imageHeight)
  28. {
  29.     if (x >= imageWidth || y >= imageHeight)
  30.     {
  31.         return -1;
  32.     }
  33.     return y*imageWidth + x;
  34. }
  35.  
  36. __global__ void filterKernel(const unsigned char *inputColor, unsigned char *outputColor, unsigned int imageWidth, unsigned int imageHeight,
  37.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  38. {
  39.     __shared__ int blockXstart, blockYstart;
  40.     __shared__ int blockXend, blockYend;
  41.  
  42.     if (threadIdx.x == 0 && threadIdx.y == 0)
  43.     {
  44.         //policz i zapisz do shared czym zajmuje sie dany blok
  45.         blockXstart = blockIdx.x * blockPartWidth;
  46.         blockYstart = blockIdx.y * blockPartHeight;
  47.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  48.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  49.         if (blockXend > imageWidth)
  50.         {
  51.             blockXend = imageWidth;
  52.         }
  53.         if (blockYend > imageHeight)
  54.         {
  55.             blockYend = imageHeight;
  56.         }
  57.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  58.     }
  59.  
  60.     __syncthreads();
  61.     //teraz każdy wątek liczy sobie swój start i koniec
  62.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  63.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  64.     unsigned int threadXend = threadXstart + threadPartWidth;
  65.     unsigned int threadYend = threadYstart + threadPartHeight;
  66.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  67.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  68.  
  69.     int x_, y_, bufadr_;
  70.     int c;
  71.     for (int y = threadYstart; y < threadYend; y++)
  72.     {
  73.         for (int x = threadXstart; x < threadXend; x++)
  74.         {
  75.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  76.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  77.             {
  78.                 continue;
  79.             }
  80.             c = 0;
  81.             for (int j = -3; j < 4; j++)
  82.             {
  83.                 for (int i = -3; i < 4; i++)
  84.                 {
  85.                     y_ = y + j;
  86.                     x_ = x + i;
  87.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  88.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  89.                     {
  90.                         continue;
  91.                     }
  92.                     c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  93.                 }
  94.             }
  95.             c = g_factor[0] * c + g_factor[1];
  96.            
  97.             c = (c < 0) ? 0 : c;
  98.             c = (c > 255) ? 255 : c;
  99.             outputColor[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c;
  100.         }
  101.     }
  102.    
  103. }
  104.  
  105. __global__ void newFilterKernel(const unsigned char *inputR, unsigned char *outputR, const unsigned char *inputG,
  106.     unsigned char *outputG, const unsigned char *inputB, unsigned char *outputB, unsigned int imageWidth, unsigned int imageHeight,
  107.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  108. {
  109.     __shared__ int blockXstart, blockYstart;
  110.     __shared__ int blockXend, blockYend;
  111.  
  112.     if (threadIdx.x == 0 && threadIdx.y == 0)
  113.     {
  114.         //policz i zapisz do shared czym zajmuje sie dany blok
  115.         blockXstart = blockIdx.x * blockPartWidth;
  116.         blockYstart = blockIdx.y * blockPartHeight;
  117.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  118.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  119.         if (blockXend > imageWidth)
  120.         {
  121.             blockXend = imageWidth;
  122.         }
  123.         if (blockYend > imageHeight)
  124.         {
  125.             blockYend = imageHeight;
  126.         }
  127.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  128.     }
  129.  
  130.     __syncthreads();
  131.     //teraz każdy wątek liczy sobie swój start i koniec
  132.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  133.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  134.     unsigned int threadXend = threadXstart + threadPartWidth;
  135.     unsigned int threadYend = threadYstart + threadPartHeight;
  136.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  137.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  138.  
  139.     int x_, y_, bufadr_;
  140.     int c_r, c_g, c_b;
  141.     float fil;
  142.     for (int y = threadYstart; y < threadYend; y++)
  143.     {
  144.         for (int x = threadXstart; x < threadXend; x++)
  145.         {
  146.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  147.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  148.             {
  149.                 continue;
  150.             }
  151.             c_r = c_g = c_b = 0;
  152.             for (int j = -3; j < 4; j++)
  153.             {
  154.                 for (int i = -3; i < 4; i++)
  155.                 {
  156.                     fil = g_filter[((j + 3)*7) + i + 3];
  157.                     if(0.0 == fil)
  158.                     { continue; }
  159.                     y_ = y + j;
  160.                     x_ = x + i;
  161.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  162.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  163.                     { continue; }
  164.                     //c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  165.                     c_r += fil * (int)inputR[bufadr_];
  166.                     c_g += fil * (int)inputG[bufadr_];
  167.                     c_b += fil * (int)inputB[bufadr_];
  168.                 }
  169.             }
  170.             c_r = g_factor[0] * c_r + g_factor[1];
  171.             c_g = g_factor[0] * c_g + g_factor[1];
  172.             c_b = g_factor[0] * c_b + g_factor[1];
  173.  
  174.             c_r = (c_r < 0) ? 0 : c_r;
  175.             c_r = (c_r > 255) ? 255 : c_r;
  176.             outputR[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_r;
  177.  
  178.             c_g = (c_g < 0) ? 0 : c_g;
  179.             c_g = (c_g > 255) ? 255 : c_g;
  180.             outputG[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_g;
  181.  
  182.             c_b = (c_b < 0) ? 0 : c_b;
  183.             c_b = (c_b > 255) ? 255 : c_b;
  184.             outputB[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_b;
  185.         }
  186.     }
  187. }
  188.  
  189. void printHelp()
  190. {
  191.     std::cout << "HELP: \n";
  192.     std::cout << "exe filter_file_path image_file_path\n";
  193.     std::cout << "\n";
  194.     system("pause");
  195. }
  196.  
  197. void readDeviceAttributes(int devId, int &driverVersion, int &runtimeVersion, unsigned long long &totalGlobalMem,
  198.     int &multiProcessorCount, int &cudaCores, int &warpSize, unsigned int &totalConstantMem,
  199.     unsigned int &sharedMemPerBlock, int &maxThreadsPerBlock, int &maxThreadsPerMP)
  200. {
  201.     /* code from deviceQuery example*/
  202.     cudaSetDevice(devId);
  203.     cudaDeviceProp deviceProp;
  204.     cudaGetDeviceProperties(&deviceProp, devId);
  205.     printf("\nDevice %d: \"%s\"\n", devId, deviceProp.name);
  206.     cudaDriverGetVersion(&driverVersion);
  207.     cudaRuntimeGetVersion(&runtimeVersion);
  208.     printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
  209.     printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
  210.  
  211.     totalGlobalMem = (unsigned long long) deviceProp.totalGlobalMem;
  212.     printf("  Total amount of global memory: (%llu bytes)\n", totalGlobalMem);
  213.     printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
  214.         deviceProp.multiProcessorCount,
  215.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
  216.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
  217.  
  218.     printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
  219.     totalConstantMem = deviceProp.totalConstMem;
  220.     printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
  221.     sharedMemPerBlock = deviceProp.sharedMemPerBlock;
  222.     printf("  Warp size:                                     %d\n", deviceProp.warpSize);
  223.  
  224.     maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
  225.     maxThreadsPerMP = deviceProp.maxThreadsPerMultiProcessor;
  226.     multiProcessorCount = deviceProp.multiProcessorCount;
  227.     cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
  228.  
  229.     warpSize = deviceProp.warpSize;
  230.     printf("\n");
  231. }
  232.  
  233. int main(int argc, char **argv)
  234. {
  235.     clock_t start,end,kernelstart,kernelend;
  236.     start = clock();
  237.  
  238.     int devId = 0;
  239.     int warpSize, multiProcesorCount, cudaCores;
  240.     int runtimeVersion, driverVersion;
  241.     unsigned long long totalGlobalMem;
  242.     unsigned int sharedMemPerBlock, totalConstantMem;
  243.     int maxThreadsPerBlock, maxThreadsPerMP;
  244.  
  245.     cudaDeviceProp deviceProp;
  246.  
  247.     int status = EXIT_SUCCESS;
  248.     std::cout << argv[0] << " starting.. \n";
  249.     std::cout << "CUDA Image filtering - 03:40\n";
  250.  
  251.     unsigned char *g_r = 0; //graphics red
  252.     unsigned char *g_g = 0; //graphics green
  253.     unsigned char *g_b = 0; //graphics blue
  254.     unsigned char *g_or = 0; //graphics output red
  255.     unsigned char *g_og = 0; //graphics output green
  256.     unsigned char *g_ob = 0; //graphics output blue
  257.     cudaError_t cudaStatus;
  258.  
  259.     /* read cuda devices */
  260. #pragma region readDevices
  261.     int deviceCount = 0;
  262.     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  263.     if (error_id != cudaSuccess)
  264.     {
  265.         std::cout << "cudaGetDeviceCount returned " << (int)error_id << "->" << cudaGetErrorString(error_id) << "\n";
  266.         std::cout << "Result = FAIL\n";
  267.         exit(EXIT_FAILURE);
  268.     }
  269.     // This function call returns 0 if there are no CUDA capable devices.
  270.     if (deviceCount == 0)
  271.     {
  272.         std::cout << "There are no available device(s) that support CUDA\n";
  273.         std::cout << "Result = FAIL\n";
  274.         exit(EXIT_FAILURE);
  275.     }
  276.     else
  277.     {
  278.         std::cout << "Detected " << deviceCount << " CUDA Capable device(s)\n";
  279.     }
  280. #pragma endregion readDevices
  281.  
  282.     /* read input parameters */
  283. #pragma region readAndParseInput
  284.     if (argc < 3)
  285.     {
  286.         std::cout << "You passed " << argc << "arguments\n";
  287.         std::cout << "Not enough arguments passed to program\n";
  288.         printHelp();
  289.         std::cout << "Result = FAIL\n";
  290.         exit(EXIT_FAILURE);
  291.     }
  292.     else if(argc == 4 && deviceCount > 1)
  293.     {
  294.         std::cout << "You passed " << argc << " arguments and there are multiple devices to choose from\n";
  295.         int newDev = atoi( argv[3] );
  296.  
  297.         if(newDev >= deviceCount || newDev < 0)
  298.         {
  299.             std::cout << "No such device, program will run on first device found." << endl;
  300.         }
  301.         else
  302.         {
  303.             devId = newDev;
  304.             std::cout << "Device " << devId << " chosen." << endl;
  305.         }
  306.     }
  307.     float filter_value[49];
  308.     float factor[2];
  309.  
  310.     /* read filter file and load to constant mem */
  311.     const char field_terminator = ',';
  312.     const char line_terminator  = '\n';
  313.     const char enclosure_char   = '"';
  314.  
  315.     csv_parser file_parser;
  316.     file_parser.set_skip_lines(0);
  317.     file_parser.init(argv[1]);
  318.  
  319.     file_parser.set_enclosed_char(enclosure_char, ENCLOSURE_OPTIONAL); 
  320.     file_parser.set_field_term_char(field_terminator);
  321.     file_parser.set_line_term_char(line_terminator);
  322.  
  323.     std::string num;
  324.     float tmp;
  325.     for (int i = 0; i < 8; i++)
  326.     {
  327.         if(!file_parser.has_more_rows() && 7 != i)
  328.         {
  329.             std::cout << "Malformed csv filter\n";
  330.             std::cout << "line:" << i << std::endl;
  331.             printHelp();
  332.             std::cout << "Result = FAIL\n";
  333.             exit(EXIT_FAILURE);
  334.         }
  335.         csv_row row = file_parser.get_row();
  336.  
  337.         if (i < 7 && 7 == row.size())
  338.         {
  339.             for (int j = 0; j < 7; j++)
  340.             {
  341.                 num = std::string(row[j]);
  342.                 tmp = ::strtod(num.c_str(), 0);
  343.                 filter_value[i * 7 + j] = (float)tmp;
  344.             }
  345.         }
  346.         else if(7 == i && 2 == row.size())
  347.         {
  348.             num = std::string(row[0]);
  349.             tmp = ::strtod(num.c_str(), 0);
  350.             factor[0] = (float)tmp;
  351.             num = std::string(row[1]);
  352.             tmp = ::strtod(num.c_str(), 0);
  353.             factor[1] = (float)tmp;
  354.         }
  355.         else
  356.         {
  357.             std::cout << "Malformed csv filter\n";
  358.             std::cout << "line: " << i << std::endl;
  359.             std::cout << "row size: " << row.size() << endl;
  360.             printHelp();
  361.             std::cout << "Result = FAIL\n";
  362.             exit(EXIT_FAILURE);
  363.         }
  364.     }
  365.  
  366.     std::cout << "Filter from file " << argv[1] << " read succesfully \n";
  367.  
  368.     /* read bmp from file to gpu global mem */
  369.  
  370.     std::string file_name(argv[2]);
  371.     bitmap_image image(file_name);
  372.     if (!image)
  373.     {
  374.         std::cout << "Failed to open image " << file_name.c_str() << "\n";
  375.         printHelp();
  376.         std::cout << "Result = FAIL\n";
  377.         exit(EXIT_FAILURE);
  378.     }
  379.  
  380.     const unsigned int imgWidth = image.width();
  381.     const unsigned int imgHeight = image.height();
  382.     std::cout << "Loaded image " << imgWidth << "X" << imgHeight << " from file " << file_name.c_str() << " \n";
  383.  
  384.     unsigned int imgSize = imgWidth * imgHeight;
  385.     unsigned int imgSizeBytes = sizeof(unsigned char)* 3 * imgSize;
  386.  
  387.     /* check if filer fits into constant mem, load */
  388.     /* it will not, constant memory is too small*/
  389.     int neededConstMem = sizeof(float)* Filter_Total_Size;
  390.     /* check if image fits into global memory, alocate output tables */
  391.     int neededGlobalMem = 2 * imgSizeBytes;
  392.     std::cout << "Application will need total of " << neededGlobalMem <<  " bytes in global memory \n";
  393.     std::cout << "Application will need total of " << neededConstMem << " bytes in constant memory \n";
  394. #pragma endregion readAndParseInput
  395.  
  396. #pragma region readDeviceProperties
  397.     readDeviceAttributes(devId, driverVersion, runtimeVersion, totalGlobalMem, multiProcesorCount, cudaCores,
  398.         warpSize, totalConstantMem, sharedMemPerBlock, maxThreadsPerBlock, maxThreadsPerMP);
  399.  
  400.     if ((unsigned int)neededConstMem > totalConstantMem || neededGlobalMem > totalGlobalMem)
  401.     {
  402.         std::cout << "Not enough memory. Try smaller image or get better device. \n";
  403.         printHelp();
  404.         std::cout << "Result = FAIL\n";
  405.         exit(EXIT_FAILURE);
  406.     }
  407. #pragma endregion readDeviceProperties
  408.  
  409. #pragma region allocateAndCopyMem
  410.     unsigned char *r = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  411.     unsigned char *g = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  412.     unsigned char *b = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  413.  
  414.     unsigned char red, green, blue;
  415.     for (std::size_t i = 0; i < imgHeight; i++)
  416.     {
  417.         for (std::size_t j = 0; j < imgWidth; j++)
  418.         {
  419.             image.get_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  420.             r[(int)i * (int)imgWidth + (int)j] = red;
  421.             g[(int)i * (int)imgWidth + (int)j] = green;
  422.             b[(int)i * (int)imgWidth + (int)j] = blue;
  423.         }
  424.     }
  425.  
  426.     std::cout << "Allocated mem for rgb tables. " << imgSizeBytes << " bytes in total \n";
  427.  
  428.     cudaStatus = cudaSetDevice(devId);
  429.     cudaGetDeviceProperties(&deviceProp, devId);
  430.     if (cudaStatus != cudaSuccess) {
  431.         std::cout << "cudaSetDevice failed! Do you have a CUDA - capable GPU installed ?\n";
  432.         status = EXIT_FAILURE;
  433.         goto Error;
  434.     }
  435.  
  436.     // Allocate GPU buffers for six vectors (3 input, 3 output).
  437.     cudaStatus = cudaMalloc((void**)&g_r, sizeof(unsigned char)* imgSize);
  438.     if (cudaStatus != cudaSuccess) {
  439.         std::cout << "cudaMalloc failed!\n";
  440.         status = EXIT_FAILURE;
  441.         goto Error;
  442.     }
  443.     cudaStatus = cudaMalloc((void**)&g_g, sizeof(unsigned char)* imgSize);
  444.     if (cudaStatus != cudaSuccess) {
  445.         std::cout << "cudaMalloc failed!\n";
  446.         status = EXIT_FAILURE;
  447.         goto Error;
  448.     }
  449.     cudaStatus = cudaMalloc((void**)&g_b, sizeof(unsigned char)* imgSize);
  450.     if (cudaStatus != cudaSuccess) {
  451.         std::cout << "cudaMalloc failed!\n";
  452.         status = EXIT_FAILURE;
  453.         goto Error;
  454.     }
  455.     cudaStatus = cudaMalloc((void**)&g_or, sizeof(unsigned char)* imgSize);
  456.     if (cudaStatus != cudaSuccess) {
  457.         std::cout << "cudaMalloc failed!\n";
  458.         status = EXIT_FAILURE;
  459.         goto Error;
  460.     }
  461.     cudaStatus = cudaMalloc((void**)&g_og, sizeof(unsigned char)* imgSize);
  462.     if (cudaStatus != cudaSuccess) {
  463.         std::cout << "cudaMalloc failed!\n";
  464.         status = EXIT_FAILURE;
  465.         goto Error;
  466.     }
  467.     cudaStatus = cudaMalloc((void**)&g_ob, sizeof(unsigned char)* imgSize);
  468.     if (cudaStatus != cudaSuccess) {
  469.         std::cout << "cudaMalloc failed!\n";
  470.         status = EXIT_FAILURE;
  471.         goto Error;
  472.     }
  473.  
  474.     std::cout << "Space for input and output images mallocked.\n";
  475.  
  476.     // Copy input vectors from host memory to GPU buffers.
  477.     cudaStatus = cudaMemcpy(g_r, r, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  478.     if (cudaStatus != cudaSuccess) {
  479.         std::cout << "cudaMemcpy failed!\n";
  480.         status = EXIT_FAILURE;
  481.         goto Error;
  482.     }
  483.     cudaStatus = cudaMemcpy(g_g, g, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  484.     if (cudaStatus != cudaSuccess) {
  485.         std::cout << "cudaMemcpy failed!\n";
  486.         status = EXIT_FAILURE;
  487.         goto Error;
  488.     }
  489.     cudaStatus = cudaMemcpy(g_b, b, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  490.     if (cudaStatus != cudaSuccess) {
  491.         std::cout << "cudaMemcpy failed!\n";
  492.         status = EXIT_FAILURE;
  493.         goto Error;
  494.     }
  495.     std::cout << "Input data coppied into global mem.\n";
  496.     cudaStatus = cudaMemcpyToSymbol(g_filter, filter_value, sizeof(float)* Filter_Total_Size, 0, cudaMemcpyHostToDevice);
  497.     if (cudaStatus != cudaSuccess) {
  498.         std::cout << "cudaMemcpyToSymbol failed!\n";
  499.         status = EXIT_FAILURE;
  500.         goto Error;
  501.     }
  502.     cudaMemcpyToSymbol(g_factor, factor, sizeof(float)*2,0,cudaMemcpyHostToDevice);
  503.     getLastCudaError("Factor copy failed");
  504.     checkCudaErrors(cudaDeviceSynchronize());
  505.  
  506.     std::cout << "Filter data coppied into constant mem.\n";
  507.  
  508. #pragma endregion allocateAndCopyMem
  509.  
  510. #pragma region divideImage
  511.     /*
  512.     int warpSize, multiProcesorCount, cudaCores;
  513.     int runtimeVersion, driverVersion;
  514.     unsigned long long totalGlobalMem;
  515.     unsigned int sharedMemPerBlock, totalConstantMem;
  516.     int maxThreadsPerBlock, maxThreadsPerMP;
  517.     */
  518.  
  519.     float dev_score = (float)cudaCores / (float)warpSize;
  520.     float img_ratio = (float)imgWidth/(float)imgHeight;
  521.    
  522.     std::cout << "Image ratio: " << img_ratio << " cores/warp: " << dev_score << endl;
  523.  
  524.  
  525.     int blocksX = 64;
  526.     int blocksY = 64;
  527.     int threadsX = 8;// 8;
  528.     int threadsY = 8;// 8;
  529.  
  530.     float div_ratio = (float)threadsX / (float)threadsY;
  531.     //zaokrąglone w górę te podziały
  532.     unsigned int blockPartX = (imgWidth / blocksX) + (imgWidth%blocksX != 0);
  533.     unsigned int blockPartY = (imgHeight / blocksY) + (imgHeight%blocksY != 0);
  534.     //to wyjdzie zawsze conajmniej 1, chyba że wymiar obrazu to 0, a to już patola
  535.  
  536.     unsigned int threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  537.     unsigned int threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  538.     //tu też zawsze conajmniej 1
  539.     while(threadsX * threadsY < dev_score * warpSize && (threadsX * threadsY) < maxThreadsPerBlock)
  540.     {
  541.         if(img_ratio > div_ratio && 1 != threadPartX)
  542.         {
  543.             threadsX += 2;
  544.         }
  545.         else if(1 != threadPartY)
  546.         {
  547.             threadsY += 2;
  548.         }
  549.         else
  550.         {
  551.             div_ratio = (float)threadsX / (float)threadsY;
  552.             threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  553.             threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  554.             break;
  555.         }
  556.         div_ratio = (float)threadsX / (float)threadsY;
  557.         threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  558.         threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  559.         if(1 == threadPartX && 1 == threadPartY)
  560.         { break; }
  561.     }
  562.  
  563.     //64x64
  564.     dim3 blocksPerGrid(blocksX, blocksY);
  565.     //and 8x8 threads per block
  566.     dim3 threadsPerBlock(threadsX, threadsY);
  567.  
  568.     std::cout << blocksX << "X" << blocksY << " blocks, each block procesing " << blockPartX << "X" << blockPartY << "pixels.\n";
  569.     std::cout << threadsX << "X" << threadsY << " threads per block, each procesing " << threadPartX << "X" << threadPartY << "pixels.\n";
  570. #pragma endregion divideImage
  571.  
  572. #pragma region runKernels
  573.     /* do magic - run kernel */
  574.     kernelstart = clock();
  575.  
  576.     /*
  577.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  578.     getLastCudaError("Kernel execution failed");
  579.     checkCudaErrors(cudaDeviceSynchronize());
  580.     std::cout << "Red.\n";
  581.  
  582.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_g, g_og, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  583.     getLastCudaError("Kernel execution failed");
  584.     checkCudaErrors(cudaDeviceSynchronize());
  585.     std::cout << "Green.\n";
  586.  
  587.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  588.     getLastCudaError("Kernel execution failed");
  589.     checkCudaErrors(cudaDeviceSynchronize());
  590.     std::cout << "Blue.\n";
  591.     getLastCudaError("Kernel execution failed");
  592.     checkCudaErrors(cudaDeviceSynchronize());
  593.     */
  594.  
  595.     newFilterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, g_g, g_og, g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  596.     getLastCudaError("Kernel execution failed");
  597.     checkCudaErrors(cudaDeviceSynchronize());
  598.     std::cout << "Red.\n";  std::cout << "Green.\n"; std::cout << "Blue.\n";
  599.  
  600.     kernelend = clock();
  601.     //sekwencyjnie, hehe.
  602.     // Check if kernel execution generated an error
  603.  
  604.     /* start clock */
  605.  
  606.     /*Keep in mind that there is some driver overhead the first time you call
  607.     a particular kernel in your program, so when doing timing studies, you
  608.     should warm up your kernels by calling them once before your timer
  609.     starts.*/
  610.  
  611.     /* end clock */
  612.  
  613. #pragma endregion runKernels
  614.  
  615. #pragma region copyResults
  616.     // Copy output vector from GPU buffer to host memory.
  617.  
  618.     cudaStatus = cudaMemcpy(r, g_or, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  619.     if (cudaStatus != cudaSuccess) {
  620.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  621.         status = EXIT_FAILURE;
  622.         goto Error;
  623.     }
  624.     cudaStatus = cudaMemcpy(g, g_og, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  625.     if (cudaStatus != cudaSuccess) {
  626.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  627.         status = EXIT_FAILURE;
  628.         goto Error;
  629.     }
  630.     cudaStatus = cudaMemcpy(b, g_ob, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  631.     if (cudaStatus != cudaSuccess) {
  632.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  633.         status = EXIT_FAILURE;
  634.         goto Error;
  635.     }
  636. #pragma endregion copyResults
  637.  
  638.     /* assemble and save output image */
  639. #pragma region saveOutput
  640.     for (std::size_t i = 0; i < imgHeight; i++)
  641.     {
  642.         for (std::size_t j = 0; j < imgWidth; j++)
  643.         {
  644.             red = r[(int)i * (int)imgWidth + (int)j];
  645.             green = g[(int)i * (int)imgWidth + (int)j];
  646.             blue = b[(int)i * (int)imgWidth + (int)j];
  647.             image.set_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  648.         }
  649.     }
  650.     time_t timer = NULL;
  651.     time(&timer);
  652.     struct tm * timeinfo = localtime(&timer);
  653.     char matko[14];
  654.     strftime(matko, 14, "%y%m%d_%H%M%S", timeinfo);
  655.     std::cout << "Saving result image to " << file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp" << " file.\n";
  656.     image.save_image(file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp");
  657. #pragma endregion saveOutput
  658.  
  659.  
  660.     // cudaDeviceReset must be called before exiting in order for profiling and
  661.     // tracing tools such as Nsight and Visual Profiler to show complete traces.
  662.     cudaStatus = cudaDeviceReset();
  663.     if (cudaStatus != cudaSuccess) {
  664.         fprintf(stderr, "cudaDeviceReset failed!");
  665.         return 1;
  666.     }
  667.  
  668. /* free every mallocked space and exit */
  669. #pragma region freeAndExit
  670. Error:
  671.     cudaFree(g_r);
  672.     cudaFree(g_g);
  673.     cudaFree(g_b);
  674.     cudaFree(g_or);
  675.     cudaFree(g_og);
  676.     cudaFree(g_ob);
  677.     //cudaFree(g_filter);
  678.     free(r);
  679.     free(g);
  680.     free(b);
  681.     end = clock();
  682.     double total_diff = double(end - start) / CLOCKS_PER_SEC;
  683.     printf ("Total elapsed time is %.6lf seconds.\n", total_diff );
  684.     if(NULL != kernelstart && NULL != kernelend)
  685.     {
  686.         double kernel_diff = double(kernelend - kernelstart) / CLOCKS_PER_SEC;
  687.         printf ("Kernel functions elapsed time is %.6lf seconds.\n", kernel_diff );
  688.     }
  689.     system("pause");
  690.     exit(status);
  691. #pragma endregion freeAndExit
  692. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement