SHARE
TWEET

Untitled

a guest May 19th, 2019 68 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. // System includes
  2. #define WIN32
  3. #include <stdio.h>
  4. #include <assert.h>
  5.  
  6. // CUDA runtime
  7. #include <cuda_runtime.h>
  8.  
  9. // Helper functions and utilities to work with CUDA
  10. #include <helper_functions.h>
  11.  
  12. const unsigned int VEC_SIZE = 1073741824;
  13.  
  14. //sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja z rozbieżnością wiązek
  15. __global__ void block_sum_rw(int* dane, int* wyniki) {
  16.  
  17.     //identyfikator pierwszej wartości czytanej zależny od  identyfikatora wątku i bloku (suma 2 elementów)
  18.     unsigned int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  19.  
  20.     //odstęp między wartościami sumowanymi elementów
  21.     for (unsigned int odstep = 1; odstep < 2 * blockDim.x; odstep *= 2) {
  22.         //wykluczenie wątków w kolejnych etapach co 2,4,8,...
  23.         if (threadIdx.x%odstep == 0)
  24.             // test rozmiaru danych, jeśli potrzebny
  25.             if (i + odstep < VEC_SIZE)
  26.                 dane[i] += dane[i + odstep];
  27.         // synchronizacja gotowości danych
  28.         __syncthreads();
  29.     }
  30.  
  31.     //  wątek 0 zapisuje wynik
  32.     if (threadIdx.x == 0) {
  33.         wyniki[blockIdx.x] = dane[i];
  34.     }
  35. }
  36.  
  37. //sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja bez rozbieżności wiązek
  38. __global__ void block_sum_brw(int* dane, int* wyniki) {
  39.  
  40.     //identyfikator pierwszej wartości czytanej zależny od  identyfikatora wątku i bloku (suma 2 elementów)
  41.     unsigned int i = blockIdx.x * 2 * blockDim.x + threadIdx.x;
  42.  
  43.     //odstęp między wartościami sumowanymi elementów
  44.     for (unsigned int odstep = blockDim.x; odstep > 0; odstep >>= 1) {
  45.         if (threadIdx.x < odstep) {
  46.             dane[i] += dane[i + odstep];
  47.         }
  48.         __syncthreads();
  49.     }
  50.  
  51.     //  wątek 0 zapisuje wynik
  52.     if (threadIdx.x == 0) {
  53.         wyniki[blockIdx.x] = dane[i];
  54.     }
  55. }
  56.  
  57. void constantInit(int* data, int val)
  58. {
  59.     for (unsigned int i = 0; i < VEC_SIZE; ++i)
  60.     {
  61.         data[i] = val;
  62.     }
  63. }
  64.  
  65. //1 - wersja RW, 3 - wersja BRW
  66. int reduceVector(int version, int block_size)
  67. {
  68.     // Calculate required grid size (in blocks)
  69.     int grid_size = VEC_SIZE / 2 / block_size;
  70.  
  71.     // Allocate host memory vector
  72.     unsigned long long vec_mem = sizeof(int) * VEC_SIZE;
  73.     unsigned long long res_mem = sizeof(int) * grid_size;
  74.     int* h_vec = (int*)malloc(vec_mem);
  75.     int* h_res = (int*)malloc(res_mem);
  76.  
  77.     // Initialize host memory
  78.     constantInit(h_vec, 1);
  79.  
  80.     // Allocate device memory
  81.     int* d_vec;
  82.     int* d_res;
  83.  
  84.     // Initialize device memory
  85.     cudaError_t error;
  86.     error = cudaMalloc((void **)&d_vec, vec_mem);
  87.     if (error != cudaSuccess)
  88.     {
  89.         printf("cudaMalloc d_vec returned error code %d, line(%d)\n", error, __LINE__);
  90.         exit(EXIT_FAILURE);
  91.     }
  92.     error = cudaMalloc((void **)&d_res, res_mem);
  93.     if (error != cudaSuccess)
  94.     {
  95.         printf("cudaMalloc d_res returned error code %d, line(%d)\n", error, __LINE__);
  96.         exit(EXIT_FAILURE);
  97.     }
  98.  
  99.     // copy host memory to device
  100.     error = cudaMemcpy(d_vec, h_vec, vec_mem, cudaMemcpyHostToDevice);
  101.     if (error != cudaSuccess)
  102.     {
  103.         printf("cudaMemcpy (d_vec,h_vec) returned error code %d, line(%d)\n", error, __LINE__);
  104.         exit(EXIT_FAILURE);
  105.     }
  106.     error = cudaMemcpy(d_res, h_res, res_mem, cudaMemcpyHostToDevice);
  107.     if (error != cudaSuccess)
  108.     {
  109.         printf("cudaMemcpy (d_res,h_res) returned error code %d, line(%d)\n", error, __LINE__);
  110.         exit(EXIT_FAILURE);
  111.     }
  112.  
  113.     // Create and start timer
  114.     printf("Computing result using CUDA Kernel...\n");
  115.  
  116.     // Performs operation using CUDA kernel
  117.     int turn = 0;
  118.         if (version == 1)
  119.         {
  120.             do
  121.             {
  122.                 if (turn == 0)
  123.                     block_sum_rw <<< grid_size, block_size >>> (d_vec, d_res);
  124.                 else
  125.                     block_sum_rw <<< grid_size, block_size >>> (d_res, d_vec);
  126.                 grid_size /= 2 * block_size;
  127.                 turn = abs(turn - 1);
  128.             } while (grid_size >= 1);
  129.         }
  130.         else
  131.         {
  132.             do
  133.             {
  134.                 if (turn == 0)
  135.                     block_sum_brw <<< grid_size, block_size >>> (d_vec, d_res);
  136.                 else
  137.                     block_sum_brw <<< grid_size, block_size >>> (d_res, d_vec);
  138.                 grid_size /= 2 * block_size;
  139.                 turn = abs(turn - 1);
  140.             } while (grid_size >= 1);
  141.         }
  142.  
  143.     printf("done\n");
  144.  
  145.     cudaDeviceSynchronize();
  146.  
  147.     // Copy result from device to host
  148.     error = cudaMemcpy(h_vec, d_vec, vec_mem, cudaMemcpyDeviceToHost);
  149.     if (error != cudaSuccess)
  150.     {
  151.         printf("cudaMemcpy (h_vec,d_vec) returned error code %d, line(%d)\n", error, __LINE__);
  152.         exit(EXIT_FAILURE);
  153.     }
  154.     error = cudaMemcpy(h_res, d_res, res_mem, cudaMemcpyDeviceToHost);
  155.     if (error != cudaSuccess)
  156.     {
  157.         printf("cudaMemcpy (h_res,d_res) returned error code %d, line(%d)\n", error, __LINE__);
  158.         exit(EXIT_FAILURE);
  159.     }
  160.  
  161.     printf("Checking computed result for correctness: ");
  162.     bool correct = true;
  163.  
  164.     if (turn == 1 && h_res[0] != VEC_SIZE)
  165.     {
  166.         printf("Error! Vector reduced to value: %d\n", h_res[0]);
  167.         correct = false;
  168.     }
  169.     else if (turn == 0 && h_vec[0] != VEC_SIZE)
  170.     {
  171.         printf("Error! Vector reduced to value: %d\n", h_vec[0]);
  172.         correct = false;
  173.     }
  174.  
  175.     printf("%s\n", correct ? "OK" : "FAIL");
  176.  
  177.     // Clean up memory
  178.     free(h_vec);
  179.     free(h_res);
  180.     cudaFree(d_vec);
  181.     cudaFree(d_res);
  182.  
  183.     cudaDeviceReset();
  184.  
  185.     if (correct)
  186.     {
  187.         return EXIT_SUCCESS;
  188.     }
  189.     else
  190.     {
  191.         return EXIT_FAILURE;
  192.     }
  193. }
  194.  
  195. /**
  196.  * Program main
  197.  */
  198. int main(int argc, char **argv)
  199. {
  200.     printf("[Reduction Using CUDA] - Starting...\n");
  201.  
  202.     // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
  203.     int devID = 0;
  204.  
  205.     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
  206.     {
  207.         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
  208.         cudaSetDevice(devID);
  209.     }
  210.  
  211.     cudaError_t error;
  212.     cudaDeviceProp deviceProp;
  213.     error = cudaGetDevice(&devID);
  214.  
  215.     if (error != cudaSuccess)
  216.     {
  217.         printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
  218.     }
  219.  
  220.     error = cudaGetDeviceProperties(&deviceProp, devID);
  221.  
  222.     if (deviceProp.computeMode == cudaComputeModeProhibited)
  223.     {
  224.         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
  225.         exit(EXIT_SUCCESS);
  226.     }
  227.  
  228.     if (error != cudaSuccess)
  229.     {
  230.         printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
  231.     }
  232.     else
  233.     {
  234.         printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
  235.     }
  236.  
  237.     // Define block size
  238.     int block_size = 512;
  239.     int version = 1; //1 - RW, 3 - BRW
  240.  
  241.     printf("Block size: %d\n", block_size);
  242.  
  243.     int reduce_result = reduceVector(version, block_size);
  244.  
  245.     exit(reduce_result);
  246. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top