Advertisement
JRsz

CUDA Matrix Transpose with Shared Memory

Nov 22nd, 2016
317
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 9.52 KB | None | 0 0
  1. #include "cuda_runtime.h"
  2. #include "device_functions.h"
  3. #include "device_launch_parameters.h"
  4.  
  5. #include <chrono>
  6. #include <time.h>
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9.  
  10. #define TILE_DIM 32
  11. #define BLOCK_ROWS 8
  12. #define BLOCK_COLS 32
  13.  
  14. cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
  15. void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
  16. __global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
  17. __global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
  18.  
  19. int main()
  20. {
  21.     int i, width, height, nreps, size, wrong, correct;
  22.     double cpuTime, cpuBandwidth;
  23.     cudaError_t cudaStatus;
  24.  
  25.     float *matrA, *matrATC, *matrATG, *matrAC;
  26.  
  27.     srand(time(NULL));
  28.  
  29.     nreps = 10000;
  30.     width = 500;
  31.     height = 100;
  32.     size = width * height;
  33.  
  34.     matrA = (float*)malloc(size * sizeof(float)); // matrix A
  35.     matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
  36.     matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
  37.     matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU
  38.  
  39.     for (i = 0; i < size; i++)
  40.     {
  41.         matrA[i] = (float)i;
  42.     }
  43.  
  44.     auto start = std::chrono::high_resolution_clock::now();
  45.  
  46.     //CPU Transpose
  47.     cpuMatrTrans(matrATC, matrA, width, height, nreps);
  48.  
  49.     auto end = std::chrono::high_resolution_clock::now();
  50.  
  51.     std::chrono::duration<double> diff = end - start;
  52.     cpuTime = (diff.count() * 1000) / nreps;
  53.     cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
  54.     printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
  55.  
  56.     correct = 0;
  57.     wrong = 0;
  58.  
  59.     //Naive transpose
  60.     matrMagicCuda(matrATG, matrA, width, height, nreps, 1);
  61.  
  62.     //Check if calc was correct
  63.     for (i = 0; i < size; i++)
  64.     {
  65.         if (matrATC[i] != matrATG[i])
  66.         {
  67.             /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
  68.             return;*/
  69.             wrong++;
  70.         }
  71.         else
  72.         {
  73.             correct++;
  74.         }
  75.     }
  76.  
  77.     printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
  78.     correct = 0;
  79.     wrong = 0;
  80.  
  81.     //Transpose with shared memory
  82.     matrMagicCuda(matrATG, matrA, width, height, nreps, 2);
  83.  
  84.     //Check if calc was correct
  85.     for (i = 0; i < size; i++)
  86.     {
  87.         if (matrATC[i] != matrATG[i])
  88.         {
  89.             /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
  90.             return;*/
  91.             wrong++;
  92.         }
  93.         else
  94.         {
  95.             correct++;
  96.         }
  97.     }
  98.  
  99.     //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
  100.     printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
  101.     correct = 0;
  102.     wrong = 0;
  103.  
  104.     // cudaDeviceReset must be called before exiting in order for profiling and
  105.     // tracing tools such as Nsight and Visual Profiler to show complete traces.
  106.     cudaStatus = cudaDeviceReset();
  107.     if (cudaStatus != cudaSuccess)
  108.     {
  109.         fprintf(stderr, "cudaDeviceReset failed!\n");
  110.         return 1;
  111.     }
  112.  
  113.     return 0;
  114. }
  115.  
  116. cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
  117. {
  118.     float elapsed = 0;
  119.     float *dev_matrA = 0;
  120.     float *dev_matrB = 0;
  121.     cudaError_t cudaStatus;
  122.     dim3 dim_grid, dim_block;
  123.     double gpuBandwidth;
  124.  
  125.     int size = width * height;
  126.  
  127.     dim_block.x = TILE_DIM;
  128.     dim_block.y = BLOCK_ROWS;
  129.     dim_block.z = 1;
  130.  
  131.     dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
  132.     dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
  133.     dim_grid.z = 1;
  134.  
  135.     // Choose which GPU to run on, change this on a multi-GPU system.
  136.     cudaStatus = cudaSetDevice(0);
  137.     if (cudaStatus != cudaSuccess)
  138.     {
  139.         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
  140.         goto Error;
  141.     }
  142.  
  143.     // Allocate GPU buffers for three matrix
  144.     cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
  145.     if (cudaStatus != cudaSuccess)
  146.     {
  147.         fprintf(stderr, "cudaMalloc failed!");
  148.         goto Error;
  149.     }
  150.  
  151.     cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
  152.     if (cudaStatus != cudaSuccess)
  153.     {
  154.         fprintf(stderr, "cudaMalloc failed!");
  155.         goto Error;
  156.     }
  157.  
  158.     // Copy input matrix from host memory to GPU buffers.
  159.     cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
  160.     if (cudaStatus != cudaSuccess)
  161.     {
  162.         fprintf(stderr, "cudaMemcpy failed!");
  163.         goto Error;
  164.     }
  165.  
  166.     cudaEvent_t start, stop;
  167.     cudaEventCreate(&start);
  168.     cudaEventCreate(&stop);
  169.  
  170.     switch (operation)
  171.     {
  172.         case(1):
  173.         {
  174.             cudaEventRecord(start);
  175.             // Launch a kernel on the GPU with one thread for each element.
  176.             naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);
  177.  
  178.             cudaEventRecord(stop);
  179.             cudaEventSynchronize(stop);
  180.  
  181.             cudaEventElapsedTime(&elapsed, start, stop);
  182.             cudaEventDestroy(start);
  183.             cudaEventDestroy(stop);
  184.  
  185.             elapsed /= nreps;
  186.  
  187.             gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
  188.             printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
  189.  
  190.             break;
  191.         }
  192.  
  193.         case(2):
  194.         {
  195.             cudaEventRecord(start);
  196.             // Launch a kernel on the GPU with one thread for each element.
  197.             notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);
  198.  
  199.             cudaEventRecord(stop);
  200.             cudaEventSynchronize(stop);
  201.  
  202.             cudaEventElapsedTime(&elapsed, start, stop);
  203.             cudaEventDestroy(start);
  204.             cudaEventDestroy(stop);
  205.  
  206.             elapsed /= nreps;
  207.  
  208.             gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
  209.             printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
  210.  
  211.             break;
  212.         }
  213.  
  214.     default:
  215.         printf("No matching opcode was found.\n");
  216.     }
  217.  
  218.     // Check for any errors launching the kernel
  219.     cudaStatus = cudaGetLastError();
  220.     if (cudaStatus != cudaSuccess)
  221.     {
  222.         fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  223.         goto Error;
  224.     }
  225.  
  226.     // cudaDeviceSynchronize waits for the kernel to finish, and returns
  227.     // any errors encountered during the launch.
  228.     cudaStatus = cudaDeviceSynchronize();
  229.     if (cudaStatus != cudaSuccess)
  230.     {
  231.         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
  232.         goto Error;
  233.     }
  234.  
  235.     // Copy output matrix from GPU buffer to host memory.
  236.     cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
  237.     if (cudaStatus != cudaSuccess)
  238.     {
  239.         fprintf(stderr, "cudaMemcpy failed!");
  240.         goto Error;
  241.     }
  242.  
  243. Error:
  244.     cudaFree(dev_matrB);
  245.     cudaFree(dev_matrA);
  246.  
  247.     return cudaStatus;
  248. }
  249.  
  250. void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
  251. {
  252.     int i, j, r;
  253.  
  254. #pragma unroll
  255.     for (r = 0; r < nreps; r++)
  256. #pragma unroll
  257.         for (i = 0; i < height; i++)
  258. #pragma unroll
  259.             for (j = 0; j < width; j++)
  260.                 matrB[j * height + i] = matrA[i * width + j];
  261. }
  262.  
  263. __global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
  264. {
  265.     int i, r;
  266.     int row = blockIdx.x * TILE_DIM + threadIdx.x;
  267.     int col = blockIdx.y * TILE_DIM + threadIdx.y;
  268.     int index_in = row + width * col;
  269.     int index_out = col + height * row;
  270.  
  271. #pragma unroll
  272.     for (r = 0; r < nreps; r++)
  273. #pragma unroll
  274.         for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
  275.             if (index_in + i * width < width * height)
  276.                 matrB[index_out + i] = matrA[index_in + i * width];
  277. }
  278.  
  279. __global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
  280. {
  281.     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
  282.     int blockIdx_y = blockIdx.x;
  283.     int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
  284.     int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
  285.     int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
  286.     int index_in = xIndex + (yIndex)* width;
  287.  
  288.     xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
  289.     yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
  290.     int index_out = xIndex + (yIndex)* height;
  291.  
  292.     int r, i;
  293. #pragma unroll
  294.     for (r = 0; r < nreps; r++)
  295.     {
  296. #pragma unroll
  297.         for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
  298.             tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
  299.  
  300.         __syncthreads();
  301.  
  302. #pragma unroll
  303.         for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
  304.             if (index_in + i * width < width * height)
  305.                matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
  306.     }
  307. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement