CUDA Matrix Multiplication
JRsz, Nov 15th, 2016

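/*
 * CUDA matrix multiplication benchmark.
 *
 * Multiplies two randomly filled matrices M (m_x rows by m_y columns) and N (n_x rows by
 * n_y columns) three ways: a sequential CPU loop used as the reference, a naive CUDA
 * kernel, and a tiled CUDA kernel that stages sub-blocks of M and N in shared memory.
 * CPU time is measured with std::chrono, GPU time with CUDA events, and each GPU result
 * is checked against the CPU result.
 *
 * Example build on Linux (the file name is assumed, it is not part of the original paste):
 *   nvcc -std=c++11 -o matmul matmul.cu
 */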
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <math.h>   // fabsf, used when verifying the GPU results
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool shredMem);
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y);
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y);
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width);

int main()
{
    bool match;
    int i, j, k, l, amount, m_x, m_y, n_x, n_y, sizeM, sizeN, sizeR;
    cudaError_t cudaStatus;

    float *matrM;
    float *matrN;
    float *matrR;
    float *matrVrfy;

    bool shared[2] = { false, true };
    //Note: a 64x64 block would be 4096 threads, which exceeds the CUDA limit of
    //1024 threads per block, so the last configuration cannot be launched as-is.
    int tile_width[3] = { 16, 32, 64 };
    int amountTiles = 1; //TODO: Change me back to three

    int dimensions[5][4] = { { 300, 1000, 1000, 500 },
                             { 30, 50, 50, 40 },
                             { 1000, 1000, 1000, 1000 },
                             { 50, 10000, 10000, 80 },
                             { 5000, 40, 40, 7000 } };
    int amountDimensions = 1;

    srand((unsigned int)time(NULL));

    for (i = 0; i < amountDimensions; i++)
    {
        m_x = dimensions[i][0];
        m_y = dimensions[i][1];
        n_x = dimensions[i][2];
        n_y = dimensions[i][3];

        //allocate memory for the one-dimensional representation of the 2D arrays
        sizeM = m_x * m_y;// x down and y right -> 300 rows and 1000 cols
        sizeN = n_x * n_y;// 1000 rows and 500 cols
        sizeR = m_x * n_y;
        matrM = (float*)malloc(sizeM * sizeof(float));
        matrN = (float*)malloc(sizeN * sizeof(float));
        matrR = (float*)malloc(sizeR * sizeof(float));
        matrVrfy = (float*)malloc(sizeR * sizeof(float));

        //Fill both matrices with random values
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 1000.0)); // random floats between 0 and 1000
        }

        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 1000.0)); // random floats between 0 and 1000
        }

        /*float tmp = 1.0;
        //Fill both matrices with sequential values (useful for debugging)
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = tmp;
            tmp++;
        }
        tmp = 1.0;
        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = tmp;
            tmp++;
        }*/

        //calc sequential version to confirm the GPU results
        auto start = std::chrono::high_resolution_clock::now();

        sequentialMul(matrVrfy, matrM, matrN, m_x, m_y, n_x, n_y);

        auto end = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double> diff = end - start;
        printf("Matrix Multiplication with M: %dx%d, N: %dx%d:\n", m_x, m_y, n_x, n_y);
        printf("CPU Time:            %f ms\n", diff.count() * 1000);

        for (j = 0; j < amountTiles; j++)
        {
            for (k = 0; k < 2; k++)
            {
                //Multiply both matrices on the GPU
                cudaStatus = matrMulCuda(matrR, matrM, matrN, m_x, m_y, n_x, n_y, tile_width[j], shared[k]);
                if (cudaStatus != cudaSuccess)
                {
                    fprintf(stderr, "matrMulCuda failed!\n");
                    return 1;
                }

                match = true;

                amount = m_x * n_y;
                for (l = 0; l < amount; l++)//300*500
                {
                    //printf("(l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);

                    //Compare with a small tolerance: the GPU accumulates the products in a
                    //different order than the CPU, so bit-exact equality cannot be expected.
                    float tol = 1e-3f * fabsf(matrVrfy[l]) + 1e-3f;
                    if (fabsf(matrR[l] - matrVrfy[l]) > tol)
                    {
                        printf("Result does not match! (l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);
                        match = false;

                        break;
                    }
                }

                if (match)
                {
                    //printf("Result matches sequential calculation.\n");
                }

                //Reset result for next use
                free(matrR);
                matrR = (float*)malloc(sizeR * sizeof(float));
            }

            printf("\n");
        }

        printf("\n");

        //free memory
        free(matrM);
        free(matrN);
        free(matrR);
        free(matrVrfy);
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    //printf("\n");
    //printf("Press Enter to exit program...");
    //getchar();

    return 0;
}

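// mulKernel: naive matrix multiplication, R = M * N.
// Each thread computes one element of R by reading a full row of M and a full column
// of N directly from global memory and accumulating the dot product.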
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int i;

    if ((row < m_x) && (col < n_y))
    {
        float tmp = 0.0;
        for (i = 0; i < m_y; i++)
        {
            tmp += matrM[row * m_y + i] * matrN[col + n_y * i];
        }

        matrR[row * n_y + col] = tmp;
    }
}

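// mulKernelSM: tiled matrix multiplication using dynamically allocated shared memory.
// The shared buffer passed at launch holds two tile_width x tile_width tiles, one for M
// and one for N. Each iteration of the outer loop stages one tile of each input into
// shared memory (zero-padding out-of-range elements), synchronizes, and accumulates the
// partial dot products before moving on to the next tile.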
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width)
{
    int i, j;
    extern __shared__ float shared[];
    float *matrM_sm = shared;
    float *matrN_sm = &shared[tile_width * tile_width];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int row = by * tile_width + ty;
    int col = bx * tile_width + tx;

    // The accumulator must survive across all tile iterations; resetting it inside the
    // loop would discard every partial sum except the one from the last tile.
    float tmp = 0.0;
    int limit = ceil(m_y / (float) tile_width);
    for (i = 0; i < limit; i++)
    {
        if (i * tile_width + tx < m_y && row < m_x)
            matrM_sm[ty * tile_width + tx] = matrM[row * m_y + (i * tile_width + tx)];
        else
            matrM_sm[ty * tile_width + tx] = 0.0;

        if (i * tile_width + ty < n_x && col < n_y)
            matrN_sm[ty * tile_width + tx] = matrN[col + (i * tile_width + ty) * n_y];
        else
            matrN_sm[ty * tile_width + tx] = 0.0;

        __syncthreads();

        for (j = 0; j < tile_width; j++)
            tmp += matrM_sm[ty * tile_width + j] * matrN_sm[j * tile_width + tx];

        __syncthreads();
    }

    if (row < m_x && col < n_y)
        matrR[row * n_y + col] = tmp;
}

// Helper function for using CUDA to multiply matrices M and N.
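// It allocates device buffers, copies M and N to the GPU, launches either mulKernel or
// mulKernelSM (selected by the shredMem flag), times the launch with CUDA events and
// copies the result R back into host memory.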
cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool shredMem)
{
    float *dev_matrR = 0;
    float *dev_matrM = 0;
    float *dev_matrN = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    cudaEvent_t start, stop;
    float elapsed = 0; // declared before the first goto so no initialization is skipped

    int sizeR = m_x * n_y;
    int sizeM = m_x * m_y;
    int sizeN = n_x * n_y;
    int shared = 2 * tile_width * tile_width * sizeof(float); // one tile of M plus one tile of N

    dim_block.x = tile_width;
    dim_block.y = tile_width;
    dim_block.z = 1;

    dim_grid.x = (n_y + dim_block.x - 1) / tile_width;
    dim_grid.y = (m_x + dim_block.y - 1) / tile_width;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices.
    cudaStatus = cudaMalloc((void**)&dev_matrR, sizeR * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrM, sizeM * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrN, sizeN * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrM, matrM, sizeM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_matrN, matrN, sizeN * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    if (shredMem)
    {
        cudaEventRecord(start);
        // Launch a kernel on the GPU with one thread for each element of the result.
        mulKernelSM<<<dim_grid, dim_block, shared>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y, tile_width);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }
    else
    {
        cudaEventRecord(start);
        // Launch a kernel on the GPU with one thread for each element of the result.
        mulKernel<<<dim_grid, dim_block>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }

    cudaEventElapsedTime(&elapsed, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (shredMem)
    {
        printf("GPU Time with SM:    %f ms, tile_width: %d\n", elapsed, tile_width);
    }
    else
    {
        printf("GPU Time without SM: %f ms, tile_width: %d\n", elapsed, tile_width);
    }

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "mulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching mulKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(matrR, dev_matrR, sizeR * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    cudaFree(dev_matrR);
    cudaFree(dev_matrM);
    cudaFree(dev_matrN);

    return cudaStatus;
}

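// sequentialMul: straightforward triple-loop CPU implementation used as the reference
// result. Each element of matrVrfy is the dot product of a row of M and a column of N.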
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y)
{
    int i, j, k, curPos;
    float tmp;

    for (i = 0; i < m_x; i++)//300
    {
        for (j = 0; j < n_y; j++)//500
        {
            curPos = i * n_y + j;

            tmp = 0.0;
            for (k = 0; k < m_y; k++)
            {
                tmp += matrM[i * m_y + k] * matrN[j + n_y * k];
            }

            matrVrfy[curPos] = tmp;
        }
    }
}