////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

//
// Matrix multiplication: C = A * B.
// Host code.
//
// This sample implements matrix multiplication as described in Chapter 3
// of the programming guide and uses the CUBLAS library to demonstrate
// the best performance.

// SOME PRECAUTIONS:
// IF WE WANT TO CALCULATE A ROW-MAJOR MATRIX MULTIPLY C = A * B,
// WE JUST NEED TO CALL THE CUBLAS API IN REVERSE ORDER: cublasSgemm(B, A)!
// The reason is explained as follows:

// The CUBLAS library uses column-major storage, but C/C++ use row-major storage.
// When passing a matrix pointer to CUBLAS, the memory layout changes from
// row-major to column-major, which is equivalent to an implicit transpose.

// In the case of row-major C/C++ matrices A, B and a simple matrix multiplication
// C = A * B, we can't use the input order cublasSgemm(A, B) because of the
// implicit transpose. The actual result of cublasSgemm(A, B) is A(T) * B(T).
// If col(A(T)) != row(B(T)), i.e. row(A) != col(B), then A(T) and B(T) are not
// multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
// is a column-major CUBLAS matrix, i.e. C(T) in C/C++ terms, and we would need
// extra transpose code to convert it to a row-major C/C++ matrix.

// To solve the problem, consider our desired result C, a row-major matrix.
// In CUBLAS format it is actually C(T) (because of the implicit transpose).
// C = A * B implies C(T) = (A * B)(T) = B(T) * A(T). The CUBLAS matrices B(T)
// and A(T) happen to be the C/C++ matrices B and A (again because of the
// implicit transpose)! We therefore need no extra transpose code, only the
// swapped input order.
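
// For example (a sketch using this file's naming, with alpha = 1, beta = 0):
// given row-major A (hA x wA) and B (wA x wB) in device buffers d_A and d_B,
// a single call with swapped operands writes row-major C = A * B to d_C:
//
//     cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
//                 wB /* m */, hA /* n */, wA /* k */,
//                 &alpha, d_B, wB, d_A, wA,
//                 &beta,  d_C, wB);
//
// This is exactly the call pattern used in matrixMultiply() below.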
//
// CUBLAS provides high-performance matrix multiplication.
// See also:
// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
// in Proc. 2008 ACM/IEEE Conference on Supercomputing (SC '08),
// Piscataway, NJ: IEEE Press, 2008, Art. 31, pp. 1-11.
//

// Utilities and system includes
#include <assert.h>
#include <math.h>           // for fabs() in printDiff()
#include <helper_string.h>  // helper for shared functions common to CUDA Samples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

// CUDA and CUBLAS functions
#include <helper_functions.h>

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif

typedef struct _matrixSize      // widths and heights of matrices A, B, and C
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! @param C          reference data, computed but preallocated
//! @param A          matrix A as provided to device
//! @param B          matrix B as provided to device
//! @param hA         height of matrix A
//! @param wA         width of matrix A (and height of matrix B)
//! @param wB         width of matrix B
////////////////////////////////////////////////////////////////////////////////
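// Note: products are accumulated in double precision before the final float
// store, which keeps this CPU reference accurate enough for the L2 comparison.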
void
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j)
        {
            double sum = 0;

            for (unsigned int k = 0; k < wA; ++k)
            {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
            }

            C[i * wB + j] = (float)sum;
        }
}

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)

inline void checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions

// Fills a matrix with random float entries in [0, 1].
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
    int i, j, k;
    int error_count = 0;

    for (j = 0; j < height; j++)
    {
        if (error_count < iListLength)
        {
            printf("\n  Row %d:\n", j);
        }

        for (i = 0; i < width; i++)
        {
            k = j * width + i;
            float fDiff = fabs(data1[k] - data2[k]);

            if (fDiff > fListTol)
            {
                if (error_count < iListLength)
                {
                    printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
                }

                error_count++;
            }
        }
    }

    printf(" \n  Total Errors = %d\n", error_count);
}

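// Reads the optional command-line flags "device" and "sizemult" (parsed by the
// helper_string.h utilities in the usual CUDA Samples spelling, e.g.
// -device=1 -sizemult=5) and derives the matrix dimensions from the multiplier.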
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
    // By default we use device 0; otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get the ID of the device actually in use
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
    {
        iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
    }

    // clamp the size multiplier to [1, 10]
    iSizeMultiple = min(iSizeMultiple, 10);
    iSizeMultiple = max(iSizeMultiple, 1);

    cudaDeviceProp deviceProp;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    matrix_size.uiWA = 2 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 2 * block_size * iSizeMultiple;   // must equal uiWA for C = A * B to be defined
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
           matrix_size.uiWA, matrix_size.uiHA,
           matrix_size.uiWB, matrix_size.uiHB,
           matrix_size.uiWC, matrix_size.uiHC);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // setup execution parameters (not used by the CUBLAS path below)
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // number of timed iterations
    int nIter = 30;

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;

        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const float alpha = 1.0f;
        const float beta  = 0.0f;

        // Perform warmup operation with CUBLAS. Note the swapped operand
        // order (B before A) and ldc = uiWB, so that row-major C = A * B
        // comes out directly (see the header comment).
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                          matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
                          &alpha, d_B, matrix_size.uiWB,
                          d_A, matrix_size.uiWA,
                          &beta, d_C, matrix_size.uiWB);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        // Allocate CUDA events that we'll use for timing
        cudaEvent_t start;
        error = cudaEventCreate(&start);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        cudaEvent_t stop;
        error = cudaEventCreate(&stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Record the start event
        error = cudaEventRecord(start, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        for (int j = 0; j < nIter; j++)
        {
            // note CUBLAS is column-major, so we pass the operands in
            // swapped order (see the header comment)
            ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                              matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
                              &alpha, d_B, matrix_size.uiWB,
                              d_A, matrix_size.uiWA,
                              &beta, d_C, matrix_size.uiWB);

            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        printf("done.\n");

        // Record the stop event
        error = cudaEventRecord(stop, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
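        // Each of the uiHA * uiWB elements of C takes uiWA multiply-add pairs,
        // counted as 2 flops each, hence the factor of 2 in the count below.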
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

        if (error != cudaSuccess)
        {
            printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (!resCUBLAS)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", resCUBLAS ? "PASS" : "FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_CUBLAS);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();

    if (resCUBLAS)
    {
        return EXIT_SUCCESS;    // 0
    }
    else
    {
        return EXIT_FAILURE;    // 1
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");

    int devID = 0, sizeMult = 5;
    sMatrixSize matrix_size;

    initializeCUDA(argc, argv, devID, sizeMult, matrix_size);

    int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);

    exit(matrix_result);
}