Advertisement
Guest User

Untitled

a guest
Feb 24th, 2017
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 15.21 KB | None | 0 0
  1. /**
  2. * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
  3. *
  4. * Please refer to the NVIDIA end user license agreement (EULA) associated
  5. * with this source code for terms and conditions that govern your use of
  6. * this software. Any use, reproduction, disclosure, or distribution of
  7. * this software and related documentation outside the terms of the EULA
  8. * is strictly prohibited.
  9. *
  10. */
  11.  
  12. /**
  13. * Matrix multiplication: C = A * B.
  14. * Host code.
  15. *
  16. * This sample implements matrix multiplication as described in Chapter 3
  17. * of the programming guide.
  18. * It has been written for clarity of exposition to illustrate various CUDA
  19. * programming principles, not with the goal of providing the most
  20. * performant generic kernel for matrix multiplication.
  21. *
  22. * See also:
  23. * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
  24. * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
  25. * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
  26. */
  27.  
  28. // System includes
  29. #define WIN32
  30. #include <stdio.h>
  31. #include <assert.h>
  32.  
  33. // CUDA runtime
  34. #include <cuda_runtime.h>
  35.  
  36. // Helper functions and utilities to work with CUDA
  37. #include <helper_functions.h>
  38.  
  39. /**
  40. * Matrix multiplication (CUDA Kernel) on the device: C = A * B
  41. * wA is A's width and wB is B's width
  42. */
  43.  
  44. /**
  45. badanie wpływu organizacji dostępu do pamięci globalnej na efektywność przetwarzania (wersje 2,3,4 ) –
  46. przygotować kody z dostępami łączonymi (w miarę możliwości)
  47. 2. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia przy wykorzystaniu
  48. pamięci globalnej,
  49. 3. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia przy
  50. wykorzystaniu pamięci współdzielonej bloku wątków,
  51. 4. grid wieloblokowy, jeden wątek oblicza jeden element macierzy wynikowej, obliczenia danych przy
  52. wykorzystaniu pamięci współdzielonej bloku wątków ze zrównolegleniem obliczeń i pobierania danych z
  53. pamięci globalnej w ramach każdego bloku wątków,
  54. */
  55. template <int BLOCK_SIZE> __global__ void
  56. matrixMulCUDA_v2(float *C, float *A, float *B, int wA, int wB)
  57. {
  58.     int row;
  59.     row = blockIdx.y * blockDim.y + threadIdx.y;
  60.     int col;
  61.     col = blockIdx.x * blockDim.x + threadIdx.x;
  62.  
  63.     float Csub;
  64.     Csub = 0;
  65.  
  66.     for (int i = 0; i < wA; i++)
  67.     {
  68.         Csub += A[row*wA + i] * B[i + col*wA];
  69.     }
  70.  
  71.     C[row*wA + col] = Csub;
  72. }
  73.  
  74. template <int BLOCK_SIZE> __global__ void
  75. matrixMulCUDA_v3(float *C, float *A, float *B, int wA, int wB)
  76. {
  77.     int tx = threadIdx.x;
  78.     int ty = threadIdx.y;
  79.     int row;
  80.     row = blockIdx.y * blockDim.y + threadIdx.y;
  81.     int col;
  82.     col = blockIdx.x * blockDim.x + threadIdx.x;
  83.  
  84.     const int SUB_WIDTH = 32;
  85.     __shared__ float Ads[SUB_WIDTH][SUB_WIDTH];
  86.     __shared__ float Bds[SUB_WIDTH][SUB_WIDTH];
  87.  
  88.     float Csub;
  89.     Csub = 0;
  90.  
  91.     for (int i = 0; i < wA/SUB_WIDTH; i++)
  92.     {
  93.         Ads[tx][ty] = A[row*wA + i * SUB_WIDTH + tx];
  94.         Bds[tx][ty] = B[i * SUB_WIDTH + ty + col*wA];
  95.        
  96.         __syncthreads();
  97.  
  98.         for (int k = 0; k < SUB_WIDTH; k++)
  99.         {
  100.             Csub += Ads[tx][k] * Bds[k][ty];
  101.  
  102.         }
  103.         __syncthreads();
  104.     }
  105.  
  106.     C[row*wA + col] = Csub;
  107. }
  108.  
  109. template <int BLOCK_SIZE> __global__ void
  110. matrixMulCUDA_v4(float *C, float *A, float *B, int wA, int wB)
  111. {
  112.     int tx = threadIdx.x;
  113.     int ty = threadIdx.y;
  114.     int row;
  115.     row = blockIdx.y * blockDim.y + threadIdx.y;
  116.     int col;
  117.     col = blockIdx.x * blockDim.x + threadIdx.x;
  118.  
  119.     const int SUB_WIDTH = 16;
  120.     __shared__ float Ads[SUB_WIDTH][SUB_WIDTH];
  121.     __shared__ float Bds[SUB_WIDTH][SUB_WIDTH];
  122.  
  123.     __shared__ float A_shared[SUB_WIDTH][SUB_WIDTH];
  124.     __shared__ float B_shared[SUB_WIDTH][SUB_WIDTH];
  125.  
  126.     float Csub;
  127.     Csub = 0;
  128.  
  129.     //pierwsze pobranie danych z pamieci globalnej do pamieci wspoldzielonej 'A'
  130.     A_shared[tx][ty] = A[row*wA + 0 * SUB_WIDTH + tx];
  131.     B_shared[tx][ty] = B[0 * SUB_WIDTH + ty + col*wA];
  132.  
  133.     for (int i = 0; i < wA / SUB_WIDTH; i++)
  134.     {
  135.         //przepisanie danych z 'A' do pamiêci wspó³dzielonej 'B'
  136.         Ads[tx][ty] = A_shared[tx][ty];
  137.         Bds[tx][ty] = B_shared[tx][ty];
  138.  
  139.         __syncthreads();
  140.  
  141.         //pobranie kolejnego bloku danych z pamieci globalnej do 'A'
  142.         A_shared[tx][ty] = A[row*wA + i * SUB_WIDTH + tx];
  143.         B_shared[tx][ty] = B[i * SUB_WIDTH + ty + col*wA];
  144.  
  145.         //obliczenia na pamieci 'B'
  146.         for (int k = 0; k < SUB_WIDTH; k++)
  147.         {
  148.             Csub += Ads[tx][k] * Bds[k][ty];
  149.  
  150.         }
  151.         __syncthreads();
  152.     }
  153.  
  154.     C[row*wA + col] = Csub;
  155. }
  156.  
  157.  
  158. void constantInit(float *data, int size, float val)
  159. {
  160.     for (int i = 0; i < size; ++i) {
  161.         data[i] = val;
  162.     }
  163. }
  164.  
  165. /**
  166. * Run a simple test of matrix multiplication using CUDA
  167. */
  168. int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
  169. {
  170.     // Allocate host memory for matrices A and B
  171.     unsigned int size_A = dimsA.x * dimsA.y;
  172.     unsigned int mem_size_A = sizeof(float) * size_A;
  173.     float *h_A = (float *)malloc(mem_size_A);
  174.     unsigned int size_B = dimsB.x * dimsB.y;
  175.     unsigned int mem_size_B = sizeof(float) * size_B;
  176.     float *h_B = (float *)malloc(mem_size_B);
  177.  
  178.     // Initialize host memory
  179.     const float valB = 0.01f;
  180.     constantInit(h_A, size_A, 1.0f);
  181.     constantInit(h_B, size_B, valB);
  182.  
  183.     // Allocate device memory
  184.     float *d_A, *d_B, *d_C;
  185.  
  186.     // Allocate host matrix C
  187.     dim3 dimsC(dimsB.x, dimsA.y, 1);
  188.     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  189.     float *h_C = (float *)malloc(mem_size_C);
  190.  
  191.     if (h_C == NULL) {
  192.         fprintf(stderr, "Failed to allocate host matrix C!\n");
  193.         exit(EXIT_FAILURE);
  194.     }
  195.  
  196.     cudaError_t error;
  197.  
  198.     //error = cudaMallocHost((void **)&d_A, mem_size_A);
  199.     // error = cudaMalloc((void **)&d_A, mem_size_A);
  200.     error = cudaHostAlloc(&d_A, mem_size_A, cudaHostAllocMapped); //zmiany
  201.  
  202.     if (error != cudaSuccess) {
  203.         printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
  204.         exit(EXIT_FAILURE);
  205.     }
  206.  
  207.     //error = cudaMallocHost((void **)&d_B, mem_size_B);
  208.     // error = cudaMalloc((void **)&d_B, mem_size_B);
  209.     error = cudaHostAlloc(&d_B, mem_size_B, cudaHostAllocMapped); //zmiany
  210.  
  211.     if (error != cudaSuccess) {
  212.         printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
  213.         exit(EXIT_FAILURE);
  214.     }
  215.  
  216.     //error = cudaMallocHost((void **)&d_C, mem_size_C);
  217.     // error = cudaMalloc((void **)&d_C, mem_size_C);
  218.     error = cudaHostAlloc(&d_C, mem_size_B, cudaHostAllocMapped); //zmiany
  219.  
  220.     if (error != cudaSuccess) {
  221.         printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
  222.         exit(EXIT_FAILURE);
  223.     }
  224.  
  225.     cudaStream_t stream1, stream2, stream3;
  226.     cudaStreamCreate(&stream1);
  227.     cudaStreamCreate(&stream2);
  228.     cudaStreamCreate(&stream3);
  229.  
  230.     // copy host memory to device
  231.     //error = cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream1);
  232.     // error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
  233.     error = cudaHostGetDevicePointer(&d_A, h_A, 0); //zmiany
  234.  
  235.     if (error != cudaSuccess) {
  236.         printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
  237.         exit(EXIT_FAILURE);
  238.     }
  239.  
  240.     //error = cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream2);
  241.     // error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
  242.     error = cudaHostGetDevicePointer(&d_B, h_B, 0); //zmiany
  243.  
  244.     if (error != cudaSuccess) {
  245.         printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
  246.         exit(EXIT_FAILURE);
  247.     }
  248.  
  249.     // Setup execution parameters
  250.     dim3 threads(block_size, block_size);
  251.     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
  252.  
  253.     // Create and start timer
  254.     printf("Computing result using CUDA Kernel...\n");
  255.  
  256.     // Performs warmup operation using matrixMul CUDA kernel
  257.     int version = 3;
  258.     switch (version) {
  259.         case 2:
  260.             if (block_size == 16) {
  261.                 matrixMulCUDA_v2<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  262.             }
  263.             else {
  264.                 matrixMulCUDA_v2<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  265.             }
  266.             break;
  267.         case 3:
  268.             if (block_size == 16) {
  269.                 matrixMulCUDA_v3<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  270.             }
  271.             else {
  272.                 matrixMulCUDA_v3<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  273.             }
  274.             break;
  275.         case 4:
  276.             if (block_size == 16) {
  277.                 matrixMulCUDA_v4<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  278.             }
  279.             else {
  280.                 matrixMulCUDA_v4<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  281.             }
  282.             break;
  283.     }
  284.  
  285.     printf("done\n");
  286.  
  287.     cudaDeviceSynchronize();
  288.  
  289.     // Allocate CUDA events that we'll use for timing
  290.     cudaEvent_t start;
  291.     error = cudaEventCreate(&start);
  292.  
  293.  
  294.     if (error != cudaSuccess) {
  295.         fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
  296.         exit(EXIT_FAILURE);
  297.     }
  298.  
  299.     cudaEvent_t stop;
  300.     error = cudaEventCreate(&stop);
  301.  
  302.     if (error != cudaSuccess) {
  303.         fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
  304.         exit(EXIT_FAILURE);
  305.     }
  306.  
  307.     // Record the start event
  308.     error = cudaEventRecord(start, NULL);
  309.  
  310.     if (error != cudaSuccess) {
  311.         fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
  312.         exit(EXIT_FAILURE);
  313.     }
  314.  
  315.     // Execute the kernel
  316.     int nIter = 300;
  317.     switch (version) {
  318.         case 2:
  319.             for (int j = 0; j < nIter; j++) {
  320.                 if (block_size == 16) {
  321.                     matrixMulCUDA_v2<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  322.                 }
  323.                 else {
  324.                     matrixMulCUDA_v2<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  325.                 }
  326.             }
  327.             break;
  328.         case 3:
  329.             for (int j = 0; j < nIter; j++) {
  330.                 if (block_size == 16) {
  331.                     matrixMulCUDA_v3<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  332.                 }
  333.                 else {
  334.                     matrixMulCUDA_v3<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  335.                 }
  336.             }
  337.             break;
  338.         case 4:
  339.             for (int j = 0; j < nIter; j++) {
  340.                 if (block_size == 16) {
  341.                     matrixMulCUDA_v4<16> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  342.                 }
  343.                 else {
  344.                     matrixMulCUDA_v4<32> << < grid, threads >> >(d_C, d_A, d_B, dimsA.x, dimsB.x);
  345.                 }
  346.             }
  347.             break;
  348.     }
  349.  
  350.     // Record the stop event
  351.     error = cudaEventRecord(stop, NULL);
  352.     //dobra teraz mamy tak mala macierz ze mozemy to sami obliczyc, moze dowiemy sie co jest nei tak
  353.     if (error != cudaSuccess) {
  354.         fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
  355.         exit(EXIT_FAILURE);
  356.     }
  357.  
  358.     // Wait for the stop event to complete
  359.     error = cudaEventSynchronize(stop);
  360.  
  361.     if (error != cudaSuccess) {
  362.         fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
  363.         exit(EXIT_FAILURE);
  364.     }
  365.  
  366.     float msecTotal = 0.0f;
  367.     error = cudaEventElapsedTime(&msecTotal, start, stop);
  368.  
  369.     if (error != cudaSuccess) {
  370.         fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
  371.         exit(EXIT_FAILURE);
  372.     }
  373.  
  374.     // Compute and print the performance
  375.     float msecPerMatrixMul = msecTotal / nIter;
  376.     double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
  377.     double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  378.     printf(
  379.         "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
  380.         gigaFlops,
  381.         msecPerMatrixMul,
  382.         flopsPerMatrixMul,
  383.         threads.x * threads.y);
  384.  
  385.     // Copy result from device to host
  386.     //error = cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream3);
  387.     // error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
  388.     error = cudaHostGetDevicePointer(&h_C, d_C, 0); //zmiany
  389.  
  390.     if (error != cudaSuccess) {
  391.         printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
  392.         exit(EXIT_FAILURE);
  393.     }
  394.  
  395.     printf("Checking computed result for correctness: ");
  396.     bool correct = true;
  397.  
  398.     for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
  399.         if (fabs(h_C[i] - (dimsA.x * valB)) > 1e-5) {
  400.             printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], dimsA.x*valB);
  401.             correct = false;
  402.         }
  403.     }
  404.  
  405.     printf("%s\n", correct ? "OK" : "FAIL");
  406.  
  407.     // Clean up memory
  408.     free(h_A);
  409.     free(h_B);
  410.     free(h_C);
  411.     cudaFree(d_A);
  412.     cudaFree(d_B);
  413.     cudaFree(d_C);
  414.     //ZMIANY NADCHODZĄ #zmiany
  415.     cudaStreamDestroy(stream1);
  416.     cudaStreamDestroy(stream2);
  417.     cudaStreamDestroy(stream3);
  418.  
  419.     printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");
  420.  
  421.     cudaDeviceReset();
  422.  
  423.     if (correct) {
  424.         return EXIT_SUCCESS;
  425.     } else {
  426.         return EXIT_FAILURE;
  427.     }
  428. }
  429.  
  430.  
  431. /**
  432. * Program main
  433. */
  434. int main(int argc, char **argv)
  435. {
  436.     printf("[Matrix Multiply Using CUDA] - Starting...\n");
  437.  
  438.     if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
  439.         checkCmdLineFlag(argc, (const char **)argv, "?")) {
  440.         printf("Usage -device=n (n >= 0 for deviceID)\n");
  441.         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
  442.         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
  443.         printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
  444.  
  445.         exit(EXIT_SUCCESS);
  446.     }
  447.  
  448.     // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
  449.     int devID = 0;
  450.  
  451.     if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
  452.         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
  453.         cudaSetDevice(devID);
  454.     }
  455.  
  456.     cudaError_t error;
  457.     cudaDeviceProp deviceProp;
  458.     error = cudaGetDevice(&devID);
  459.  
  460.     if (error != cudaSuccess) {
  461.         printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
  462.     }
  463.  
  464.     error = cudaGetDeviceProperties(&deviceProp, devID);
  465.  
  466.     if (deviceProp.computeMode == cudaComputeModeProhibited){
  467.         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
  468.         exit(EXIT_SUCCESS);
  469.     }
  470.  
  471.     if (error != cudaSuccess){
  472.         printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
  473.     } else {
  474.         printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
  475.     }
  476.  
  477.     cudaGetDeviceProperties(&prop, 0);
  478.     if (!prop.canMapHostMemory) {
  479.         printf("Nie obsluguje zero-copy")
  480.         exit(0);
  481.     }
  482.     cudaSetDeviceFlags(cudaDeviceMapHost);
  483.  
  484.     // Use a larger block size for Fermi and above
  485.     int block_size = (deviceProp.major < 2) ? 16 : 32;
  486.     int multiplier = 8;
  487.     dim3 dimsA(multiplier * block_size, multiplier * block_size, 1);
  488.     dim3 dimsB(multiplier * block_size, multiplier * block_size, 1);
  489.  
  490.     // width of Matrix A
  491.     if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
  492.         dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  493.     }
  494.  
  495.     // height of Matrix A
  496.     if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
  497.         dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  498.     }
  499.  
  500.     // width of Matrix B
  501.     if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
  502.         dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  503.     }
  504.  
  505.     // height of Matrix B
  506.     if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
  507.         dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  508.     }
  509.  
  510.     if (dimsA.x != dimsB.y) {
  511.         printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
  512.             dimsA.x, dimsB.y);
  513.         exit(EXIT_FAILURE);
  514.     }
  515.  
  516.     printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
  517.  
  518.     int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
  519.     system("PAUSE");
  520.     exit(matrix_result);
  521. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement