Advertisement
Guest User

Untitled

a guest
Feb 21st, 2019
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.07 KB | None | 0 0
  1. /**
  2. * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
  3. * OSTATECZNY KOD
  4. * Please refer to the NVIDIA end user license agreement (EULA) associated
  5. * with this source code for terms and conditions that govern your use of
  6. * this software. Any use, reproduction, disclosure, or distribution of
  7. * this software and related documentation outside the terms of the EULA
  8. * is strictly prohibited.
  9. *
  10. */
  11.  
  12. /**
  13. * Matrix multiplication: C = A * B.
  14. * Host code.
  15. *
  16. * This sample implements matrix multiplication as described in Chapter 3
  17. * of the programming guide.
  18. * It has been written for clarity of exposition to illustrate various CUDA
  19. * programming principles, not with the goal of providing the most
  20. * performant generic kernel for matrix multiplication.
  21. *
  22. * See also:
  23. * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
  24. * in Proc. 2008 ACM/IEEE Conf. on Superconducting (SC '08),
  25. * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
  26. */
  27.  
  28. // System includes
  29. #define WIN32
  30. #include <stdio.h>
  31. #include <assert.h>
  32. #include "stdio.h"
  33. // CUDA runtime
  34. #include <cuda_runtime.h>
  35.  
  36. // Helper functions and utilities to work with CUDA
  37. #include "helper_functions.h"
  38.  
  39. /**
  40. * Matrix multiplication (CUDA Kernel) on the device: C = A * B
  41. * wA is A's width and wB is B's width
  42. */
  43. template <int BLOCK_SIZE> __global__ void
  44. matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
  45. {
  46. // Block index
  47. int bx = blockIdx.x;
  48. int by = blockIdx.y;
  49.  
  50. // Thread index
  51. int tx = threadIdx.x;
  52. int ty = threadIdx.y;
  53. // Index of the first sub-matrix of A processed by the block
  54. int aBegin = wA * BLOCK_SIZE * by * 2;
  55.  
  56. // Index of the last sub-matrix of A processed by the block
  57. int aEnd = aBegin + wA - 1;
  58.  
  59. // Step size used to iterate through the sub-matrices of A
  60. int aStep = BLOCK_SIZE;
  61.  
  62. // Index of the first sub-matrix of B processed by the block
  63. int bBegin = BLOCK_SIZE * bx;
  64.  
  65. // Step size used to iterate through the sub-matrices of B
  66. int bStep = BLOCK_SIZE * wB;
  67. // Csub is used to store the element of the block sub-matrix
  68. // that is computed by the thread
  69. float Csub = 0;
  70.  
  71. __syncthreads();
  72. // Loop over all the sub-matrices of A and B
  73. // required to compute the block sub-matrix
  74. for (int a = aBegin, b = bBegin;
  75. a <= aEnd;
  76. a += aStep, b += bStep)
  77. {
  78.  
  79. // Declaration of the shared memory array As used to
  80. // store the sub-matrix of A
  81.  
  82. // Load the matrices from device memory
  83. // to shared memory; each thread loads
  84. // one element of each matrix
  85. // Synchronize to make sure the matrices are loaded
  86.  
  87. // Multiply the two matrices together;
  88. // each thread computes one element
  89. // of the block sub-matrix
  90. #pragma unroll
  91.  
  92. for (int k = 0; k < BLOCK_SIZE; ++k)
  93. {
  94. Csub += A[aBegin + wA * ty + tx] * B[bBegin + wB * ty + tx];
  95. }
  96.  
  97. // Synchronize to make sure that the preceding
  98. // computation is done before loading two new
  99. // sub-matrices of A and B in the next iteration
  100. }
  101. __syncthreads();
  102. // Write the block sub-matrix to device memory;
  103. // each thread writes one element
  104. int c = wB * BLOCK_SIZE * by * 2 + BLOCK_SIZE * bx;
  105. C[c + wB * ty + tx] = Csub;
  106. //printf(" %f, %i, %i \n", Csub, c + wB * ty + tx, c + wB * ty + 10240 + tx);
  107.  
  108. }
  109.  
  110. void constantInit(float *data, int size, float val)
  111. {
  112. for (int i = 0; i < size; ++i)
  113. {
  114. data[i] = (float)rand() / RAND_MAX;
  115. }
  116. }
  117.  
  118. /**
  119. * Run a simple test of matrix multiplication using CUDA
  120. */
  121. int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
  122. {
  123. // Allocate host memory for matrices A and B and C
  124. unsigned int size_A = dimsA.x * dimsA.y;
  125. unsigned int mem_size_A = sizeof(float) * size_A;
  126. float *h_A = (float *)malloc(mem_size_A);
  127.  
  128. unsigned int size_B = dimsB.x * dimsB.y;
  129. unsigned int mem_size_B = sizeof(float) * size_B;
  130. float *h_B = (float *)malloc(mem_size_B);
  131.  
  132. dim3 dimsC(dimsB.x, dimsA.y, 1);
  133. unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  134. float *h_C = (float *)malloc(mem_size_C);
  135.  
  136.  
  137.  
  138. // Allocate device memory
  139. float *d_A, *d_B, *d_C;
  140.  
  141.  
  142. const float valB = 0.01f;
  143. constantInit(h_A, size_A, 1.0f);
  144. constantInit(h_B, size_B, valB);
  145.  
  146.  
  147. cudaError_t error;
  148. error = cudaMalloc((void **)&d_A, mem_size_A);
  149. error = cudaMalloc((void **)&d_B, mem_size_B);
  150. error = cudaMalloc((void **)&d_C, mem_size_C);
  151.  
  152. // copy host memory to device
  153. error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
  154.  
  155. error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
  156. // Setup execution parameters
  157. dim3 threads(block_size, block_size);
  158. dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y / 2);//Zmniejszam gridy o połowę
  159. // Create and start timer
  160. printf("Computing result using CUDA Kernel..%i \n", grid.y);
  161.  
  162. // Performs warmup operation using matrixMul CUDA kernel
  163. matrixMulCUDA<32> << < grid, threads >> > (d_C, d_A, d_B, dimsA.x, dimsB.x);
  164. printf("done\n");
  165.  
  166. cudaDeviceSynchronize();
  167.  
  168. // Allocate CUDA events that we'll use for timing
  169. cudaEvent_t start;
  170. error = cudaEventCreate(&start);
  171.  
  172.  
  173. // Record the start event
  174. error = cudaEventRecord(start, NULL);
  175.  
  176. // Execute the kernel
  177. int nIter = 100;
  178.  
  179. for (int j = 0; j < nIter; j++)
  180. {
  181. if (block_size == 16)
  182. {
  183. matrixMulCUDA<16> << < grid, threads >> > (d_C, d_A, d_B, dimsA.x, dimsB.x);
  184. }
  185. else
  186. {
  187. matrixMulCUDA<32> << < grid, threads >> > (d_C, d_A, d_B, dimsA.x, dimsB.x);
  188. }
  189.  
  190. }
  191.  
  192.  
  193. cudaEvent_t stop;
  194. error = cudaEventCreate(&stop);
  195.  
  196. // Record the stop event
  197. error = cudaEventRecord(stop, NULL);
  198.  
  199. // Wait for the stop event to complete
  200. error = cudaEventSynchronize(stop);
  201.  
  202.  
  203. float msecTotal = 0.0f;
  204. error = cudaEventElapsedTime(&msecTotal, start, stop);
  205.  
  206.  
  207. // Compute and print the performance
  208. float msecPerMatrixMul = msecTotal / nIter;
  209. double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
  210. double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  211. printf(
  212. "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
  213. gigaFlops,
  214. msecPerMatrixMul,
  215. flopsPerMatrixMul,
  216. threads.x * threads.y);
  217.  
  218. // Copy result from device to host
  219. error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
  220.  
  221. if (error != cudaSuccess)
  222. {
  223. printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
  224. exit(EXIT_FAILURE);
  225. }
  226.  
  227. printf("Checking computed result for correctness: ");
  228. bool correct = true;
  229.  
  230. //for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
  231. //{
  232. // if (fabs(h_C[i] - (dimsA.x * valB)) > 1e-3)
  233. // {
  234. // //printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-3\n", i, h_C[i], dimsA.x*valB);
  235. // correct = false;
  236. // }
  237. //}
  238.  
  239. //printf("%s\n", correct ? "OK" : "FAIL");
  240.  
  241. // Clean up memory
  242. free(h_A);
  243. free(h_B);
  244. free(h_C);
  245. cudaFree(d_A);
  246. cudaFree(d_B);
  247. cudaFree(d_C);
  248.  
  249. cudaDeviceReset();
  250.  
  251. }
  252.  
  253.  
  254. /**
  255. * Program main
  256. */
  257. int main(int argc, char **argv)
  258. {
  259. printf("[Matrix Multiply Using CUDA] - Starting...\n");
  260.  
  261. if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
  262. checkCmdLineFlag(argc, (const char **)argv, "?"))
  263. {
  264. printf("Usage -device=n (n >= 0 for deviceID)\n");
  265. printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
  266. printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
  267. printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
  268.  
  269. exit(EXIT_SUCCESS);
  270. }
  271.  
  272. // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
  273. int devID = 0;
  274. cudaError_t error;
  275. cudaDeviceProp deviceProp;
  276. error = cudaGetDevice(&devID);
  277. error = cudaGetDeviceProperties(&deviceProp, devID);
  278. // Use a larger block size for Fermi and above
  279. int block_size = (deviceProp.major < 2) ? 16 : 32;
  280.  
  281. int i = 512;
  282.  
  283. dim3 dimsA(i, i, 1);
  284. dim3 dimsB(i, i, 1);
  285.  
  286. printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
  287. int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
  288. std::cin >> i;
  289. exit(matrix_result);
  290. return 0;
  291.  
  292. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement