Advertisement
Guest User

Untitled

a guest
Apr 26th, 2018
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.41 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <cuda.h>
  4.  
  5. #define BLOCK_SIZE 16
  6.  
  7. // Matrices are stored in row-major order
  8.  
  9.  
  /*
   * Dense single-precision matrix descriptor.
   *
   * The code in this file addresses element (row, col) as
   * elements[row * width + col], i.e. rows are laid out one after another.
   */
  typedef struct Matrix {
      int width;        /* number of columns (length of one row) */
      int height;       /* number of rows */
      float* elements;  /* start of the width * height element buffer */
  } Matrix;
  16.  
  /*
   * Straightforward matrix product kernel, C = A * B.
   *
   * Thread mapping: the x launch dimension selects the column of C (and B),
   * the y launch dimension selects the row of C (and A). Each thread that
   * falls inside the A.height x B.width region accumulates one dot product
   * over A.width terms and stores it into C; all other threads do nothing.
   *
   * All three element pointers are dereferenced on the device, so they must
   * be valid device addresses. B is read with row stride B.width and C is
   * written with row stride C.width.
   */
  __global__ void MatrixMultKern(const Matrix A, const Matrix B, const Matrix C) {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;

      // Guard: threads past the bottom/right edge of the result exit early.
      if (row >= A.height || col >= B.width) {
          return;
      }

      // Dot product of row `row` of A with column `col` of B.
      float acc = 0;
      int k = 0;
      while (k < A.width) {
          acc += A.elements[row * A.width + k] * B.elements[k * B.width + col];
          ++k;
      }

      C.elements[row * C.width + col] = acc;
  }
  43.  
  44.  
  // Print a readable message and terminate when a CUDA runtime call fails.
  // `what` names the step that was being attempted.
  static void checkCuda(cudaError_t status, const char* what)
  {
      if (status != cudaSuccess) {
          fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  // Byte size of the element buffer of `m`; widened to size_t before
  // multiplying so large matrices do not overflow int arithmetic.
  static size_t matrixBytes(const Matrix m)
  {
      return (size_t)m.width * (size_t)m.height * sizeof(float);
  }

  // Matrix multiplication - Host Code
  //
  // Computes h_C = h_A * h_B on the GPU and prints the kernel time.
  //
  // h_A, h_B : host input matrices; h_A.width must equal h_B.height.
  // h_C      : host output matrix of shape h_A.height x h_B.width; its
  //            element buffer is overwritten with the product.
  //
  // Dimensions do NOT need to be multiples of BLOCK_SIZE: the grid is rounded
  // up and the kernel discards threads that fall outside the result.
  // If the shapes are inconsistent or any dimension is not positive, a
  // message is written to stderr and h_C is left untouched.
  // Any CUDA runtime failure terminates the program with a diagnostic.
  void MatrixMult(const Matrix h_A, const Matrix h_B, Matrix h_C)
  {
      if (h_A.width != h_B.height || h_C.height != h_A.height || h_C.width != h_B.width) {
          fprintf(stderr, "MatrixMult: incompatible shapes (%dx%d) * (%dx%d) -> (%dx%d)\n",
                  h_A.height, h_A.width, h_B.height, h_B.width, h_C.height, h_C.width);
          return;
      }
      if (h_A.height <= 0 || h_A.width <= 0 || h_B.width <= 0) {
          fprintf(stderr, "MatrixMult: all matrix dimensions must be positive\n");
          return;
      }

      cudaEvent_t start, stop;
      checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
      checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

      // Load A and B into device memory
      Matrix d_A;
      d_A.width = h_A.width; d_A.height = h_A.height;
      const size_t sizeA = matrixBytes(h_A);
      checkCuda(cudaMalloc(&d_A.elements, sizeA), "cudaMalloc(A)");
      checkCuda(cudaMemcpy(d_A.elements, h_A.elements, sizeA, cudaMemcpyHostToDevice),
                "cudaMemcpy(A, host to device)");

      Matrix d_B;
      d_B.width = h_B.width; d_B.height = h_B.height;
      const size_t sizeB = matrixBytes(h_B);
      checkCuda(cudaMalloc(&d_B.elements, sizeB), "cudaMalloc(B)");
      checkCuda(cudaMemcpy(d_B.elements, h_B.elements, sizeB, cudaMemcpyHostToDevice),
                "cudaMemcpy(B, host to device)");

      // Allocate C in Device memory
      Matrix d_C;
      d_C.width = h_C.width; d_C.height = h_C.height;
      const size_t sizeC = matrixBytes(h_C);
      checkCuda(cudaMalloc(&d_C.elements, sizeC), "cudaMalloc(C)");

      // Invoke Kernel. Ceil-division so partial tiles at the right/bottom
      // edges are still covered; truncating division would skip them (and
      // would request an invalid zero-sized grid for dimensions < BLOCK_SIZE).
      dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
      dim3 dimGrid((d_B.width + dimBlock.x - 1) / dimBlock.x,
                   (d_A.height + dimBlock.y - 1) / dimBlock.y);

      checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
      MatrixMultKern<<< dimGrid, dimBlock >>>(d_A, d_B, d_C);
      // Launch-configuration errors are only visible through cudaGetLastError.
      checkCuda(cudaGetLastError(), "MatrixMultKern launch");
      checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");

      // Wait for the stop event so the elapsed time is valid; this also
      // surfaces any fault raised while the kernel was running.
      checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

      // Read C from Device to Host
      checkCuda(cudaMemcpy(h_C.elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost),
                "cudaMemcpy(C, device to host)");

      float milliseconds = 0;
      checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
      printf("elapsed time was: %f milliseconds\n", milliseconds);

      // Free Device Memory and timing events
      checkCuda(cudaFree(d_A.elements), "cudaFree(A)");
      checkCuda(cudaFree(d_B.elements), "cudaFree(B)");
      checkCuda(cudaFree(d_C.elements), "cudaFree(C)");
      checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
      checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
  }
  98.  
  // Fill every element of `m` with a pseudo-random integer in [0, modulus).
  static void fillRandom(Matrix m, int modulus)
  {
      for (int i = 0; i < m.height; i++)
          for (int j = 0; j < m.width; j++)
              m.elements[(size_t)i * m.width + j] = (float)(rand() % modulus);
  }

  // Print the top-left corner of `m` (at most 10 x 10 entries), one row per
  // line, followed by a blank line. The preview is clamped to the real
  // dimensions so small matrices are never read out of bounds.
  static void printCorner(const Matrix m)
  {
      const int previewLimit = 10;
      const int rows = m.height < previewLimit ? m.height : previewLimit;
      const int cols = m.width < previewLimit ? m.width : previewLimit;

      for (int i = 0; i < rows; i++) {
          for (int j = 0; j < cols; j++)
              printf("%f ", m.elements[(size_t)i * m.width + j]);
          printf("\n");
      }
      printf("\n");
  }

  // Entry point.
  //
  // Usage: <program> <A rows> <A cols> <B cols>
  // Builds A (rows x cols) with random values in {0,1,2} and B (A cols x B cols)
  // with random values in {0,1}, multiplies them with MatrixMult, and prints
  // the top-left corner of A, B and the product C.
  // Returns 0 on success, 1 on bad arguments or allocation failure.
  int main(int argc, char* argv[]) {
      if (argc < 4) {
          fprintf(stderr, "usage: %s <A rows> <A cols> <B cols>\n",
                  argc > 0 ? argv[0] : "matmul");
          return 1;
      }

      Matrix A, B, C;

      // Read Dimensions of A and B
      A.height = atoi(argv[1]);
      A.width = atoi(argv[2]);
      B.height = A.width;
      B.width = atoi(argv[3]);
      C.height = A.height;
      C.width = B.width;

      // atoi yields 0 for non-numeric input, so this also rejects garbage.
      if (A.height <= 0 || A.width <= 0 || B.width <= 0) {
          fprintf(stderr, "all dimensions must be positive integers\n");
          return 1;
      }

      // Widen to size_t before multiplying to avoid int overflow.
      A.elements = (float*)malloc((size_t)A.width * A.height * sizeof(float));
      B.elements = (float*)malloc((size_t)B.width * B.height * sizeof(float));
      C.elements = (float*)malloc((size_t)C.width * C.height * sizeof(float));
      if (A.elements == NULL || B.elements == NULL || C.elements == NULL) {
          fprintf(stderr, "out of host memory\n");
          free(A.elements);
          free(B.elements);
          free(C.elements);
          return 1;
      }

      // Initialise the whole of A and B: the product reads every element,
      // not just a 10 x 10 corner.
      fillRandom(A, 3);
      fillRandom(B, 2);

      MatrixMult(A, B, C);

      printCorner(A);
      printCorner(B);
      printCorner(C);

      free(A.elements);
      free(B.elements);
      free(C.elements);
      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement