Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "time.h"
- #include "stdio.h"
- #include "stdlib.h"
- #include "assert.h"
- #include "cuda.h"
/* Shorthand for the calling thread's x / y coordinates inside its block.
 * Only meaningful where the CUDA built-in `threadIdx` is in scope. */
#define tx threadIdx.x
#define ty threadIdx.y

/* Selector constants; not referenced by the code visible in this file.
 * NOTE(review): presumably kernel-flavour and memory-model switches, judging
 * by the names only -- confirm before relying on them. */
#define KER_MONO 0
#define KER_GRID 1
#define MEM_STD 0
#define MEM_UNI 1

/* Edge length of the square shared-memory tiles used by the tiled kernel and
 * the block edge passed to exec_cuda_tiled() from main(). */
#define KER_SIZE 32

/* Problem size: op1 is OP1_ROWS x OP1_COLS, op2 is OP1_COLS x OP2_COLS and the
 * result is OP1_ROWS x OP2_COLS. */
const int OP1_ROWS = 2000;
const int OP1_COLS = 500;
const int OP2_COLS = 2000;

/* Element count of op1. The drivers also use it to size op2, which holds
 * because OP1_ROWS equals OP2_COLS with the values above. */
const int SIZE = OP1_ROWS * OP1_COLS;

/* Element count of the result matrix. */
const int ACC_SIZE = OP1_ROWS * OP2_COLS;
- ////////////////////////////DEVICE CODE HERE////////////////////////////////////
- //////////////////////////////////////////////////////////////////////////////
/**
 * Kernel: stores `value` into buffer[0 .. len-1].
 *
 * Each thread handles at most one element, selected from the x components of
 * its block and thread indices; threads whose index is >= len do nothing, so
 * the launch must provide at least `len` threads along x to cover the buffer.
 *
 * @param buffer  destination array (must be device-accessible)
 * @param value   value written to every covered element
 * @param len     number of elements to initialise
 */
__global__
void cudaFill(float *buffer, float value, const int len) //aka matrixInit
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(idx >= len)
    {
        return; // surplus thread in the last block
    }
    buffer[idx] = value;
}
/**
 * Kernel: naive dense matrix product acc = op1 * op2 on row-major buffers.
 *
 * Layout: op1 is op1Rows x dim, op2 is dim x op2Cols, acc is op1Rows x op2Cols.
 * Rows are distributed over the x dimension of the launch and columns over
 * the y dimension; both loops advance by the total number of threads along
 * their dimension, so any 2D launch configuration covers the whole result.
 *
 * Fix over the previous revision: op2 and acc are now indexed with their real
 * row length (op2Cols). The old code used `dim` for op2 and `op1Rows` for acc,
 * which is only correct when the operands are uniformly filled and
 * op1Rows == op2Cols.
 *
 * NOTE(review): because threadIdx.x selects the row, neighbouring threads
 * along x touch addresses `dim` / `op2Cols` elements apart. Mapping x to
 * columns would give contiguous accesses; left unchanged here to keep the
 * launch semantics as they were.
 *
 * @param op1      left operand, op1Rows x dim
 * @param op2      right operand, dim x op2Cols
 * @param acc      output, op1Rows x op2Cols
 * @param dim      shared (inner) dimension
 * @param op1Rows  rows of op1 and acc
 * @param op2Cols  columns of op2 and acc
 */
__global__
void cudaMatrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    const int rowStride = blockDim.x * gridDim.x;
    const int colStride = blockDim.y * gridDim.y;

    for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < op1Rows; i += rowStride)
    {
        for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < op2Cols; j += colStride)
        {
            float scalarProduct = 0.0f;
            for(int k = 0; k < dim; k++)
            {
                // Row k of op2 starts at k * op2Cols (op2 has op2Cols columns).
                scalarProduct += op1[i * dim + k] * op2[k * op2Cols + j];
            }
            // Row i of acc starts at i * op2Cols (acc has op2Cols columns).
            acc[i * op2Cols + j] = scalarProduct;
        }
    }
}
/**
 * Kernel: shared-memory tiled matrix product acc = op1 * op2 (row-major).
 *
 * Layout: op1 is op1Rows x dim, op2 is dim x op2Cols, acc is op1Rows x op2Cols.
 *
 * Launch requirements:
 *   - blockDim must be exactly KER_SIZE x KER_SIZE (x = columns, y = rows);
 *     the tile loads and the inner product index the shared tiles with
 *     tx / ty and assume that shape. Checked with a device-side assert.
 *   - static shared memory: two KER_SIZE x KER_SIZE float tiles.
 *   - any 2D grid works: each block walks over output tiles, advancing by the
 *     grid extent in each dimension.
 *
 * Fixes over the previous revision:
 *   - the shared tiles are float (they were int, truncating the operands);
 *   - the op1 tile load uses the tile index `p`; it used the inner-product
 *     counter `k`, which is KER_SIZE after the first tile and therefore read
 *     the wrong (possibly out-of-range) elements;
 *   - the outer loops are driven by the block's base row / column, so every
 *     thread of a block runs the same number of iterations and reaches each
 *     __syncthreads(); the previous `n + n % stride` limits did not
 *     guarantee that.
 *
 * @param op1      left operand, op1Rows x dim
 * @param op2      right operand, dim x op2Cols
 * @param acc      output, op1Rows x op2Cols
 * @param dim      shared (inner) dimension
 * @param op1Rows  rows of op1 and acc
 * @param op2Cols  columns of op2 and acc
 */
__global__
void cudaTiledMatrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    __shared__ float s_op1[KER_SIZE][KER_SIZE];
    __shared__ float s_op2[KER_SIZE][KER_SIZE];

    // The tile indexing below is only valid for a KER_SIZE x KER_SIZE block.
    assert(blockDim.x == KER_SIZE && blockDim.y == KER_SIZE);

    const int rowStride = blockDim.y * gridDim.y;
    const int colStride = blockDim.x * gridDim.x;
    const int tileCount = (dim + KER_SIZE - 1) / KER_SIZE; // ceil(dim / KER_SIZE)

    // Loop conditions depend only on block-wide values, keeping the trip
    // count uniform across the block (required by __syncthreads()).
    for(int rowBase = blockIdx.y * blockDim.y; rowBase < op1Rows; rowBase += rowStride)
    {
        const int row = rowBase + ty;
        for(int colBase = blockIdx.x * blockDim.x; colBase < op2Cols; colBase += colStride)
        {
            const int col = colBase + tx;
            float scalarProduct = 0.0f;

            for(int p = 0; p < tileCount; p++)
            {
                const int op1Col = p * KER_SIZE + tx; // column of op1 loaded by this thread
                const int op2Row = p * KER_SIZE + ty; // row of op2 loaded by this thread

                // Out-of-range positions are padded with zero so they do not
                // contribute to the partial sums.
                s_op1[ty][tx] = (row < op1Rows && op1Col < dim) ? op1[row * dim + op1Col] : 0.0f;
                s_op2[ty][tx] = (op2Row < dim && col < op2Cols) ? op2[op2Row * op2Cols + col] : 0.0f;

                __syncthreads(); // both tiles fully written before anyone reads them

                for(int k = 0; k < KER_SIZE; k++)
                {
                    scalarProduct += s_op1[ty][k] * s_op2[k][tx];
                }

                __syncthreads(); // everyone done reading before the next overwrite
            }

            if(row < op1Rows && col < op2Cols)
            {
                acc[row * op2Cols + col] = scalarProduct;
            }
        }
    }
}
- //////////////////////////////////////////////////////////////////////////////
- ////////////////////////////HOST CODE HERE////////////////////////////////////
- //////////////////////////////////////////////////////////////////////////////
void matrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols);
void fill(float *buffer, const float value, const int len);
void assertfy(float *buffer, const float value, const int len);
void printUpTo(float *buffer, const int len);

/**
 * Host reference implementation of the dense matrix product acc = op1 * op2.
 *
 * All buffers are row-major: op1 is op1Rows x dim, op2 is dim x op2Cols and
 * acc is op1Rows x op2Cols. Every element of acc is overwritten.
 *
 * Fix over the previous revision: op2 and acc are indexed with their actual
 * row length (op2Cols). The old code used `dim` for op2 and `op1Rows` for acc,
 * which produced wrong results for non-uniform data or non-square results.
 *
 * @param op1      left operand, op1Rows x dim
 * @param op2      right operand, dim x op2Cols
 * @param acc      output, op1Rows x op2Cols
 * @param dim      shared (inner) dimension
 * @param op1Rows  rows of op1 and acc
 * @param op2Cols  columns of op2 and acc
 */
void matrixMultiplication(float *op1, float *op2, float *acc, const int dim, const int op1Rows, const int op2Cols)
{
    for(int i = 0; i < op1Rows; i++)
    {
        const float *op1Row = op1 + i * dim;   // start of row i in op1
        float *accRow = acc + i * op2Cols;     // start of row i in acc
        for(int j = 0; j < op2Cols; j++)
        {
            float scalarProduct = 0.0f;
            for(int k = 0; k < dim; k++)
            {
                scalarProduct += op1Row[k] * op2[k * op2Cols + j];
            }
            accRow[j] = scalarProduct;
        }
    }
}
/**
 * Host-side fill: writes `value` into buffer[0 .. len-1].
 * Nothing is written when len is zero or negative.
 *
 * @param buffer  destination array
 * @param value   value to store
 * @param len     number of elements to write
 */
void fill(float *buffer, const float value, const int len)
{
    float *cursor = buffer;
    int remaining = len;
    while(remaining-- > 0)
    {
        *cursor++ = value;
    }
}
/**
 * Scans buffer[0 .. len-1] for the first element that differs from `value`
 * (exact floating-point comparison). On a mismatch it prints the offending
 * element and returns immediately; if every element matches it returns
 * silently.
 *
 * Note: despite the name this never aborts -- the assert below is only
 * reached for elements that already compared equal, so it cannot fail.
 *
 * @param buffer  array to check (must be readable from the host)
 * @param value   expected value of every element
 * @param len     number of elements to check
 */
void assertfy(float *buffer, const float value, const int len)
{
    for(int idx = 0; idx < len; ++idx)
    {
        const float found = buffer[idx];
        if(found != value)
        {
            printf("[x] Illegal Found: %1.2f\n", found);
            return;
        }
        assert(found == value);
    }
}
/**
 * Prints the first `len` elements of `buffer` to stdout on a single line,
 * each formatted with two decimals and followed by a tab, then a newline.
 *
 * @param buffer  array to print (must be readable from the host)
 * @param len     number of leading elements to print
 */
void printUpTo(float *buffer, const int len)
{
    const float *cursor = buffer;
    int remaining = len;
    while(remaining-- > 0)
    {
        printf("%1.2f\t", *cursor++);
    }
    printf("\n");
}
- //////////////////////////////////////////////////////////////////////////////
- //////////////////////EXECUTION TYPE HERE/////////////////////////////////////
- //////////////////////////////////////////////////////////////////////////////
void exec_host_only();
void exec_cuda_std(const int ky, const int kx);
void exec_cuda_tiled(const int ky, const int kx);

/**
 * CPU-only run: allocates the operands and the result on the host heap,
 * fills op1 with 1.0 and op2 with 2.0, multiplies them with
 * matrixMultiplication(), prints the elapsed CPU time of fill + multiply,
 * checks every result element against 1000.0 and prints the first ten.
 *
 * Change over the previous revision: the malloc results are checked; on
 * allocation failure the function reports the problem and returns instead of
 * dereferencing a null pointer.
 */
void exec_host_only()
{
    float *op1 = (float*) malloc(SIZE * sizeof(float));
    float *op2 = (float*) malloc(SIZE * sizeof(float));
    float *acc = (float*) malloc(ACC_SIZE * sizeof(float));

    if(op1 == NULL || op2 == NULL || acc == NULL)
    {
        fprintf(stderr, "[x] Host allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe.
        free(op1);
        free(op2);
        free(acc);
        return;
    }

    const clock_t begin = clock();
    fill(op1, 1.0f, SIZE);
    fill(op2, 2.0f, SIZE);
    matrixMultiplication(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);
    const clock_t end = clock();

    const float elapsedTime = (float)(end - begin) / CLOCKS_PER_SEC;
    printf("Elapsed Time: %1.2f\n", elapsedTime);

    // Each result element is the sum of OP1_COLS (500) products 1.0 * 2.0.
    assertfy(acc, 1000.0f, ACC_SIZE);
    printUpTo(acc, 0xa);

    free(op1);
    free(op2);
    free(acc);
}
/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool stdRunCudaOk(cudaError_t status, const char *what)
{
    if(status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "[x] %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * GPU run using the naive kernel (cudaMatrixMultiplication) on managed memory.
 *
 * Allocates op1, op2 and acc with cudaMallocManaged, fills op1 with 1.0 and
 * op2 with 2.0 on the device, launches the multiplication on a 32 x 32 grid,
 * waits for completion, then checks every result element against 1000.0 and
 * prints the first ten from the host.
 *
 * Changes over the previous revision:
 *   - every CUDA runtime call and kernel launch is checked; on failure the
 *     error is reported and the remaining steps are skipped (buffers are
 *     still released);
 *   - the fill grid size uses integer ceil-division instead of float ceil();
 *   - unsigned dim3 members are printed with %u;
 *   - locals no longer reuse the names of the device built-ins
 *     blockDim / gridDim.
 *
 * @param ky  block extent passed as the x dimension of the launch (the naive
 *            kernel maps x to result rows)
 * @param kx  block extent passed as the y dimension of the launch (the naive
 *            kernel maps y to result columns)
 */
void exec_cuda_std(const int ky, const int kx)
{
    float *op1 = NULL,
          *op2 = NULL,
          *acc = NULL;

    const int fillThreads = 0x400;
    const int fillBlocks = (SIZE + fillThreads - 1) / fillThreads; // ceil(SIZE / fillThreads)
    const dim3 mulBlock(ky, kx, 0x1);
    const dim3 mulGrid(0x20, 0x20, 0x1);

    bool ok = stdRunCudaOk(cudaMallocManaged((void**) &op1, SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op1)")
           && stdRunCudaOk(cudaMallocManaged((void**) &op2, SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op2)")
           && stdRunCudaOk(cudaMallocManaged((void**) &acc, ACC_SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(acc)");

    if(ok)
    {
        cudaFill<<<fillBlocks, fillThreads>>>(op1, 1.0f, SIZE);
        cudaFill<<<fillBlocks, fillThreads>>>(op2, 2.0f, SIZE);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = stdRunCudaOk(cudaGetLastError(), "cudaFill launch");
    }

    if(ok)
    {
        printf("[*] gridDim(%u, %u)\n", mulGrid.x, mulGrid.y);
        printf("[*] blockDim(%u, %u)\n", mulBlock.x, mulBlock.y);
        cudaMatrixMultiplication<<<mulGrid, mulBlock>>>(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);
        // The synchronize both surfaces execution errors and makes the managed
        // result safe to read from the host.
        ok = stdRunCudaOk(cudaGetLastError(), "cudaMatrixMultiplication launch")
          && stdRunCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if(ok)
    {
        // Each result element is the sum of OP1_COLS (500) products 1.0 * 2.0.
        assertfy(acc, 1000.0f, ACC_SIZE);
        printUpTo(acc, 0xa);
    }

    // cudaFree on a null pointer performs no operation, so partial
    // allocations are handled by the same cleanup path.
    cudaFree(op1);
    cudaFree(op2);
    cudaFree(acc);
}
/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool tiledRunCudaOk(cudaError_t status, const char *what)
{
    if(status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "[x] %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * GPU run using the shared-memory tiled kernel (cudaTiledMatrixMultiplication)
 * on managed memory.
 *
 * Allocates op1, op2 and acc with cudaMallocManaged, fills op1 with 1.0 and
 * op2 with 2.0 on the device, launches the tiled multiplication, waits for
 * completion, then checks every result element and prints the first ten from
 * the host.
 *
 * Changes over the previous revision:
 *   - the expected result is 1000.0 (OP1_COLS = 500 products of 1.0 * 2.0),
 *     matching the other two drivers; it was 100.0, which can never match;
 *   - the block is built as (kx, ky) because the tiled kernel maps x to
 *     columns and y to rows, and the shape is validated against KER_SIZE,
 *     the only tile size the kernel's shared arrays support;
 *   - the grid is sized by ceil-division to cover the result once instead of
 *     a fixed 512 x 512 blocks (so the printed gridDim changes accordingly);
 *   - every CUDA runtime call and kernel launch is checked; on failure the
 *     error is reported and the remaining steps are skipped (buffers are
 *     still released);
 *   - unsigned dim3 members are printed with %u, and locals no longer reuse
 *     the names of the device built-ins blockDim / gridDim.
 *
 * @param ky  block extent along y (rows); must equal KER_SIZE
 * @param kx  block extent along x (columns); must equal KER_SIZE
 */
void exec_cuda_tiled(const int ky, const int kx)
{
    if(kx != KER_SIZE || ky != KER_SIZE)
    {
        fprintf(stderr, "[x] Tiled kernel requires a %dx%d block, got %dx%d\n", KER_SIZE, KER_SIZE, kx, ky);
        return;
    }

    float *op1 = NULL,
          *op2 = NULL,
          *acc = NULL;

    const int fillThreads = 0x400;
    const int fillBlocks = (SIZE + fillThreads - 1) / fillThreads; // ceil(SIZE / fillThreads)
    const dim3 mulBlock(kx, ky, 0x1);
    // One block per KER_SIZE x KER_SIZE output tile, rounded up at the edges.
    const dim3 mulGrid((OP2_COLS + kx - 1) / kx, (OP1_ROWS + ky - 1) / ky, 0x1);

    bool ok = tiledRunCudaOk(cudaMallocManaged((void**) &op1, SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op1)")
           && tiledRunCudaOk(cudaMallocManaged((void**) &op2, SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(op2)")
           && tiledRunCudaOk(cudaMallocManaged((void**) &acc, ACC_SIZE * sizeof(float), cudaMemAttachGlobal), "cudaMallocManaged(acc)");

    if(ok)
    {
        cudaFill<<<fillBlocks, fillThreads>>>(op1, 1.0f, SIZE);
        cudaFill<<<fillBlocks, fillThreads>>>(op2, 2.0f, SIZE);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = tiledRunCudaOk(cudaGetLastError(), "cudaFill launch");
    }

    if(ok)
    {
        printf("[*] gridDim(%u, %u)\n", mulGrid.x, mulGrid.y);
        printf("[*] blockDim(%u, %u)\n", mulBlock.x, mulBlock.y);
        cudaTiledMatrixMultiplication<<<mulGrid, mulBlock>>>(op1, op2, acc, OP1_COLS, OP1_ROWS, OP2_COLS);
        // The synchronize both surfaces execution errors and makes the managed
        // result safe to read from the host.
        ok = tiledRunCudaOk(cudaGetLastError(), "cudaTiledMatrixMultiplication launch")
          && tiledRunCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if(ok)
    {
        assertfy(acc, 1000.0f, ACC_SIZE);
        printUpTo(acc, 0xa);
    }

    // cudaFree on a null pointer performs no operation, so partial
    // allocations are handled by the same cleanup path.
    cudaFree(op1);
    cudaFree(op2);
    cudaFree(acc);
}
- //////////////////////////////////////////////////////////////////////////////
/**
 * Program entry point: runs the tiled GPU variant with a
 * KER_SIZE x KER_SIZE thread block. Command-line arguments are ignored.
 */
int main(int argc, char const *argv[])
{
    (void) argc;
    (void) argv;

    exec_cuda_tiled(KER_SIZE, KER_SIZE);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement