#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool sharedMem);
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y);
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y);
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width);
int main()
{
    bool match;
    int i, j, k, l, amount, m_x, m_y, n_x, n_y, sizeM, sizeN, sizeR;
    cudaError_t cudaStatus;
    float *matrM;
    float *matrN;
    float *matrR;
    float *matrVrfy;
    bool shared[2] = { false, true };
    int tile_width[3] = { 16, 32, 64 }; // note: a 64x64 block is 4096 threads, above the usual 1024-threads-per-block limit
    int amountTiles = 1; //TODO: Change me back to three
    int dimensions[5][4] = { { 300, 1000, 1000, 500 },
                             { 30, 50, 50, 40 },
                             { 1000, 1000, 1000, 1000 },
                             { 50, 10000, 10000, 80 },
                             { 5000, 40, 40, 7000 } };
    int amountDimensions = 1;

    srand(time(NULL));

    for (i = 0; i < amountDimensions; i++)
    {
        m_x = dimensions[i][0];
        m_y = dimensions[i][1];
        n_x = dimensions[i][2];
        n_y = dimensions[i][3];

        // Allocate memory for the one-dimensional representation of the 2D arrays
        sizeM = m_x * m_y; // m_x rows and m_y columns, e.g. 300 x 1000
        sizeN = n_x * n_y; // n_x rows and n_y columns, e.g. 1000 x 500
        sizeR = m_x * n_y;
        matrM = (float*)malloc(sizeM * sizeof(float));
        matrN = (float*)malloc(sizeN * sizeof(float));
        matrR = (float*)malloc(sizeR * sizeof(float));
        matrVrfy = (float*)malloc(sizeR * sizeof(float));

        // Fill both input matrices with random values
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX / 1000.0); // random floats between 0 and 1000
        }
        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX / 1000.0); // random floats between 0 and 1000
        }

        /*float tmp = 1.0;
        // Fill both matrices with increasing values (useful for debugging)
        for (j = 0; j < sizeM; j++)
        {
            matrM[j] = tmp;
            tmp++;
        }
        tmp = 1.0;
        for (j = 0; j < sizeN; j++)
        {
            matrN[j] = tmp;
            tmp++;
        }*/

        // Compute the result sequentially on the CPU to verify the GPU result
        auto start = std::chrono::high_resolution_clock::now();
        sequentialMul(matrVrfy, matrM, matrN, m_x, m_y, n_x, n_y);
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diff = end - start;
        printf("Matrix multiplication with M: %dx%d, N: %dx%d:\n", m_x, m_y, n_x, n_y);
        printf("CPU time: %f ms\n", diff.count() * 1000);

        for (j = 0; j < amountTiles; j++)
        {
            for (k = 0; k < 2; k++)
            {
                // Multiply both matrices on the GPU
                cudaStatus = matrMulCuda(matrR, matrM, matrN, m_x, m_y, n_x, n_y, tile_width[j], shared[k]);
                if (cudaStatus != cudaSuccess)
                {
                    fprintf(stderr, "matrMulCuda failed!");
                    return 1;
                }

                // Compare against the CPU result with a relative tolerance,
                // since GPU rounding (e.g. FMA contraction) differs slightly from the CPU
                match = true;
                amount = m_x * n_y;
                for (l = 0; l < amount; l++)
                {
                    //printf("(l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);
                    if (fabsf(matrR[l] - matrVrfy[l]) > 1e-3f * fabsf(matrVrfy[l]))
                    {
                        printf("Result does not match! (l, R, Vrfy: %d, %f, %f)\n", l, matrR[l], matrVrfy[l]);
                        match = false;
                        break;
                    }
                }
                if (match)
                {
                    //printf("Result matches sequential calculation.\n");
                }

                // Reset the result buffer for the next run
                free(matrR);
                matrR = (float*)malloc(sizeR * sizeof(float));
            }
            printf("\n");
        }
        printf("\n");

        // Free host memory
        free(matrM);
        free(matrN);
        free(matrR);
        free(matrVrfy);
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    //printf("\n");
    //printf("Press Enter to exit program...");
    //getchar();
    return 0;
}
// Naive kernel: one thread computes one element of the result matrix directly from global memory.
__global__ void mulKernel(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int i;

    if ((row < m_x) && (col < n_y))
    {
        float tmp = 0.0;
        for (i = 0; i < m_y; i++)
        {
            tmp += matrM[row * m_y + i] * matrN[col + n_y * i];
        }
        matrR[row * n_y + col] = tmp;
    }
}
// Tiled kernel: each block stages tile_width x tile_width sub-blocks of M and N
// in dynamically allocated shared memory before accumulating the dot product.
__global__ void mulKernelSM(float *matrR, const float *matrM, const float *matrN, const int m_x, const int m_y, const int n_x, const int n_y, const int tile_width)
{
    int i, j;
    extern __shared__ float shared[];
    float *matrM_sm = shared;
    float *matrN_sm = &shared[tile_width * tile_width];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by * tile_width + ty;
    int col = bx * tile_width + tx;
    float tmp = 0.0; // accumulator for the full dot product, carried across all tiles
    int limit = ceil(m_y / (float) tile_width);

    for (i = 0; i < limit; i++)
    {
        // Load one tile of M and one tile of N, padding with zeros at the matrix edges
        if (i * tile_width + tx < m_y && row < m_x)
            matrM_sm[ty * tile_width + tx] = matrM[row * m_y + (i * tile_width + tx)];
        else
            matrM_sm[ty * tile_width + tx] = 0.0;
        if (i * tile_width + ty < n_x && col < n_y)
            matrN_sm[ty * tile_width + tx] = matrN[col + (i * tile_width + ty) * n_y];
        else
            matrN_sm[ty * tile_width + tx] = 0.0;
        __syncthreads();

        for (j = 0; j < tile_width; j++)
            tmp += matrM_sm[ty * tile_width + j] * matrN_sm[j * tile_width + tx];
        __syncthreads();
    }

    if (row < m_x && col < n_y)
        matrR[row * n_y + col] = tmp;
}
// Helper function for using CUDA to multiply matrices M and N.
cudaError_t matrMulCuda(float *matrR, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y, int tile_width, bool sharedMem)
{
    float *dev_matrR = 0;
    float *dev_matrM = 0;
    float *dev_matrN = 0;
    cudaError_t cudaStatus;
    cudaEvent_t start, stop;
    float elapsed = 0;
    dim3 dim_grid, dim_block;
    int sizeR = m_x * n_y;
    int sizeM = m_x * m_y;
    int sizeN = n_x * n_y;
    int shared = 2 * tile_width * tile_width * sizeof(float); // dynamic shared memory: one tile of M plus one tile of N

    dim_block.x = tile_width;
    dim_block.y = tile_width;
    dim_block.z = 1;
    dim_grid.x = (n_y + dim_block.x - 1) / tile_width;
    dim_grid.y = (m_x + dim_block.y - 1) / tile_width;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices.
    cudaStatus = cudaMalloc((void**)&dev_matrR, sizeR * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_matrM, sizeM * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_matrN, sizeN * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrM, matrM, sizeM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_matrN, matrN, sizeN * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    if (sharedMem)
    {
        cudaEventRecord(start);
        // Launch the tiled kernel on the GPU with one thread for each output element.
        mulKernelSM<<<dim_grid, dim_block, shared>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y, tile_width);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }
    else
    {
        cudaEventRecord(start);
        // Launch the naive kernel on the GPU with one thread for each output element.
        mulKernel<<<dim_grid, dim_block>>>(dev_matrR, dev_matrM, dev_matrN, m_x, m_y, n_x, n_y);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    }
    cudaEventElapsedTime(&elapsed, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (sharedMem)
    {
        printf("GPU time with SM: %f ms, tile_width: %d\n", elapsed, tile_width);
    }
    else
    {
        printf("GPU time without SM: %f ms, tile_width: %d\n", elapsed, tile_width);
    }

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "mulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching mulKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output matrix from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(matrR, dev_matrR, sizeR * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrR);
    cudaFree(dev_matrM);
    cudaFree(dev_matrN);
    return cudaStatus;
}
// CPU reference implementation used to verify the GPU results.
void sequentialMul(float *matrVrfy, const float *matrM, const float *matrN, int m_x, int m_y, int n_x, int n_y)
{
    int i, j, k, curPos;
    float tmp;

    for (i = 0; i < m_x; i++)
    {
        for (j = 0; j < n_y; j++)
        {
            curPos = i * n_y + j;
            tmp = 0.0;
            for (k = 0; k < m_y; k++)
            {
                tmp += matrM[i * m_y + k] * matrN[j + n_y * k];
            }
            matrVrfy[curPos] = tmp;
        }
    }
}
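
// Build/run sketch, assuming the code above is saved as matrixMul.cu; the file name,
// output name, and flags below are only examples and may need adjusting for your GPU:
//   nvcc -O2 matrixMul.cu -o matrixMul
//   ./matrixMul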