Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- using namespace std;
- #define TILE_WIDTH 16
// Fill the first `size` slots of `data` with pseudo-random floats in
// [0, 1], drawn from the C library rand() generator (seeded, if at all,
// by the caller via srand()).
void randomFloats(float* data, int size){
    int remaining = size;
    float* cursor = data;
    while (remaining-- > 0) {
        *cursor++ = rand() / (float)RAND_MAX;
    }
}
// Compute C = A * B with shared-memory tiling.
// Expected launch geometry: blockDim = (TILE_WIDTH, TILE_WIDTH),
// gridDim = (ceil(numCColumns/TILE_WIDTH), ceil(numCRows/TILE_WIDTH)).
// Dimensions that are not multiples of TILE_WIDTH are handled by
// zero-padding the staged tiles, so the dot product is unaffected.
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    int row = by * TILE_WIDTH + ty;   // global row of C this thread owns
    int col = bx * TILE_WIDTH + tx;   // global column of C this thread owns

    float Cvalue = 0.0f;  // float literal — avoid silent double promotion

    // Walk the tiles along A's columns / B's rows.
    for (int m = 0; m < (numAColumns - 1) / TILE_WIDTH + 1; ++m) {
        // Stage one tile of A and one of B, zero-padding out-of-range
        // elements; branchless selects avoid divergent load paths.
        ds_A[ty][tx] = (row < numARows && m * TILE_WIDTH + tx < numAColumns)
                           ? A[row * numAColumns + m * TILE_WIDTH + tx]
                           : 0.0f;
        ds_B[ty][tx] = (m * TILE_WIDTH + ty < numBRows && col < numBColumns)
                           ? B[(m * TILE_WIDTH + ty) * numBColumns + col]
                           : 0.0f;
        __syncthreads();  // tiles fully written before any thread reads them

        // Tiles are zero-padded, so every thread may accumulate without a
        // bounds guard (the original guarded here, causing warp divergence);
        // out-of-range threads compute a value that is simply never stored.
        #pragma unroll
        for (int k = 0; k < TILE_WIDTH; ++k)
            Cvalue += ds_A[ty][k] * ds_B[k][tx];
        __syncthreads();  // all reads done before the next tile overwrites
    }

    if (row < numCRows && col < numCColumns)
        C[row * numCColumns + col] = Cvalue;
}
// Host driver: builds two random 4096x4096 matrices, multiplies them on
// the GPU with matrixMultiplyShared, and copies the product back.
// The commented-out dump of C to "output.raw" is kept for optional
// verification, as in the original.
int main(int argc, char ** argv) {
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C
    int numCColumns; // number of columns in the matrix C
    ofstream fout("output.raw");

    numARows = numAColumns = 4096;
    numBRows = numBColumns = 4096;

    hostA = (float*)malloc(numARows * numAColumns * sizeof(float));
    hostB = (float*)malloc(numBRows * numBColumns * sizeof(float));
    if (hostA == NULL || hostB == NULL) {
        cerr << "Host allocation of A/B failed" << endl;
        free(hostA); free(hostB);   // free(NULL) is a no-op
        return 1;
    }
    randomFloats(hostA, numARows * numAColumns);
    randomFloats(hostB, numBRows * numBColumns);

    // C inherits A's row count and B's column count.
    numCRows = numARows;
    numCColumns = numBColumns;
    hostC = (float*)malloc(numCRows * numCColumns * sizeof(float));
    if (hostC == NULL) {
        cerr << "Host allocation of C failed" << endl;
        free(hostA); free(hostB);
        return 1;
    }

    // Device allocations — check each call; the original ignored failures,
    // which would surface later as a cryptic memcpy/kernel error.
    cudaError_t err = cudaMalloc((void**)&deviceA, numARows * numAColumns * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&deviceB, numBRows * numBColumns * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&deviceC, numCRows * numCColumns * sizeof(float));
    if (err != cudaSuccess) {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
        cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC);
        free(hostA); free(hostB); free(hostC);
        return 1;
    }

    cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice);

    // Block shape must match the kernel's TILE_WIDTH assumption; the
    // original hard-coded 16 here, which would silently break if
    // TILE_WIDTH were ever changed.
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((numCColumns + dimBlock.x - 1) / dimBlock.x,
                 (numCRows    + dimBlock.y - 1) / dimBlock.y);

    matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                                numARows, numAColumns,
                                                numBRows, numBColumns,
                                                numCRows, numCColumns);
    err = cudaGetLastError();           // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();  // replaces deprecated cudaThreadSynchronize()
    if (err != cudaSuccess)
        cerr << "Kernel execution failed: " << cudaGetErrorString(err) << endl;

    cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    /*
    for(int i=0; i<numCRows; i++){
        for(int j=0; j<numCColumns; j++){
            fout << hostC[i*numCColumns+j] << " ";
        }
        fout << "\n";
    }
    */

    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement