Advertisement
Guest User

A Maz

a guest
Jan 21st, 2013
224
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.20 KB | None | 0 0
#include <cstdlib>
#include <fstream>
#include <iostream>
  3. using namespace std;
  4.  
  5. #define TILE_WIDTH 16
  6.  
  7. void randomFloats(float* data, int size){
  8.     for(int i=0; i<size; i++)
  9.         data[i] = rand() / (float)RAND_MAX;
  10. }
  11.  
  12. // Compute C = A * B
  13. __global__ void matrixMultiplyShared(float * A, float * B, float * C,
  14.                          int numARows, int numAColumns,
  15.                          int numBRows, int numBColumns,
  16.                          int numCRows, int numCColumns) {
  17.     //@@ Insert code to implement matrix multiplication here
  18.     //@@ You have to use shared memory for this MP
  19.     __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
  20.     __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];
  21.  
  22.     int bx = blockIdx.x; int by = blockIdx.y;
  23.     int tx = threadIdx.x; int ty = threadIdx.y;
  24.  
  25.     int row = by * TILE_WIDTH + ty;
  26.     int col = bx * TILE_WIDTH + tx;
  27.  
  28.     float Cvalue = 0.0;
  29.  
  30. // Loop over the M and N tiles required to compute the Pd element
  31.     for (int m = 0; m < (numAColumns-1)/TILE_WIDTH+1; ++m) {
  32.         if(row<numARows && m*TILE_WIDTH+tx < numAColumns){
  33.             ds_A[ty][tx] = A[row*numAColumns + m*TILE_WIDTH+tx];
  34.         } else {
  35.             ds_A[ty][tx] = 0;
  36.         }
  37.         if(m*TILE_WIDTH+ty < numBRows && col < numBColumns){
  38.             ds_B[ty][tx] = B[(m*TILE_WIDTH+ty)*numBColumns+col];
  39.         } else {
  40.             ds_B[ty][tx] = 0;
  41.         }
  42.         __syncthreads();
  43.         if(row < numCRows && col < numCColumns){
  44.             for (int k = 0; k < TILE_WIDTH; ++k)
  45.                 Cvalue += ds_A[ty][k] * ds_B[k][tx];
  46.         }
  47.         __syncthreads();
  48.     }
  49.     if(row < numCRows && col < numCColumns)
  50.         C[row*numCColumns+col] = Cvalue;
  51. }
  52.  
  53. int main(int argc, char ** argv) {
  54.     float * hostA; // The A matrix
  55.     float * hostB; // The B matrix
  56.     float * hostC; // The output C matrix
  57.     float * deviceA;
  58.     float * deviceB;
  59.     float * deviceC;
  60.     int numARows; // number of rows in the matrix A
  61.     int numAColumns; // number of columns in the matrix A
  62.     int numBRows; // number of rows in the matrix B
  63.     int numBColumns; // number of columns in the matrix B
  64.     int numCRows; // number of rows in the matrix C
  65.     int numCColumns; // number of columns in the matrix C
  66.     ofstream fout("output.raw");
  67.  
  68.     numARows = numAColumns = 4096;
  69.     numBRows = numBColumns = 4096;
  70.     hostA = (float*)malloc(numARows*numAColumns * sizeof(float));
  71.     hostB = (float*)malloc(numBRows*numBColumns * sizeof(float));
  72.  
  73.     randomFloats(hostA, numARows*numAColumns);
  74.     randomFloats(hostB, numBRows*numBColumns);
  75.  
  76.     //@@ Set numCRows and numCColumns
  77.     numCRows = numARows;
  78.     numCColumns = numBColumns;
  79.     //@@ Allocate the hostC matrix
  80.     hostC = (float*)malloc(numCColumns * numCRows * sizeof(float));
  81.  
  82.  
  83.     //@@ Allocate GPU memory here
  84.     cudaMalloc((void**)&deviceA, numARows * numAColumns * sizeof(float));
  85.     cudaMalloc((void**)&deviceB, numBRows * numBColumns * sizeof(float));
  86.     cudaMalloc((void**)&deviceC, numCRows * numCColumns * sizeof(float));
  87.  
  88.     //@@ Copy memory to the GPU here
  89.     cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice);
  90.     cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice);
  91.  
  92.    
  93.     //@@ Initialize the grid and block dimensions here
  94.     dim3 dimBlock(16, 16);
  95.     dim3 dimGrid((numBColumns + dimBlock.x - 1) / dimBlock.x,
  96.        (numARows + dimBlock.y - 1) / dimBlock.y);
  97.    
  98.     //@@ Launch the GPU Kernel here
  99.     matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
  100.                                         numARows, numAColumns,
  101.                                         numBRows, numBColumns,
  102.                                         numCRows, numCColumns);
  103.     cudaThreadSynchronize();
  104.     //@@ Copy the GPU memory back to the CPU here
  105.     cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost);
  106.  
  107.     //@@ Free the GPU memory here
  108.     cudaFree(deviceA);
  109.     cudaFree(deviceB);
  110.     cudaFree(deviceC);
  111.     /*
  112.     for(int i=0; i<numCRows; i++){
  113.         for(int j=0; j<numCColumns; j++){
  114.             fout << hostC[i*numCColumns+j] << " ";
  115.         }
  116.         fout << "\n";
  117.     }
  118.     */
  119.     free(hostA);
  120.     free(hostB);
  121.     free(hostC);
  122.  
  123.     return 0;
  124. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement