// Untitled

/*
Matt Michels
10/23/11
lab2 #1a

*/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define BLOCK_SIZE 3

#define wA 3   // Matrix A width
#define hA 3   // Matrix A height
#define wB 3   // Matrix B width
#define hB wA  // Matrix B height
#define wC wB  // Matrix C width
#define hC hA  // Matrix C height


// Fill data[0 .. size-1] with pseudo-random whole numbers in the range 1..10.
// Values are drawn from rand() in index order, so the result is determined by
// the current srand() seed. Nothing is written when size <= 0.
void randomInit(float* data, int size)
{
    int idx = 0;
    while (idx < size)
    {
        // rand() % 10 is 0..9; the +1 shifts it to 1..10.
        const int draw = rand() % 10 + 1;
        data[idx] = (float) draw;
        ++idx;
    }
}

// Matrix-multiply kernel: C = A * B, all matrices stored row-major as floats.
//
//   C      - output, (rows of A) x widthB elements
//   A      - left operand, (rows of A) x widthA elements
//   B      - right operand, widthA x widthB elements
//   widthA - number of columns of A (and number of rows of B)
//   widthB - number of columns of B (and of C)
//
// Launch layout: a 2D grid of 2D blocks, one thread per element of C.
// The x dimension indexes columns of C and the y dimension indexes rows.
// Threads whose column falls outside C return without writing.
//
// Precondition: the height of A is not passed in, so rows cannot be
// bounds-checked here. gridDim.y * blockDim.y must not exceed the number of
// rows of A.
//
// The width parameters are deliberately not named wA / wB: this file defines
// object-like macros with those names, which would be substituted into the
// parameter list and break compilation.
__global__ void matrixMul( float* C, const float* A, const float* B, int widthA, int widthB)
{
   // Global coordinates of the C element owned by this thread.
   const int row = blockIdx.y * blockDim.y + threadIdx.y;
   const int col = blockIdx.x * blockDim.x + threadIdx.x;

   // Skip threads to the right of the last column of C.
   if (col >= widthB)
      return;

   // Dot product of row `row` of A with column `col` of B.
   float value = 0.0f;
   for (int i = 0; i < widthA; ++i)
   {
      const float elementA = A[row * widthA + i];
      const float elementB = B[i * widthB + col];
      value += elementA * elementB;
   }

   // C has widthB columns, so its row stride is widthB (not widthA).
   C[row * widthB + col] = value;
}


#ifndef CUDA_CHECK
// Evaluate a CUDA runtime call; on failure print the file, line and the
// runtime's error string to stderr and terminate the program.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// The grid below is sized with truncating integer division, so the result
// matrix must be tiled exactly by BLOCK_SIZE x BLOCK_SIZE blocks; otherwise
// part of C would silently never be computed.
#if (wC % BLOCK_SIZE) != 0 || (hC % BLOCK_SIZE) != 0
#error "wC and hC must be multiples of BLOCK_SIZE"
#endif

// Allocate `bytes` bytes of host memory for a float matrix, terminating the
// program with a message on stderr if the allocation fails.
static float* allocHostMatrix(size_t bytes)
{
    float* buffer = (float*) malloc(bytes);
    if (buffer == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long) bytes);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

// Print `count` floats from `data` under a "Matrix <label>" heading, starting
// a new line after every `width` values.
static void printMatrix(const char* label, const float* data,
                        unsigned int count, unsigned int width)
{
    printf("\n\nMatrix %s\n", label);
    for (unsigned int i = 0; i < count; i++)
    {
       printf("%f ", data[i]);
       if(((i + 1) % width) == 0)
          printf("\n");
    }
}

// Program entry point: builds two random matrices on the host, multiplies
// them on the GPU with matrixMul, and prints the operands and the product.
// Command-line arguments are not used. Returns EXIT_SUCCESS on completion;
// any allocation or CUDA failure terminates the program with EXIT_FAILURE.
int main(int argc, char** argv)
{
    (void) argc;
    (void) argv;

    // Fixed seed so every run produces the same input matrices.
    srand(1020);

    // Host operands.
    unsigned int size_A = wA * hA;
    unsigned int matrix_size_A = sizeof(float) * size_A;
    float* h_A = allocHostMatrix(matrix_size_A);

    unsigned int size_B = wB * hB;
    unsigned int matrix_size_B = sizeof(float) * size_B;
    float* h_B = allocHostMatrix(matrix_size_B);

    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    printMatrix("A", h_A, size_A, wA);
    printMatrix("B", h_B, size_B, wB);

    // Device copies of the operands.
    float* d_A = NULL;
    float* d_B = NULL;
    CUDA_CHECK(cudaMalloc((void**) &d_A, matrix_size_A));
    CUDA_CHECK(cudaMalloc((void**) &d_B, matrix_size_B));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, matrix_size_A, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, matrix_size_B, cudaMemcpyHostToDevice));

    // Result buffers on host and device.
    unsigned int size_C = wC * hC;
    unsigned int matrix_size_C = sizeof(float) * size_C;
    float* h_C = allocHostMatrix(matrix_size_C);

    float* d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**) &d_C, matrix_size_C));

    // One thread per element of C; the #if check above guarantees the
    // divisions are exact.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(wC / threads.x, hC / threads.y);

    //   execute the kernel
    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, wA, wB);
    // A launch does not return a status: configuration errors are reported by
    // cudaGetLastError(), execution errors by the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_C, d_C, matrix_size_C, cudaMemcpyDeviceToHost));

    printMatrix("C (Results)", h_C, size_C, wC);
    printf("\n");

    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return EXIT_SUCCESS;
}