// Untitled: CUDA matrix-multiplication timing example.

#define BLOCK_SIZE 32
#define MATRIX_SIZE 4096

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <sys/time.h>

/**
 * Selects the first device that reports compute capability major >= 1 and
 * makes it the current CUDA device.
 *
 * Diagnostics are written to stderr; a success message is written to stdout.
 *
 * @return true if a device was found and selected, false otherwise
 *         (no devices, device enumeration failed, or cudaSetDevice failed).
 */
bool InitCUDA(void)
{
        int count = 0;
        cudaError_t err = cudaGetDeviceCount(&count);
        if(err != cudaSuccess) {
                // Surface the runtime's reason (e.g. missing driver) instead of
                // silently treating the failure as "zero devices".
                fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        }
        if(err != cudaSuccess || count == 0) {
                fprintf(stderr, "There is no device.\n");
                return false;
        }

        // Pick the first device whose properties can be queried and whose
        // reported major compute capability is at least 1.
        int device = -1;
        for(int i = 0; i < count; i++) {
                cudaDeviceProp prop;
                if(cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
                        device = i;
                        break;
                }
        }
        if(device < 0) {
                fprintf(stderr, "There is no device supporting CUDA.\n");
                return false;
        }

        err = cudaSetDevice(device);
        if(err != cudaSuccess) {
                // Do not report success if the device could not actually be selected.
                fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", device, cudaGetErrorString(err));
                return false;
        }

        printf("CUDA initialized.\n");
        return true;
}

/**
 * Naive dense matrix multiply: Pd = Md * Nd for square, row-major
 * Width x Width matrices of float.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread computes one element of
 * Pd at (row, col) = (blockIdx.y * blockDim.y + threadIdx.y,
 *                     blockIdx.x * blockDim.x + threadIdx.x).
 * Threads that fall outside the matrix do nothing, so the grid may
 * over-cover Width. No shared memory is used.
 *
 * @param Md    left operand, Width * Width floats (read only)
 * @param Nd    right operand, Width * Width floats (read only)
 * @param Pd    output, Width * Width floats
 * @param Width number of rows/columns of each matrix
 */
__global__ void MatMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the grid is rounded up to whole blocks.
    if (row >= Width || col >= Width)
        return;

    // size_t offsets avoid int overflow in row * Width for large matrices.
    const size_t rowBase = (size_t)row * Width;

    float Pvalue = 0.0f;
    for (int k = 0; k < Width; ++k)
    {
        Pvalue += Md[rowBase + k] * Nd[(size_t)k * Width + col];
    }
    Pd[rowBase + col] = Pvalue;
}

/* Aborts the program with a readable message if a CUDA runtime call failed. */
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Builds two MATRIX_SIZE x MATRIX_SIZE input matrices on the host, multiplies
 * them on the GPU, prints the result matrix and the wall-clock time spent on
 * device allocation, transfers, the kernel, and device frees.
 *
 * @return 0 on success, EXIT_FAILURE if CUDA initialization or host
 *         allocation fails (CUDA runtime failures terminate via checkCuda).
 */
int main(int argc, char* argv[])
{
    struct timeval t0, t1;

    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    const size_t size = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t mem_size = sizeof(float) * size;

    float* A = (float*) malloc(mem_size);
    float* B = (float*) malloc(mem_size);
    float* C = (float*) malloc(mem_size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n", (unsigned long)mem_size);
        free(A);   // free(NULL) is a no-op
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    float* Md = NULL;
    float* Nd = NULL;
    float* Pd = NULL;

    // Fill the inputs: A[row][col] = (row+1)(col+1)/N, B[row][col] = (col+1)/(row+1).
    for (size_t i = 0; i < size; i++)
    {
        const size_t row = i / MATRIX_SIZE;
        const size_t col = i % MATRIX_SIZE;
        A[i] = (float)(((row + 1.0) * (col + 1.0)) / MATRIX_SIZE);
        B[i] = (float)((col + 1.0) / (row + 1.0));
    }

    gettimeofday(&t0, 0);

    checkCuda(cudaMalloc((void**) &Md, mem_size), "cudaMalloc(Md)");
    checkCuda(cudaMalloc((void**) &Nd, mem_size), "cudaMalloc(Nd)");
    checkCuda(cudaMalloc((void**) &Pd, mem_size), "cudaMalloc(Pd)");

    checkCuda(cudaMemcpy(Md, A, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> Md)");
    checkCuda(cudaMemcpy(Nd, B, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> Nd)");

    // A block may hold at most 1024 threads, so tile the matrix with
    // BLOCK_SIZE x BLOCK_SIZE blocks and round the grid up to cover it.
    const unsigned int blocksPerDim = (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(blocksPerDim, blocksPerDim);

    MatMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, MATRIX_SIZE);
    // Launch-configuration errors are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "MatMulKernel launch");

    // Blocking copy: also waits for the kernel and reports execution errors.
    checkCuda(cudaMemcpy(C, Pd, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(Pd -> C)");

    checkCuda(cudaFree(Md), "cudaFree(Md)");
    checkCuda(cudaFree(Nd), "cudaFree(Nd)");
    checkCuda(cudaFree(Pd), "cudaFree(Pd)");

    gettimeofday(&t1, 0);

    printf("\nMatrix C\n");
    for (size_t i = 0; i < size; i++)
    {
        printf("%6.2f ", C[i]);
        if (((i + 1) % MATRIX_SIZE) == 0)
            printf("\n");
    }

    printf("\nTime Results\n");
    const double totalInt = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) * 1.0E-06;
    printf("Total Execution Time:\t%e\n", totalInt);

    free(A);
    free(B);
    free(C);

    return 0;
}