Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define BLOCK_SIZE 32
- #define MATRIX_SIZE 4096
- #include <stdio.h>
- #include <stdlib.h>
- #include <cuda_runtime.h>
- #include <cutil.h>
- #include <sys/time.h>
/*
 * Selects the first CUDA device whose reported compute-capability major
 * version is at least 1 and makes it the current device.
 *
 * Returns true when a device was selected; otherwise prints a diagnostic to
 * stderr and returns false. Unlike a bare call sequence, the results of
 * cudaGetDeviceCount and cudaSetDevice are checked, so a failure in either
 * is reported instead of being silently treated as success.
 */
bool InitCUDA(void)
{
    int count = 0;
    cudaError_t status = cudaGetDeviceCount(&count);

    /* cudaErrorNoDevice is handled by the count == 0 branch below so the
     * original "There is no device." message is preserved for that case. */
    if (status != cudaSuccess && status != cudaErrorNoDevice) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(status));
        return false;
    }
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    /* Stop at the first device whose properties can be read and that
     * reports major >= 1; device == count afterwards means none matched. */
    int device = 0;
    for (; device < count; device++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, device) == cudaSuccess &&
            prop.major >= 1) {
            break;
        }
    }
    if (device == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    status = cudaSetDevice(device);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", device,
                cudaGetErrorString(status));
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}
/*
 * Naive dense matrix product Pd = Md * Nd for square Width x Width matrices
 * stored row-major in contiguous float arrays.
 *
 * Launch layout: a 2D grid of 2D blocks covering at least Width threads in
 * both x (columns) and y (rows). Each thread computes one output element;
 * threads that fall outside the matrix return without touching memory, so
 * Width does not have to be a multiple of the block dimensions. With a
 * single-block launch the indices reduce to threadIdx.x / threadIdx.y.
 * No shared memory is used.
 */
__global__ void MatMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    /* Guard the grid tail: the grid is sized by ceil-division. */
    if (row >= Width || col >= Width) {
        return;
    }

    float Pvalue = 0.0f;
    for (int k = 0; k < Width; ++k) {
        Pvalue += Md[row * Width + k] * Nd[k * Width + col];
    }
    Pd[row * Width + col] = Pvalue;
}

/* Prints a diagnostic and terminates the process if a CUDA runtime call
 * did not return cudaSuccess. `what` names the failed operation. */
static void exitOnCudaError(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills two MATRIX_SIZE x MATRIX_SIZE host matrices, multiplies them on the
 * GPU with MatMulKernel, prints the product and the wall-clock time spent on
 * device allocation, transfers, the kernel and device deallocation.
 *
 * Returns 0 on success and EXIT_FAILURE if CUDA initialisation or a host
 * allocation fails; a failing CUDA call terminates via exitOnCudaError.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    struct timeval t0, t1;

    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    /* size_t keeps the element and byte counts safe from 32-bit overflow
     * if MATRIX_SIZE is increased. */
    const size_t size = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    const size_t mem_size = sizeof(float) * size;

    float* A = (float*)malloc(mem_size);
    float* B = (float*)malloc(mem_size);
    float* C = (float*)malloc(mem_size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)mem_size);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /* A[row][col] = (row+1)(col+1)/MATRIX_SIZE, B[row][col] = (col+1)/(row+1). */
    for (size_t i = 0; i < size; i++) {
        const int row = (int)(i / MATRIX_SIZE);
        const int col = (int)(i % MATRIX_SIZE);
        A[i] = (float)(((row + 1.0) * (col + 1.0)) / MATRIX_SIZE);
        B[i] = (float)((col + 1.0) / (row + 1.0));
    }

    float* Md = NULL;
    float* Nd = NULL;
    float* Pd = NULL;

    gettimeofday(&t0, 0);

    exitOnCudaError(cudaMalloc((void**)&Md, mem_size), "cudaMalloc(Md)");
    exitOnCudaError(cudaMalloc((void**)&Nd, mem_size), "cudaMalloc(Nd)");
    exitOnCudaError(cudaMalloc((void**)&Pd, mem_size), "cudaMalloc(Pd)");

    exitOnCudaError(cudaMemcpy(Md, A, mem_size, cudaMemcpyHostToDevice),
                    "cudaMemcpy(A -> Md)");
    exitOnCudaError(cudaMemcpy(Nd, B, mem_size, cudaMemcpyHostToDevice),
                    "cudaMemcpy(B -> Nd)");

    /* BLOCK_SIZE x BLOCK_SIZE threads per block; the grid is rounded up so
     * every matrix element is covered (the kernel guards the overshoot). */
    const unsigned int blocksPerSide =
        (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(blocksPerSide, blocksPerSide);

    MatMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, MATRIX_SIZE);
    /* A launch does not return a status; bad configurations show up here. */
    exitOnCudaError(cudaGetLastError(), "MatMulKernel launch");

    /* Blocking copy: waits for the kernel and reports execution errors. */
    exitOnCudaError(cudaMemcpy(C, Pd, mem_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(Pd -> C)");

    exitOnCudaError(cudaFree(Md), "cudaFree(Md)");
    exitOnCudaError(cudaFree(Nd), "cudaFree(Nd)");
    exitOnCudaError(cudaFree(Pd), "cudaFree(Pd)");

    gettimeofday(&t1, 0);

    printf("\nMatrix C\n");
    for (size_t i = 0; i < size; i++) {
        printf("%6.2f ", C[i]);
        if (((i + 1) % MATRIX_SIZE) == 0) {
            printf("\n");
        }
    }

    printf("\nTime Results\n");
    /* double avoids losing microsecond resolution in the sum. */
    const double totalInt = (double)(t1.tv_sec - t0.tv_sec) +
                            (double)(t1.tv_usec - t0.tv_usec) * 1.0E-06;
    printf("Total Execution Time:\t%e\n", totalInt);

    free(A);
    free(B);
    free(C);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement