// [Pastebin page chrome, not part of the program]
// Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
/*
 * Matt Michels
 * 10/23/11
 * lab2 #1a
 */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include <cuda_runtime.h>
/*
 * Compile-time problem dimensions.
 *
 * NOTE: these are object-like macros, so the preprocessor replaces every
 * later occurrence of the tokens wA, hA, wB, hB, wC and hC in this file with
 * the value below.  Do not reuse these spellings as parameter or local
 * variable names (e.g. "int wA" would expand to "int 3").
 */
#define BLOCK_SIZE 3 // Threads per block along each of x and y

#define wA 3  // Matrix A width  (columns)
#define hA 3  // Matrix A height (rows)
#define wB 3  // Matrix B width  (columns)
#define hB wA // Matrix B height: must equal A's width for A * B to exist
#define wC wB // Matrix C width:  the product has as many columns as B
#define hC hA // Matrix C height: the product has as many rows as A
/*
 * Fill the first `size` entries of `data` with pseudo-random whole numbers
 * in the range 1..10, drawn from rand().
 *
 * data: host buffer with room for at least `size` floats.
 * size: number of elements to write; zero or negative writes nothing.
 *
 * The sequence is determined by the current rand() state, so callers that
 * need reproducible data must call srand() first.  A NULL buffer is ignored
 * instead of being dereferenced.
 */
void randomInit(float* data, int size)
{
    if (data == NULL)
        return;

    for (int i = 0; i < size; ++i)
        data[i] = (float) (rand() % 10 + 1);
}
/*
 * Naive dense matrix multiply kernel: C = A * B, all matrices row-major.
 *
 * C:      output matrix, (rows of A) x widthB floats.
 * A:      left operand,  (rows of A) x widthA floats.
 * B:      right operand, widthA x widthB floats.
 * widthA: number of columns in A (and therefore rows in B).
 * widthB: number of columns in B (and therefore columns in C).
 *
 * Launch layout: a 2D grid of 2D blocks in which each thread produces one
 * element of C.  x selects the column of C, y selects the row.  Threads
 * whose column falls outside C exit without writing.  The kernel is not told
 * the height of A, so the launch must not supply more threads in y than A
 * has rows.
 *
 * The pointers are declared __restrict__: C must not overlap A or B.
 *
 * The width parameters are deliberately not spelled wA / wB: this file
 * defines object-like macros with those names, which would rewrite the
 * parameter list into "int 3, int 3".
 */
__global__ void matrixMul(float* __restrict__ C,
                          const float* __restrict__ A,
                          const float* __restrict__ B,
                          int widthA, int widthB)
{
    // Global coordinates of the output element owned by this thread.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col >= widthB)
        return;

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int k = 0; k < widthA; ++k)
        acc += A[row * widthA + k] * B[k * widthB + col];

    // C has widthB columns, so that is its row stride.
    C[row * widthB + col] = acc;
}
/* Exit with a readable message when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Allocate `bytes` of host memory, exiting if the allocation fails. */
static float* allocHostMatrix(size_t bytes)
{
    float* buffer = (float*) malloc(bytes);
    if (buffer == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long) bytes);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

/* Print `count` floats under a heading, breaking the line every `width` values. */
static void printMatrix(const char* title, const float* data,
                        unsigned int count, unsigned int width)
{
    printf("\n\n%s\n", title);
    for (unsigned int i = 0; i < count; ++i)
    {
        printf("%f ", data[i]);
        if (((i + 1) % width) == 0)
            printf("\n");
    }
}

/*
 * Program entry point.
 *
 * Fills A and B with reproducible pseudo-random values (fixed seed), prints
 * them, multiplies them on the GPU with matrixMul, and prints the product C.
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails, or if the matrix size does not tile evenly into blocks.
 */
int main(int argc, char** argv)
{
    (void) argc;
    (void) argv;

    // The grid below uses exact division, so a partial tile would leave part
    // of C uncomputed.  Refuse such a configuration up front.
    if ((wC % BLOCK_SIZE) != 0 || (hC % BLOCK_SIZE) != 0)
    {
        fprintf(stderr,
                "Matrix C (%d x %d) is not a multiple of BLOCK_SIZE (%d)\n",
                hC, wC, BLOCK_SIZE);
        return EXIT_FAILURE;
    }

    srand(1020);

    // Host-side inputs.
    const unsigned int size_A = wA * hA;
    const size_t matrix_size_A = sizeof(float) * size_A;
    float* h_A = allocHostMatrix(matrix_size_A);

    const unsigned int size_B = wB * hB;
    const size_t matrix_size_B = sizeof(float) * size_B;
    float* h_B = allocHostMatrix(matrix_size_B);

    // A is filled before B so the rand() sequence maps to the same elements
    // on every run.
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    printMatrix("Matrix A", h_A, size_A, wA);
    printMatrix("Matrix B", h_B, size_B, wB);

    // Device-side inputs.
    float* d_A = NULL;
    float* d_B = NULL;
    checkCuda(cudaMalloc((void**) &d_A, matrix_size_A), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**) &d_B, matrix_size_B), "cudaMalloc(d_B)");
    checkCuda(cudaMemcpy(d_A, h_A, matrix_size_A, cudaMemcpyHostToDevice),
              "copy of A to device");
    checkCuda(cudaMemcpy(d_B, h_B, matrix_size_B, cudaMemcpyHostToDevice),
              "copy of B to device");

    // Result buffers.
    const unsigned int size_C = wC * hC;
    const size_t matrix_size_C = sizeof(float) * size_C;
    float* h_C = allocHostMatrix(matrix_size_C);
    float* d_C = NULL;
    checkCuda(cudaMalloc((void**) &d_C, matrix_size_C), "cudaMalloc(d_C)");

    // One thread per element of C.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(wC / threads.x, hC / threads.y);

    // execute the kernel
    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, wA, wB);
    // A launch reports configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrixMul launch");

    // Blocking copy: waits for the kernel and surfaces any execution error.
    checkCuda(cudaMemcpy(h_C, d_C, matrix_size_C, cudaMemcpyDeviceToHost),
              "copy of C to host");

    printMatrix("Matrix C (Results)", h_C, size_C, wC);
    printf("\n");

    free(h_A);
    free(h_B);
    free(h_C);
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}
// [Pastebin page chrome, not part of the program]
// Add Comment. Please, Sign In to add comment.