Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // MatrixMultiplication.cpp: Defines the entry point for the console application.
- //
- // CPU - Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
- // Memory - 16.0 GB
- //
- // Platform Number - 1
- // Platform Name - AMD Accelerated Parallel Processing
- // Platform vendor - Advanced Micro Devices, Inc.
- //
- // Device Type - CL_DEVICE_TYPE_GPU
- // Device Name - Ellesmere (Radeon RX480)
- // CL_DEVICE_MAX_WORK_GROUP_SIZE - 256
- // CL_DEVICE_GLOBAL_MEM_SIZE - 3221225472
- // CL_DEVICE_LOCAL_MEM_SIZE - 32768
- // CL_DEVICE_MAX_MEM_ALLOC_SIZE - 3221225472
- //
- #include <CL/cl.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <string.h>
/* Edge length of the square matrices used by the benchmark. */
#define N 1024

/*
 * Matrix shapes for C = A * B.  They are variables rather than macros
 * because the OpenCL variants pass the addresses of ROW_A, COL_A and COL_B
 * to clSetKernelArg.
 */
int ROW_A = N;
int COL_A = N;
int ROW_B = N; /* must equal COL_A; written as a constant so it is valid C as well as C++ */
int COL_B = N;

/* Timestamps shared by every variant to time its compute section. */
clock_t start, end;

/* Loads the file file_name into a heap buffer and stores its length in *len. */
char *get_source_code(const char *file_name, size_t *len);

/* Compares two ROW_A x COL_B matrices and prints the verdict; returns 1 when equal. */
int mat_equal(int* A, int* B);

/* Multiplication variants: each returns a newly malloc'ed result matrix. */
int* mat_mul_seq(int* A, int* B);
int* mat_mul_seq_2(int* A, int* B);
int* mat_mul_opencl(int* A, int* B);
int* mat_mul_opencl_2(int* A, int* B);
int* mat_mul_opencl_3(int* A, int* B);
int* mat_mul_opencl_4(int* A, int* B);
int* mat_mul_opencl_5(int* A, int* B);
int* mat_mul_opencl_6(int* A, int* B);
int* mat_mul_strassen(int* A, int* B);

/* Strassen building blocks; all operate on row-major n x n int matrices. */
int *strassen(int *C, int *A, int *B, int n);
void strassen_result(int *C, int *c11, int *c12, int *c21, int *c22, int n);
int *strassen_add(int *res, int *x, int *y, int n);
int *strassen_sub(int *res, int *x, int *y, int n);
int *strassen_mul(int *res, int *x, int *y, int n);
- int main()
- {
- int* A = (int*)malloc(sizeof(int) * ROW_A * COL_A);
- int* B = (int*)malloc(sizeof(int) * ROW_B * COL_B);
- int *C, *D, *E, *F, *G, *H, *I, *J, *K;
- int i, j;
- printf("Make matrix A...\n");
- for (i = 0; i < ROW_A * COL_A; i++) A[i] = rand() % 10;
- printf("Make matrix B...\n\n");
- for (i = 0; i < ROW_B * COL_B; i++) B[i] = rand() % 10;
- printf("Sequential version 1...\n");
- C = mat_mul_seq(A, B);
- printf("\n");
- printf("Sequential version 2...\n");
- D = mat_mul_seq_2(A, B);
- mat_equal(C, D);
- printf("\n");
- printf("OpenCL version 1...\n");
- E = mat_mul_opencl(A, B);
- mat_equal(C, E);
- printf("\n");
- printf("OpenCL version 2...\n");
- F = mat_mul_opencl_2(A, B);
- mat_equal(C, F);
- printf("\n");
- printf("OpenCL version 3...\n");
- G = mat_mul_opencl_3(A, B);
- mat_equal(C, G);
- printf("\n");
- printf("OpenCL version 4...\n");
- H = mat_mul_opencl_4(A, B);
- mat_equal(C, H);
- printf("\n");
- printf("OpenCL version 5...\n");
- I = mat_mul_opencl_5(A, B);
- mat_equal(C, I);
- printf("\n");
- printf("OpenCL version 6...\n");
- J = mat_mul_opencl_6(A, B);
- mat_equal(C, J);
- printf("\n");
- printf("Strassen version...\n");
- K = mat_mul_strassen(A, B);
- mat_equal(C, K);
- printf("\n");
- free(A);
- free(B);
- free(C);
- free(D);
- free(E);
- free(F);
- free(G);
- free(H);
- free(I);
- free(J);
- free(K);
- return 0;
- }
- int mat_equal(int* A, int* B) {
- int i;
- for (i = 0; i < ROW_A * COL_B; i++)
- if (A[i] != B[i]) {
- printf("Calculation Fail!\n");
- return 0;
- }
- printf("Calculation Succeed!\n");
- return 1;
- }
- int* mat_mul_seq(int* A, int* B) {
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- int i, j, k;
- start = clock();
- for (i = 0; i < ROW_A; i++)
- for (j = 0; j < COL_B; j++) {
- C[i * COL_B + j] = 0;
- for (k = 0; k < COL_A; k++)
- C[i * COL_B + j] += A[i * COL_A + k] * B[k * COL_B + j];
- }
- end = clock();
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_seq_2(int* A, int* B) {
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- int i, j, k, tmp;
- for (i = 0; i < ROW_A * COL_B; i++) C[i] = 0;
- start = clock();
- for (i = 0; i < ROW_A; i++)
- for (j = 0; j < COL_B; j++) {
- tmp = 0;
- for (k = 0; k < COL_A; k++)
- tmp += A[i * COL_A + k] * B[k * COL_B + j];
- C[i * COL_B + j] = tmp;
- }
- end = clock();
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
/*
 * Terminates the program when an OpenCL status code signals failure.
 *
 * Prints the source location and the numeric error code, then exits with
 * EXIT_FAILURE.  Wrapped in do { } while (0) so that `CHECK_ERROR(x);`
 * behaves as a single statement, including as the body of an if/else.
 */
#define CHECK_ERROR(err) \
    do { \
        if ((err) != CL_SUCCESS) { \
            printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, (err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
/*
 * Loads an entire file into a NUL-terminated heap buffer.
 *
 * file_name: path of the file to read.
 * len:       receives the number of bytes read (terminator not counted).
 *
 * Returns the buffer; the caller owns it and must free() it.  Any failure
 * (open, size query, allocation, short read) prints a message and exits
 * with EXIT_FAILURE.
 *
 * The file is read in binary mode so the size reported by ftell() matches
 * the number of bytes fread() delivers on every platform.  The previous
 * text-mode read compensated for CRLF translation by subtracting the
 * newline count, which cut the tail off files with plain LF line endings.
 */
char *get_source_code(const char *file_name, size_t *len) {
    char *source_code;
    long file_size;
    size_t bytes_read;
    FILE *file = fopen(file_name, "rb");

    if (file == NULL) {
        printf("[%s:%d] Failed to open %s ", __FILE__, __LINE__, file_name);
        exit(EXIT_FAILURE);
    }

    if (fseek(file, 0, SEEK_END) != 0 || (file_size = ftell(file)) < 0) {
        printf("[%s:%d] Failed to determine the size of %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }
    rewind(file);

    source_code = (char*)malloc((size_t)file_size + 1);
    if (source_code == NULL) {
        printf("[%s:%d] Failed to allocate a buffer for %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    bytes_read = fread(source_code, 1, (size_t)file_size, file);
    if (bytes_read != (size_t)file_size) {
        printf("[%s:%d] Failed to read %s\n", __FILE__, __LINE__, file_name);
        free(source_code);
        fclose(file);
        exit(EXIT_FAILURE);
    }
    source_code[bytes_read] = '\0';

    fclose(file);
    *len = bytes_read;
    return source_code;
}
- int* mat_mul_opencl(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[2] = { ROW_A, COL_B };
- size_t local_size[2] = { 1, 1 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 2, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_opencl_2(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[2] = { ROW_A, COL_B };
- size_t local_size[2] = { 1, 1 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- for (i = 0; i < ROW_A * COL_B; i++) C[i] = 0;
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel2.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 2, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_opencl_3(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[1] = { ROW_A };
- size_t local_size[1] = { 256 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- for (i = 0; i < ROW_A * COL_B; i++) C[i] = 0;
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel3.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 1, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_opencl_4(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[2] = { ROW_A, COL_B };
- size_t local_size[2] = { 16, 16 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel4.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 2, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_opencl_5(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[1] = { ROW_A };
- size_t local_size[1] = { 256 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- for (i = 0; i < ROW_A * COL_B; i++) C[i] = 0;
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel5.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 6, sizeof(int) * COL_B, NULL);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 1, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_opencl_6(int* A, int* B) {
- cl_uint num_platforms;
- cl_platform_id *platforms;
- cl_uint num_devices;
- char str[1024];
- cl_device_type device_type;
- cl_device_id device;
- cl_context context;
- cl_command_queue queue;
- cl_program program;
- char *kernel_source;
- size_t kernel_source_size;
- cl_kernel kernel_mat_mul;
- cl_mem bufA, bufB, bufC;
- cl_int err;
- size_t global_size[2] = { ROW_A, COL_B };
- size_t local_size[2] = { 256, 1 };
- int i;
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- for (i = 0; i < ROW_A * COL_B; i++) C[i] = 0;
- err = clGetPlatformIDs(0, NULL, &num_platforms);
- CHECK_ERROR(err);
- platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
- err = clGetPlatformIDs(num_platforms, platforms, NULL);
- CHECK_ERROR(err);
- err = clGetDeviceIDs(platforms[1], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
- CHECK_ERROR(err);
- // Create Context
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- CHECK_ERROR(err);
- // Create Command Queue
- queue = clCreateCommandQueue(context, device, 0, &err);
- CHECK_ERROR(err);
- // Create Program Object
- kernel_source = get_source_code("kernel6.cl", &kernel_source_size);
- program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, &kernel_source_size, &err);
- CHECK_ERROR(err);
- // Build Program
- err = clBuildProgram(program, 1, &device, "", NULL, NULL);
- CHECK_ERROR(err);
- // Create Kernel
- kernel_mat_mul = clCreateKernel(program, "mat_mul", &err);
- CHECK_ERROR(err);
- // Write Buffer
- bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_A, A, &err);
- CHECK_ERROR(err);
- bufB = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_B * COL_B, B, &err);
- CHECK_ERROR(err);
- bufC = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) * ROW_A * COL_B, C, &err);
- CHECK_ERROR(err);
- // Set Kernel Arg
- err = clSetKernelArg(kernel_mat_mul, 0, sizeof(cl_mem), &bufA);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 1, sizeof(cl_mem), &bufB);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 2, sizeof(cl_mem), &bufC);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 3, sizeof(int), &ROW_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 4, sizeof(int), &COL_B);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 5, sizeof(int), &COL_A);
- CHECK_ERROR(err);
- err = clSetKernelArg(kernel_mat_mul, 6, sizeof(int) * COL_B, NULL);
- CHECK_ERROR(err);
- start = clock();
- // Execute Kernel
- clEnqueueNDRangeKernel(queue, kernel_mat_mul, 2, NULL, global_size, local_size, 0, NULL, NULL);
- err = clFinish(queue);
- CHECK_ERROR(err);
- end = clock();
- // Read Buffer
- err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * ROW_A * COL_B, C, 0, NULL, NULL);
- CHECK_ERROR(err);
- err = clReleaseKernel(kernel_mat_mul);
- CHECK_ERROR(err);
- err = clReleaseProgram(program);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufA);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufB);
- CHECK_ERROR(err);
- err = clReleaseMemObject(bufC);
- CHECK_ERROR(err);
- err = clReleaseCommandQueue(queue);
- CHECK_ERROR(err);
- err = clReleaseContext(context);
- CHECK_ERROR(err);
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int* mat_mul_strassen(int* A, int* B) {
- int* C = (int*)malloc(sizeof(int) * ROW_A * COL_B);
- int i, j, k;
- start = clock();
- C = strassen(C, A, B, N);
- end = clock();
- printf("Elapsed time: %f sec\n", (double)(end - start) / CLK_TCK);
- return C;
- }
- int *strassen(int *C, int *A, int *B, int n)
- {
- int nn = n / 2;
- int i, j;
- int size = nn * nn;
- int *buffer = (int *)malloc(sizeof(int) * size * 21);
- int *a11 = buffer,
- *a12 = buffer + size,
- *a21 = buffer + size * 2,
- *a22 = buffer + size * 3,
- *b11 = buffer + size * 4,
- *b12 = buffer + size * 5,
- *b21 = buffer + size * 6,
- *b22 = buffer + size * 7,
- *c11 = buffer + size * 8,
- *c12 = buffer + size * 9,
- *c21 = buffer + size * 10,
- *c22 = buffer + size * 11,
- *m1 = buffer + size * 12,
- *m2 = buffer + size * 13,
- *m3 = buffer + size * 14,
- *m4 = buffer + size * 15,
- *m5 = buffer + size * 16,
- *m6 = buffer + size * 17,
- *m7 = buffer + size * 18,
- *AA = buffer + size * 19,
- *BB = buffer + size * 20;
- for (i = 0; i < nn; i++) {
- for (j = 0; j < nn; j++) {
- a11[i * nn + j] = A[i * n + j];
- a12[i * nn + j] = A[i * n + j + nn];
- a21[i * nn + j] = A[(i + nn) * n + j];
- a22[i * nn + j] = A[(i + nn) * n + j + nn];
- b11[i * nn + j] = B[(i * n) + j];
- b12[i * nn + j] = B[(i * n) + j + nn];
- b21[i * nn + j] = B[(i + nn) * n + j];
- b22[i * nn + j] = B[(i + nn) * n + j + nn];
- }
- }
- strassen_mul(m1, strassen_add(AA, a11, a22, nn), strassen_add(BB, b11, b22, nn), nn);
- strassen_mul(m2, strassen_add(AA, a21, a22, nn), b11, nn);
- strassen_mul(m3, a11, strassen_sub(BB, b12, b22, nn), nn);
- strassen_mul(m4, a22, strassen_sub(BB, b21, b11, nn), nn);
- strassen_mul(m5, strassen_add(AA, a11, a12, nn), b22, nn);
- strassen_mul(m6, strassen_sub(AA, a21, a11, nn), strassen_add(BB, b11, b12, nn), nn);
- strassen_mul(m7, strassen_sub(AA, a12, a22, nn), strassen_add(BB, b21, b22, nn), nn);
- strassen_sub(c11, strassen_add(AA, m1, m4, nn), strassen_sub(BB, m5, m7, nn), nn);
- strassen_add(c12, m3, m5, nn);
- strassen_add(c21, m2, m4, nn);
- strassen_add(c22, strassen_sub(AA, m1, m2, nn), strassen_add(BB, m3, m6, nn), nn);
- strassen_result(C, c11, c12, c21, c22, nn);
- free(buffer);
- return C;
- }
/*
 * Assembles the 2n x 2n row-major matrix C from its four n x n quadrants:
 * c11 top-left, c12 top-right, c21 bottom-left, c22 bottom-right.
 */
void strassen_result(int *C, int *c11, int *c12, int *c21, int *c22, int n) {
    const int width = 2 * n; /* row length of C */
    int row, col;

    for (row = 0; row < n; row++) {
        int *top = C + row * width;          /* row `row` of C */
        int *bottom = C + (row + n) * width; /* row `row + n` of C */

        for (col = 0; col < n; col++) {
            const int src = row * n + col;

            top[col] = c11[src];
            top[col + n] = c12[src];
            bottom[col] = c21[src];
            bottom[col + n] = c22[src];
        }
    }
}
/*
 * Element-wise sum of two row-major n x n matrices: res = x + y.
 *
 * res may be the same buffer as x or y (in-place use).  The former memset
 * of res was redundant, because every element is overwritten, and it wiped
 * the operand whenever res aliased x or y.
 *
 * Returns res.  Does nothing when n <= 0.
 */
int *strassen_add(int *res, int *x, int *y, int n) {
    int idx, count;

    if (n <= 0) {
        return res;
    }
    count = n * n;
    for (idx = 0; idx < count; idx++) {
        res[idx] = x[idx] + y[idx];
    }
    return res;
}
/*
 * Element-wise difference of two row-major n x n matrices: res = x - y.
 *
 * res may be the same buffer as x or y (in-place use).  The former memset
 * of res was redundant, because every element is overwritten, and it wiped
 * the operand whenever res aliased x or y.
 *
 * Returns res.  Does nothing when n <= 0.
 */
int *strassen_sub(int *res, int *x, int *y, int n) {
    int idx, count;

    if (n <= 0) {
        return res;
    }
    count = n * n;
    for (idx = 0; idx < count; idx++) {
        res[idx] = x[idx] - y[idx];
    }
    return res;
}
- int *strassen_mul(int *res, int *a, int *b, int n) {
- int m1, m2, m3, m4, m5, m6, m7;
- if (n == 2) {
- m1 = (a[0 * n + 0] + a[1 * n + 1]) * (b[0 * n + 0] + b[1 * n + 1]);
- m2 = (a[1 * n + 0] + a[1 * n + 1]) * b[0 * n + 0];
- m3 = a[0 * n + 0] * (b[0 * n + 1] - b[1 * n + 1]);
- m4 = a[1 * n + 1] * (b[1 * n + 0] - b[0 * n + 0]);
- m5 = (a[0 * n + 0] + a[0 * n + 1]) * b[1 * n + 1];
- m6 = (a[1 * n + 0] - a[0 * n + 0]) * (b[0 * n + 0] + b[0 * n + 1]);
- m7 = (a[0 * n + 1] - a[1 * n + 1]) * (b[1 * n + 0] + b[1 * n + 1]);
- res[0 * n + 0] = m1 + m4 - m5 + m7;
- res[0 * n + 1] = m3 + m5;
- res[1 * n + 0] = m2 + m4;
- res[1 * n + 1] = m1 - m2 + m3 + m6;
- return res;
- }
- return strassen(res, a, b, n);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement