Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <CL/cl.h>
- using namespace std;
- #define TILE_WIDTH 16
- #define PROGRAM_FILE "kernel.cl"
/*
 * Fill the first `size` elements of `data` with pseudo-random values, each
 * computed as rand() divided by RAND_MAX (so every value lies in [0, 1]).
 * The generator is not seeded here; callers control the sequence via srand().
 * Nothing is written when `size` is zero or negative.
 */
void randomFloats(float* data, int size){
    float *cursor = data;
    int remaining = size;

    while (remaining > 0) {
        *cursor = rand() / (float)RAND_MAX;
        ++cursor;
        --remaining;
    }
}
- cl_device_id create_device() {
- cl_platform_id platform;
- cl_device_id dev;
- int err;
- /* Identify a platform */
- err = clGetPlatformIDs(1, &platform, NULL);
- if(err < 0) {
- perror("Couldn't identify a platform");
- exit(1);
- }
- /* Access a device */
- err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
- //if(err == CL_DEVICE_NOT_FOUND) {
- // err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
- //}
- if(err < 0) {
- perror("Couldn't access any devices");
- exit(1);
- }
- return dev;
- }
- /* Create program from a file and compile it */
- cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
- cl_program program;
- FILE *program_handle;
- char *program_buffer, *program_log;
- size_t program_size, log_size;
- int err;
- /* Read program file and place content into buffer */
- program_handle = fopen(filename, "r");
- if(program_handle == NULL) {
- perror("Couldn't find the program file");
- exit(1);
- }
- fseek(program_handle, 0, SEEK_END);
- program_size = ftell(program_handle);
- rewind(program_handle);
- program_buffer = (char*)malloc(program_size + 1);
- program_buffer[program_size] = '\0';
- fread(program_buffer, sizeof(char), program_size, program_handle);
- fclose(program_handle);
- /* Create program from file */
- program = clCreateProgramWithSource(ctx, 1,
- (const char**)&program_buffer, &program_size, &err);
- if(err < 0) {
- perror("Couldn't create the program");
- exit(1);
- }
- free(program_buffer);
- /* Build program */
- err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
- if(err < 0) {
- /* Find size of log and print to std output */
- clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
- 0, NULL, &log_size);
- program_log = (char*) malloc(log_size + 1);
- program_log[log_size] = '\0';
- clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
- log_size + 1, program_log, NULL);
- printf("%d: %s\n", err, program_log);
- free(program_log);
- exit(1);
- }
- size_t nbread;
- int nb_devices;
- err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(size_t), &nb_devices, &nbread);// Return 1 devices
- size_t *np = new size_t[nb_devices];//Create size array
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nb_devices, np, &nbread);//Load in np the size of my binary
- char** bn = new char* [nb_devices]; //Create the binary array
- for(int i =0; i < nb_devices;i++) bn[i] = new char[np[i]]; // I know... it's bad... but if i use new char[np[i]], i have a segfault... :/
- err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *)*nb_devices, bn, &nbread); //Load the binary itself
- FILE *fp = fopen("binar.bin", "wb");
- fwrite(bn[0], sizeof(char), np[0], fp);
- fclose(fp);
- return program;
- }
- int main(int argc, char ** argv) {
- float * hostA; // The A matrix
- float * hostB; // The B matrix
- float * hostC; // The output C matrix
- cl_mem deviceA;
- cl_mem deviceB;
- cl_mem deviceC;
- int numARows; // number of rows in the matrix A
- int numAColumns; // number of columns in the matrix A
- int numBRows; // number of rows in the matrix B
- int numBColumns; // number of columns in the matrix B
- int numCRows; // number of rows in the matrix C
- int numCColumns; // number of columns in the matrix C
- ofstream fout("output.raw");
- cl_device_id device;
- cl_context context;
- cl_program program;
- cl_kernel matrixMultiplyShared;
- cl_command_queue queue;
- cl_int err;
- /* Create device and determine local size */
- device = create_device();
- /* Create a context */
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
- if(err < 0) {
- perror("Couldn't create a context");
- exit(1);
- }
- /* Build program */
- program = build_program(context, device, PROGRAM_FILE);
- /* Create a command queue */
- queue = clCreateCommandQueue(context, device,
- CL_QUEUE_PROFILING_ENABLE, &err);
- if(err < 0) {
- perror("Couldn't create a command queue");
- exit(1);
- };
- /* Create kernels */
- matrixMultiplyShared = clCreateKernel(program, "matrixMultiplyShared", &err);
- if(err < 0) {
- perror("Couldn't create a kernel");
- exit(1);
- };
- numARows = numAColumns = 4096;
- numBRows = numBColumns = 4096;
- hostA = (float*)malloc(numARows*numAColumns * sizeof(float));
- hostB = (float*)malloc(numBRows*numBColumns * sizeof(float));
- randomFloats(hostA, numARows*numAColumns);
- randomFloats(hostB, numBRows*numBColumns);
- //@@ Set numCRows and numCColumns
- numCRows = numARows;
- numCColumns = numBColumns;
- //@@ Allocate the hostC matrix
- hostC = (float*)malloc(numCColumns * numCRows * sizeof(float));
- //@@ Allocate GPU memory here
- deviceA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numARows * numAColumns * sizeof(float), hostA, &err);
- deviceB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numBRows * numBColumns * sizeof(float), hostB, &err);
- deviceC = clCreateBuffer(context, CL_MEM_READ_WRITE, numCRows * numCColumns * sizeof(float), NULL, &err);
- //@@ Initialize the grid and block dimensions here
- err = clSetKernelArg(matrixMultiplyShared, 0, sizeof(cl_mem), &deviceA);
- err |= clSetKernelArg(matrixMultiplyShared, 1, sizeof(cl_mem), &deviceB);
- err |= clSetKernelArg(matrixMultiplyShared, 2, sizeof(cl_mem), &deviceC);
- err |= clSetKernelArg(matrixMultiplyShared, 3, sizeof(int), &numARows);
- err |= clSetKernelArg(matrixMultiplyShared, 4, sizeof(int), &numAColumns);
- err |= clSetKernelArg(matrixMultiplyShared, 5, sizeof(int), &numBRows);
- err |= clSetKernelArg(matrixMultiplyShared, 6, sizeof(int), &numBColumns);
- err |= clSetKernelArg(matrixMultiplyShared, 7, sizeof(int), &numCRows);
- err |= clSetKernelArg(matrixMultiplyShared, 8, sizeof(int), &numCColumns);
- size_t local_size[2] = {16, 16};
- size_t global_size[2] = {((numBColumns+local_size[0]-1)/local_size[0])*local_size[0], ((numARows+local_size[1]-1)/local_size[1])*local_size[1]};
- cl_event timeEvent;
- cl_ulong timeStart, timeEnd;
- //@@ Launch the GPU Kernel here
- err = clEnqueueNDRangeKernel(queue, matrixMultiplyShared, 2, NULL, global_size, local_size, 0, NULL, NULL);
- clFinish(queue);
- //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &timeStart, NULL);
- //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &timeEnd, NULL);
- //printf("timeK1: %f\n",(double)(timeEnd - timeStart)/1e9);
- //@@ Copy the GPU memory back to the CPU here
- clEnqueueReadBuffer(queue, deviceC, CL_TRUE, 0,
- numCRows*numCColumns*sizeof(float), hostC, 0, NULL, NULL);
- clFinish(queue);
- //@@ Free the GPU memory here
- clReleaseMemObject(deviceA);
- clReleaseMemObject(deviceB);
- clReleaseMemObject(deviceC);
- /*
- for(int i=0; i<numCRows; i++){
- for(int j=0; j<numCColumns; j++){
- fout << hostC[i*numCColumns+j] << " ";
- }
- fout << "\n";
- }
- */
- free(hostA);
- free(hostB);
- free(hostC);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement