Advertisement
Guest User

A Maz

a guest
Jan 21st, 2013
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.70 KB | None | 0 0
  #include <cstdio>
  #include <cstdlib>
  #include <fstream>
  #include <iostream>
  #include <CL/cl.h>
  4.  
  5. using namespace std;
  6.  
  7. #define TILE_WIDTH 16
  8. #define PROGRAM_FILE "kernel.cl"
  9.  
  /*
   * Fill the first `size` entries of `data` with pseudo-random values, each
   * obtained by dividing one rand() result by RAND_MAX.
   * Entries are written in ascending index order, one rand() call per entry.
   * Nothing is written when `size` is zero or negative.
   */
  void randomFloats(float* data, int size){
      int idx = 0;
      while (idx < size) {
          data[idx] = rand() / (float)RAND_MAX;
          ++idx;
      }
  }
  14.  
  15. cl_device_id create_device() {
  16.     cl_platform_id platform;
  17.     cl_device_id dev;
  18.     int err;
  19.    
  20.     /* Identify a platform */
  21.     err = clGetPlatformIDs(1, &platform, NULL);
  22.     if(err < 0) {
  23.         perror("Couldn't identify a platform");
  24.         exit(1);
  25.     }
  26.    
  27.     /* Access a device */
  28.     err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
  29.     //if(err == CL_DEVICE_NOT_FOUND) {
  30.     //    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
  31.     //}
  32.     if(err < 0) {
  33.         perror("Couldn't access any devices");
  34.         exit(1);  
  35.     }
  36.    
  37.     return dev;
  38. }
  39.  
  40. /* Create program from a file and compile it */
  41. cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
  42.    
  43.     cl_program program;
  44.     FILE *program_handle;
  45.     char *program_buffer, *program_log;
  46.     size_t program_size, log_size;
  47.     int err;
  48.    
  49.     /* Read program file and place content into buffer */
  50.     program_handle = fopen(filename, "r");
  51.     if(program_handle == NULL) {
  52.         perror("Couldn't find the program file");
  53.         exit(1);
  54.     }
  55.     fseek(program_handle, 0, SEEK_END);
  56.     program_size = ftell(program_handle);
  57.     rewind(program_handle);
  58.     program_buffer = (char*)malloc(program_size + 1);
  59.     program_buffer[program_size] = '\0';
  60.     fread(program_buffer, sizeof(char), program_size, program_handle);
  61.     fclose(program_handle);
  62.    
  63.     /* Create program from file */
  64.     program = clCreateProgramWithSource(ctx, 1,
  65.                                         (const char**)&program_buffer, &program_size, &err);
  66.     if(err < 0) {
  67.         perror("Couldn't create the program");
  68.         exit(1);
  69.     }
  70.     free(program_buffer);
  71.    
  72.     /* Build program */
  73.     err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
  74.     if(err < 0) {
  75.        
  76.         /* Find size of log and print to std output */
  77.         clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
  78.                               0, NULL, &log_size);
  79.         program_log = (char*) malloc(log_size + 1);
  80.         program_log[log_size] = '\0';
  81.         clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
  82.                               log_size + 1, program_log, NULL);
  83.         printf("%d: %s\n", err, program_log);
  84.         free(program_log);
  85.         exit(1);
  86.     }
  87.     size_t nbread;
  88.     int nb_devices;
  89.     err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(size_t), &nb_devices, &nbread);// Return 1 devices
  90.     size_t *np = new size_t[nb_devices];//Create size array
  91.     err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nb_devices, np, &nbread);//Load in np the size of my binary
  92.     char** bn = new char* [nb_devices]; //Create the binary array
  93.     for(int i =0; i < nb_devices;i++)  bn[i] = new char[np[i]]; // I know... it's bad... but if i use new char[np[i]], i have a segfault... :/
  94.     err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *)*nb_devices, bn, &nbread); //Load the binary itself
  95.     FILE *fp = fopen("binar.bin", "wb");
  96.     fwrite(bn[0], sizeof(char), np[0], fp);
  97.     fclose(fp);
  98.  
  99.     return program;
  100. }
  101.  
  102. int main(int argc, char ** argv) {
  103.     float * hostA; // The A matrix
  104.     float * hostB; // The B matrix
  105.     float * hostC; // The output C matrix
  106.     cl_mem deviceA;
  107.     cl_mem deviceB;
  108.     cl_mem deviceC;
  109.     int numARows; // number of rows in the matrix A
  110.     int numAColumns; // number of columns in the matrix A
  111.     int numBRows; // number of rows in the matrix B
  112.     int numBColumns; // number of columns in the matrix B
  113.     int numCRows; // number of rows in the matrix C
  114.     int numCColumns; // number of columns in the matrix C
  115.     ofstream fout("output.raw");
  116.  
  117.     cl_device_id device;
  118.     cl_context context;
  119.     cl_program program;
  120.     cl_kernel matrixMultiplyShared;
  121.     cl_command_queue queue;
  122.     cl_int err;
  123.    
  124.      /* Create device and determine local size */
  125.     device = create_device();
  126.    
  127.     /* Create a context */
  128.     context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  129.     if(err < 0) {
  130.         perror("Couldn't create a context");
  131.         exit(1);  
  132.     }
  133.    
  134.     /* Build program */
  135.     program = build_program(context, device, PROGRAM_FILE);
  136.    
  137.     /* Create a command queue */
  138.     queue = clCreateCommandQueue(context, device,
  139.                                  CL_QUEUE_PROFILING_ENABLE, &err);
  140.     if(err < 0) {
  141.         perror("Couldn't create a command queue");
  142.         exit(1);  
  143.     };
  144.    
  145.     /* Create kernels */
  146.     matrixMultiplyShared = clCreateKernel(program, "matrixMultiplyShared", &err);
  147.     if(err < 0) {
  148.         perror("Couldn't create a kernel");
  149.         exit(1);
  150.     };
  151.  
  152.     numARows = numAColumns = 4096;
  153.     numBRows = numBColumns = 4096;
  154.  
  155.     hostA = (float*)malloc(numARows*numAColumns * sizeof(float));
  156.     hostB = (float*)malloc(numBRows*numBColumns * sizeof(float));
  157.  
  158.     randomFloats(hostA, numARows*numAColumns);
  159.     randomFloats(hostB, numBRows*numBColumns);
  160.  
  161.     //@@ Set numCRows and numCColumns
  162.     numCRows = numARows;
  163.     numCColumns = numBColumns;
  164.     //@@ Allocate the hostC matrix
  165.     hostC = (float*)malloc(numCColumns * numCRows * sizeof(float));
  166.  
  167.  
  168.     //@@ Allocate GPU memory here
  169.     deviceA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numARows * numAColumns * sizeof(float), hostA, &err);
  170.     deviceB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numBRows * numBColumns * sizeof(float), hostB, &err);
  171.     deviceC = clCreateBuffer(context, CL_MEM_READ_WRITE, numCRows * numCColumns * sizeof(float), NULL, &err);
  172.    
  173.     //@@ Initialize the grid and block dimensions here
  174.     err = clSetKernelArg(matrixMultiplyShared, 0, sizeof(cl_mem), &deviceA);
  175.     err |= clSetKernelArg(matrixMultiplyShared, 1, sizeof(cl_mem), &deviceB);
  176.     err |= clSetKernelArg(matrixMultiplyShared, 2, sizeof(cl_mem), &deviceC);
  177.     err |= clSetKernelArg(matrixMultiplyShared, 3, sizeof(int), &numARows);
  178.     err |= clSetKernelArg(matrixMultiplyShared, 4, sizeof(int), &numAColumns);
  179.     err |= clSetKernelArg(matrixMultiplyShared, 5, sizeof(int), &numBRows);
  180.     err |= clSetKernelArg(matrixMultiplyShared, 6, sizeof(int), &numBColumns);
  181.     err |= clSetKernelArg(matrixMultiplyShared, 7, sizeof(int), &numCRows);
  182.     err |= clSetKernelArg(matrixMultiplyShared, 8, sizeof(int), &numCColumns);
  183.  
  184.     size_t local_size[2] = {16, 16};
  185.     size_t global_size[2] = {((numBColumns+local_size[0]-1)/local_size[0])*local_size[0], ((numARows+local_size[1]-1)/local_size[1])*local_size[1]};
  186.  
  187.     cl_event timeEvent;
  188.     cl_ulong timeStart, timeEnd;
  189.     //@@ Launch the GPU Kernel here
  190.     err = clEnqueueNDRangeKernel(queue, matrixMultiplyShared, 2, NULL, global_size, local_size, 0, NULL, NULL);
  191.     clFinish(queue);
  192.     //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &timeStart, NULL);
  193.     //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &timeEnd, NULL);
  194.     //printf("timeK1: %f\n",(double)(timeEnd - timeStart)/1e9);
  195.    
  196.     //@@ Copy the GPU memory back to the CPU here
  197.     clEnqueueReadBuffer(queue, deviceC, CL_TRUE, 0,
  198.                         numCRows*numCColumns*sizeof(float), hostC, 0, NULL, NULL);
  199.     clFinish(queue);
  200.    
  201.  
  202.     //@@ Free the GPU memory here
  203.     clReleaseMemObject(deviceA);
  204.     clReleaseMemObject(deviceB);
  205.     clReleaseMemObject(deviceC);
  206.     /*
  207.     for(int i=0; i<numCRows; i++){
  208.         for(int j=0; j<numCColumns; j++){
  209.             fout << hostC[i*numCColumns+j] << " ";
  210.         }
  211.         fout << "\n";
  212.     }
  213.     */
  214.     free(hostA);
  215.     free(hostB);
  216.     free(hostC);
  217.  
  218.     return 0;
  219. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement