// A Maz

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>

#include <CL/cl.h>

using namespace std;

#define TILE_WIDTH 16
#define PROGRAM_FILE "kernel.cl"

/*
 * Fill the first `size` entries of `data` with pseudo-random values obtained
 * by dividing rand() by RAND_MAX. Nothing is written when `size` is zero or
 * negative. The generator is not seeded here.
 */
void randomFloats(float* data, int size){
    int idx = 0;
    while(idx < size) {
        const int sample = rand();
        data[idx] = sample / (float)RAND_MAX;
        ++idx;
    }
}

/*
 * Pick the OpenCL device used by the program: one GPU device from the first
 * platform reported by the runtime.
 *
 * Returns the device id. On any failure a message containing the OpenCL
 * status code is written to stderr and the process exits with status 1.
 */
cl_device_id create_device() {
    cl_platform_id platform;
    cl_device_id dev;
    cl_int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if(err != CL_SUCCESS) {
        /* OpenCL does not set errno, so report its own status code
           instead of going through perror(). */
        fprintf(stderr, "Couldn't identify a platform (OpenCL error %d)\n", (int)err);
        exit(1);
    }

    /* Access a device. Only GPU devices are requested; there is no fallback
       to CL_DEVICE_TYPE_CPU when no GPU is found. */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if(err != CL_SUCCESS) {
        fprintf(stderr, "Couldn't access any devices (OpenCL error %d)\n", (int)err);
        exit(1);
    }

    return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if(program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
                                        (const char**)&program_buffer, &program_size, &err);
    if(err < 0) {
        perror("Couldn't create the program");
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
    if(err < 0) {

        /* Find size of log and print to std output */
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size);
        program_log = (char*) malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                              log_size + 1, program_log, NULL);
        printf("%d: %s\n", err, program_log);
        free(program_log);
        exit(1);
    }
    size_t nbread;
    int nb_devices;
    err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(size_t), &nb_devices, &nbread);// Return 1 devices
    size_t *np = new size_t[nb_devices];//Create size array
    err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nb_devices, np, &nbread);//Load in np the size of my binary
    char** bn = new char* [nb_devices]; //Create the binary array
    for(int i =0; i < nb_devices;i++)  bn[i] = new char[np[i]]; // I know... it's bad... but if i use new char[np[i]], i have a segfault... :/
    err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *)*nb_devices, bn, &nbread); //Load the binary itself
    FILE *fp = fopen("binar.bin", "wb");
    fwrite(bn[0], sizeof(char), np[0], fp);
    fclose(fp);

    return program;
}

int main(int argc, char ** argv) {
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    cl_mem deviceA;
    cl_mem deviceB;
    cl_mem deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C
    int numCColumns; // number of columns in the matrix C
    ofstream fout("output.raw");

    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel matrixMultiplyShared;
    cl_command_queue queue;
    cl_int err;

     /* Create device and determine local size */
    device = create_device();

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device,
                                 CL_QUEUE_PROFILING_ENABLE, &err);
    if(err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    };

    /* Create kernels */
    matrixMultiplyShared = clCreateKernel(program, "matrixMultiplyShared", &err);
    if(err < 0) {
        perror("Couldn't create a kernel");
        exit(1);
    };

    numARows = numAColumns = 4096;
    numBRows = numBColumns = 4096;

    hostA = (float*)malloc(numARows*numAColumns * sizeof(float));
    hostB = (float*)malloc(numBRows*numBColumns * sizeof(float));

    randomFloats(hostA, numARows*numAColumns);
    randomFloats(hostB, numBRows*numBColumns);

    //@@ Set numCRows and numCColumns
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float*)malloc(numCColumns * numCRows * sizeof(float));


    //@@ Allocate GPU memory here
    deviceA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numARows * numAColumns * sizeof(float), hostA, &err);
    deviceB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, numBRows * numBColumns * sizeof(float), hostB, &err);
    deviceC = clCreateBuffer(context, CL_MEM_READ_WRITE, numCRows * numCColumns * sizeof(float), NULL, &err);

    //@@ Initialize the grid and block dimensions here
    err = clSetKernelArg(matrixMultiplyShared, 0, sizeof(cl_mem), &deviceA);
    err |= clSetKernelArg(matrixMultiplyShared, 1, sizeof(cl_mem), &deviceB);
    err |= clSetKernelArg(matrixMultiplyShared, 2, sizeof(cl_mem), &deviceC);
    err |= clSetKernelArg(matrixMultiplyShared, 3, sizeof(int), &numARows);
    err |= clSetKernelArg(matrixMultiplyShared, 4, sizeof(int), &numAColumns);
    err |= clSetKernelArg(matrixMultiplyShared, 5, sizeof(int), &numBRows);
    err |= clSetKernelArg(matrixMultiplyShared, 6, sizeof(int), &numBColumns);
    err |= clSetKernelArg(matrixMultiplyShared, 7, sizeof(int), &numCRows);
    err |= clSetKernelArg(matrixMultiplyShared, 8, sizeof(int), &numCColumns);

    size_t local_size[2] = {16, 16};
    size_t global_size[2] = {((numBColumns+local_size[0]-1)/local_size[0])*local_size[0], ((numARows+local_size[1]-1)/local_size[1])*local_size[1]};

    cl_event timeEvent;
    cl_ulong timeStart, timeEnd;
    //@@ Launch the GPU Kernel here
    err = clEnqueueNDRangeKernel(queue, matrixMultiplyShared, 2, NULL, global_size, local_size, 0, NULL, NULL);
    clFinish(queue);
    //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &timeStart, NULL);
    //clGetEventProfilingInfo(timeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &timeEnd, NULL);
    //printf("timeK1: %f\n",(double)(timeEnd - timeStart)/1e9);

    //@@ Copy the GPU memory back to the CPU here
    clEnqueueReadBuffer(queue, deviceC, CL_TRUE, 0,
                        numCRows*numCColumns*sizeof(float), hostC, 0, NULL, NULL);
    clFinish(queue);


    //@@ Free the GPU memory here
    clReleaseMemObject(deviceA);
    clReleaseMemObject(deviceB);
    clReleaseMemObject(deviceC);
    /*
    for(int i=0; i<numCRows; i++){
        for(int j=0; j<numCColumns; j++){
            fout << hostC[i*numCColumns+j] << " ";
        }
        fout << "\n";
    }
    */
    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}