int exec_kernel(int ngrid,int natom,int ngadj,int naadj,
float *ax,float *ay,float *az,
float *gx,float *gy,float *gz,
float *charge,float *size,float xkappa,
float pre1,float *val,int itmax,const char * filename){
cl_context context;
cl_command_queue cmd_queue;
cl_device_id devices[2];
cl_uint count;
cl_uint device_index;
cl_int err;
// Connect to a compute device
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 2, devices, &count);
printf("Found %d GPU(s)...\n", count
);
// for (int i=0; i < count; i++) { device_stats(devices[i]); }
// choose the secondary gpu if there is one; the primary gpu is likely being used for the display
device_index = (count > 1)? 1 : 0;
size_t returned_size = 0;
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
err = clGetDeviceInfo(devices[device_index], CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
err|= clGetDeviceInfo(devices[device_index], CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
printf("Connecting to %s %s...\n", vendor_name
, device_name
);
// device_stats(devices[device_index]);
// Read the program
printf("Loading program '%s'\n\n", filename
);
char *program_source = load_program_source(filename);
//Create the context and command queue
context = clCreateContext(0, 1, &devices[device_index], NULL, NULL, &err);
cmd_queue = clCreateCommandQueue(context, devices[device_index], 0, NULL);
//Allocate memory for programs and kernels
cl_program program;
cl_kernel kernel;
//Create program from .cl file
program = clCreateProgramWithSource(context,1, (const char**)&program_source, NULL, &err);
// build the program (compile it)
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
char build[2048];
clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, 2048, build, NULL);
printf("Build Log:\n%s\n",build
);
// create the kernel
kernel = clCreateKernel(program, "mdh", &err);
uint64_t mbeg, mend;
double cl_alloc, cl_enqueue, cl_read;
size_t atom_buffer_size = sizeof(float) * natom;
size_t grid_buffer_size = sizeof(float) * ngrid;
size_t gadj_buffer_size = sizeof(float) * ngadj;
mbeg = mach_absolute_time();
//Allocate memory and queue it to be written to the device
cl_mem ax_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, atom_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, ax_mem, CL_TRUE, 0, atom_buffer_size, (void*)ax, 0, NULL, NULL);
cl_mem ay_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, atom_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, ay_mem, CL_TRUE, 0, atom_buffer_size, (void*)ay, 0, NULL, NULL);
cl_mem az_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, atom_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, az_mem, CL_TRUE, 0, atom_buffer_size, (void*)az, 0, NULL, NULL);
cl_mem charge_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, atom_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, charge_mem, CL_TRUE, 0, atom_buffer_size, (void*)charge, 0, NULL, NULL);
cl_mem size_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, atom_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, size_mem, CL_TRUE, 0, atom_buffer_size, (void*)size, 0, NULL, NULL);
cl_mem gx_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, gadj_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, gx_mem, CL_TRUE, 0, gadj_buffer_size, (void*)gx, 0, NULL, NULL);
cl_mem gy_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, gadj_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, gy_mem, CL_TRUE, 0, gadj_buffer_size, (void*)gy, 0, NULL, NULL);
cl_mem gz_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, gadj_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, gz_mem, CL_TRUE, 0, gadj_buffer_size, (void*)gz, 0, NULL, NULL);
cl_mem val_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, gadj_buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, val_mem, CL_TRUE, 0, gadj_buffer_size, (void*)val, 0, NULL, NULL);
//Push the data out to device
clFinish(cmd_queue);
mend = mach_absolute_time();
cl_alloc = machcore(mend, mbeg);
// set work-item dimensions
size_t global_work_size, local_work_size, shared_size;
global_work_size = ngadj;
local_work_size = 64;
shared_size = (5 * local_work_size) * sizeof(float);
//Set kernel arguments
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ax_mem);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &ay_mem);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &az_mem);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charge_mem);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &size_mem);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &gx_mem);
err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &gy_mem);
err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &gz_mem);
err |= clSetKernelArg(kernel, 8, sizeof(float), &pre1);
err |= clSetKernelArg(kernel, 9, sizeof(float), &xkappa);
err |= clSetKernelArg(kernel, 10, sizeof(cl_mem), &val_mem);
err |= clSetKernelArg(kernel, 11, sizeof(int), &natom);
err |= clSetKernelArg(kernel, 12, shared_size, NULL);
size_t thread_size;
clGetKernelWorkGroupInfo(kernel,devices[device_index],CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t),&thread_size,NULL);
printf("Recommended Size: %lu\n",thread_size
);
mbeg = mach_absolute_time();
//Queue up the kernels itmax times
for(int i=0;i<itmax;i++)
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
//Finish the calculation
clFinish(cmd_queue);
mend = mach_absolute_time();
cl_enqueue = machcore(mend, mbeg);
mbeg = mach_absolute_time();
// read output image
err = clEnqueueReadBuffer(cmd_queue, val_mem, CL_TRUE, 0, grid_buffer_size, val, 0, NULL, NULL);
clFinish(cmd_queue);
mend = mach_absolute_time();
cl_read = machcore(mend, mbeg);
printf("Allocation: %1.12g Enqueue: %1.12g Read: %1.12g\n",cl_alloc
,cl_enqueue
,cl_read
);
print_total(val, ngrid);
// release kernel, program, and memory objects
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
clReleaseMemObject(ax_mem);
clReleaseMemObject(ay_mem);
clReleaseMemObject(az_mem);
clReleaseMemObject(charge_mem);
clReleaseMemObject(size_mem);
clReleaseMemObject(gx_mem);
clReleaseMemObject(gy_mem);
clReleaseMemObject(gz_mem);
clReleaseMemObject(val_mem);
return CL_SUCCESS;
}