/*
 * LinearMetricAddition
 *
 * For each work-item (i, j, z) of the 3D NDRange:
 *   1. reads the first channel of `target` at the integer voxel (i, j, z);
 *   2. maps (i, j, z) through the affine transform stored in `matrix`
 *      and reads the first channel of `source` at the resulting
 *      floating-point position;
 *   3. stores the sum of both values in `_bins`.
 *
 * Parameters:
 *   target        - image sampled at the untransformed integer coordinates.
 *   source        - image sampled at the transformed coordinates.
 *   _bins         - output buffer, one int per work-item, laid out x-fastest
 *                   using the global NDRange size as row/slice strides. It
 *                   must hold at least
 *                   get_global_size(0) * get_global_size(1) * get_global_size(2)
 *                   elements.
 *   volumeSampler - sampler used for both image reads.
 *                   NOTE(review): it is used with integer and with float
 *                   coordinates and with read_imagei, so it presumably has to
 *                   be an unnormalized-coordinate, nearest-filter sampler -
 *                   confirm against the host code.
 *   matrix        - at least 12 floats, read as three rows of four
 *                   coefficients (row-major 3x4 affine transform). A fourth
 *                   row, if present, is not read.
 *
 * Changes compared to the previous revision:
 *   - removed the stray trailing comma in the parameter list, which prevented
 *     the kernel from compiling;
 *   - the output strides were hard-coded to 64, which made different
 *     work-items write the same slot whenever the x or y global size exceeded
 *     64. The strides are now taken from the launch configuration; for a
 *     launch that is 64 wide and 64 high the indices are unchanged.
 */
__kernel void LinearMetricAddition(__read_only image3d_t target,
                                   __read_only image3d_t source,
                                   __global int* _bins,
                                   sampler_t volumeSampler,
                                   __global float* matrix)
{
    /* Position of this work-item in the global NDRange. */
    const int i = (int)get_global_id(0);
    const int j = (int)get_global_id(1);
    const int z = (int)get_global_id(2);

    /* Row and slice strides of the output buffer, from the launch size. */
    const size_t width  = get_global_size(0);
    const size_t height = get_global_size(1);

    const int4 coords = (int4)(i, j, z, 1);

    /* Perform the matrix transformation on the current voxel
     * (homogeneous coordinate implicitly 1). */
    const float _x = matrix[0 + 0] * i + matrix[0 + 1] * j + matrix[0 + 2] * z + matrix[0 + 3];
    const float _y = matrix[4 + 0] * i + matrix[4 + 1] * j + matrix[4 + 2] * z + matrix[4 + 3];
    const float _z = matrix[8 + 0] * i + matrix[8 + 1] * j + matrix[8 + 2] * z + matrix[8 + 3];

    const float4 transCoords = (float4)(_x, _y, _z, 1.0f);

    /* Only the first channel of each image is used. */
    const int res = read_imagei(source, volumeSampler, transCoords).x;

    /* x-fastest linear index, computed in size_t so large volumes do not
     * overflow int. The original author's note says the kernel is launched
     * with a work-group size of (64, 1, 1) so that consecutive work-items
     * write consecutive elements of _bins - that is a host-side setting and
     * cannot be verified from here. */
    const size_t idx = (size_t)i + (size_t)j * width + (size_t)z * width * height;

    _bins[idx] = read_imagei(target, volumeSampler, coords).x + res;
}