/*
 * LinearMetricAddition
 *
 * For the work-item at global id (i, j, z):
 *   1. reads the integer texel of `target` at (i, j, z);
 *   2. maps (i, j, z) through the first three rows of the row-major
 *      matrix stored in `matrix` (12 floats are read: indices 0..11);
 *   3. reads the integer texel of `source` at the mapped position;
 *   4. stores the sum of both texels' x components in `_bins`.
 *
 * Parameters:
 *   target        - image sampled at the untransformed integer coordinates.
 *   source        - image sampled at the transformed float coordinates.
 *   _bins         - output buffer, indexed as i + j*64 + z*64*64.
 *   volumeSampler - sampler used for both image reads.
 *                   NOTE(review): the same sampler is used with integer and
 *                   float coordinates; confirm the host creates it with
 *                   settings valid for both (e.g. unnormalized coordinates,
 *                   nearest filtering).
 *   matrix        - transformation coefficients; at least 12 floats.
 *
 * NOTE(review): the output index hard-codes a row/slice stride of 64. This
 * presumably relies on the global size being 64 in x and y -- verify against
 * the host code, since larger sizes would make work-items write to the same
 * slot.
 */
__kernel void LinearMetricAddition(__read_only image3d_t target,
                                   __read_only image3d_t source,
                                   __global int* _bins,
                                   sampler_t volumeSampler,
                                   __global float* matrix)
{
    /* Stride used to linearize (i, j, z) into the output buffer. */
    const int VOLUME_STRIDE = 64;

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int z = get_global_id(2);

    /* Untransformed position; the w component is not used for 3D reads. */
    const int4 coords = (int4)(i, j, z, 1);

    /* Apply the affine transform (rows 0..2 of the matrix) to this voxel. */
    const float tx = matrix[0 + 0] * i + matrix[0 + 1] * j + matrix[0 + 2] * z + matrix[0 + 3];
    const float ty = matrix[4 + 0] * i + matrix[4 + 1] * j + matrix[4 + 2] * z + matrix[4 + 3];
    const float tz = matrix[8 + 0] * i + matrix[8 + 1] * j + matrix[8 + 2] * z + matrix[8 + 3];
    const float4 transCoords = (float4)(tx, ty, tz, 1.0f);

    const int sourceValue = read_imagei(source, volumeSampler, transCoords).x;
    const int targetValue = read_imagei(target, volumeSampler, coords).x;

    /*
     * Consecutive i values map to consecutive _bins elements; the original
     * author notes a 64x1x1 work-group size is intended so that these
     * writes coalesce.
     */
    _bins[i + j * VOLUME_STRIDE + z * VOLUME_STRIDE * VOLUME_STRIDE] = targetValue + sourceValue;
}