Untitled

//R: vector-width configuration of the fetch kernel. The typedef and the FETCH,
//   FSHIFT and FD defines are meant to be switched together; the trailing
//   comments list the matching value for each vector width.
// ftype: element type of the kernel's accumulators and of dest/dest_twin.
typedef float4 ftype;
// FETCH: load built-in the kernel uses to read one ftype from the float source buffers.
#define FETCH vload4
// FSHIFT: left shift that turns a column index into an offset in floats.
#define FSHIFT 2 // 0 - scalar; 2 - float4; 3 - float8; 4 - float16
// FD: row stride of dest/dest_twin, counted in ftype elements.
#define FD (480) //(480<<2) - scalar/float; (480) - float4; (480>>1) - float8; (480>>2) - float16

// FARRAY_SIZE: number of ftype accumulators a work-item keeps, i.e. how many
// columns it processes per pass of the kernel's outer loop.
#define FARRAY_SIZE 10
/*
 * Gather-and-sum kernel operating on two source buffers ("twin") at once.
 *
 * Each work-item handles one output row, row = get_global_id(0) + offset.
 * It reads f_int[row] to learn how many source offsets belong to the row,
 * then, for every column c in [0, max_per_int), sums the ftype vectors found at
 *     src      + offsets[k*stride + row] + (c << FSHIFT)
 *     src_twin + offsets[k*stride + row] + (c << FSHIFT)
 * over k in [0, f_int[row]) and stores the two sums to
 *     dest[row*FD + c]  and  dest_twin[row*FD + c].
 * A row with a zero count gets zeros written to all of its columns.
 *
 * Columns are processed FARRAY_SIZE at a time so the partial sums stay in a
 * small per-work-item array. max_per_int does not have to be a multiple of
 * FARRAY_SIZE: columns at or beyond max_per_int are neither read nor written.
 *
 * src, src_twin  source buffers, addressed in units of float
 * offsets        per-row start offsets into src/src_twin (in floats); entry k
 *                of a row is stored at offsets[k*stride + row]
 * f_int          number of offsets entries to accumulate for each row
 * dest, dest_twin  outputs, FD ftype elements per row
 * offset         added to the global id to obtain the row index
 * stride         distance, in ints, between consecutive k entries in offsets
 * max_per_int    number of ftype columns to produce per row
 *
 * NOTE(review): there is no bounds guard on the row index; the launch size
 * plus offset is assumed to stay within f_int/offsets/dest - confirm on the
 * host side.
 */
__kernel void GPU_fetch_array_kernel_twin_1D_cl(__global float* src, __global float* src_twin,
											  __global int* offsets,  __global int* f_int,
											 __global ftype* dest, __global ftype* dest_twin,
		const uint offset, const uint stride, const uint max_per_int){
	const size_t threadID=get_global_id(0)+offset;
	const size_t n_per=f_int[threadID];
	ftype acc[FARRAY_SIZE], acc_twin[FARRAY_SIZE];

	for(size_t outer_loop=0;outer_loop<max_per_int;outer_loop+=FARRAY_SIZE){
		for(size_t j=0;j<FARRAY_SIZE;j++){
			acc[j]=0.f;
			acc_twin[j]=0.f;
		}

		for(size_t k=0;k<n_per;k++){
			const size_t l=offsets[k*stride+threadID];
			// Constant trip count keeps the loop unrollable; the guard only
			// masks the columns of a partial last chunk.
			for(size_t j=0;j<FARRAY_SIZE;j++){
				const size_t col=outer_loop+j;
				if(col<max_per_int){
					acc[j]+=FETCH(0,src+l+(col<<FSHIFT));
					acc_twin[j]+=FETCH(0,src_twin+l+(col<<FSHIFT));
				}
			}
		}

		for(size_t j=0;j<FARRAY_SIZE;j++){
			const size_t col=outer_loop+j;
			// Skip tail columns so a partial last chunk cannot write past
			// the requested max_per_int columns.
			if(col<max_per_int){
				const size_t index=threadID*FD+col;
				dest[index]=acc[j];
				dest_twin[index]=acc_twin[j];
			}
		}
	}
}