Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Vector-width configuration of the fetch kernel.
// ftype, FETCH, FSHIFT and FD all describe the same vector width and have to
// be switched together when a different width is selected.

// Vector type used for the accumulators and for the destination buffers.
typedef float4 ftype;

// Vector load routine matching ftype.
#define FETCH vload4

// Shift applied to a column index to turn it into an offset in floats:
// 0 - scalar; 2 - float4; 3 - float8; 4 - float16
#define FSHIFT 2

// Number of ftype elements in one destination row:
// (480<<2) - scalar/float; (480) - float4; (480>>1) - float8; (480>>2) - float16
#define FD (480)

// Number of columns a work-item accumulates in one pass of its outer loop.
#define FARRAY_SIZE 10
/*
 * Sums vectors fetched from two source arrays ("src" and its "twin") and
 * stores the sums into one destination row per work-item.
 *
 * Each work-item handles row = get_global_id(0) + offset and walks the columns
 * [0, max_per_int) in chunks of FARRAY_SIZE. For every column of a chunk it
 * adds up f_int[row] vectors, one per fetch k, loaded with FETCH from
 *     src      + offsets[k * stride + row] + (column << FSHIFT)
 *     src_twin + offsets[k * stride + row] + (column << FSHIFT)
 * and writes the two sums to dest[row * FD + column] and
 * dest_twin[row * FD + column].
 *
 * Parameters:
 *   src, src_twin    float arrays the vectors are loaded from
 *   offsets          per-fetch base offsets (in floats) into src / src_twin;
 *                    entry k of a row is stored at offsets[k * stride + row]
 *   f_int            number of fetches to accumulate for each row
 *   dest, dest_twin  output arrays, FD ftype elements per row
 *   offset           value added to the global id to obtain the row
 *   stride           distance in the offsets array between consecutive
 *                    fetches of the same row
 *   max_per_int      upper bound of the column loop
 *
 * NOTE(review): row is never checked against any buffer size; the launch
 * range is presumably sized by the host so that every row is valid - confirm.
 * NOTE(review): a chunk always processes FARRAY_SIZE columns, so when
 * max_per_int is not a multiple of FARRAY_SIZE the last chunk reads and
 * writes columns beyond max_per_int - confirm the host accounts for this.
 */
__kernel void GPU_fetch_array_kernel_twin_1D_cl(__global float* src, __global float* src_twin,
                                                __global int* offsets, __global int* f_int,
                                                __global ftype* dest, __global ftype* dest_twin,
                                                const uint offset, const uint stride, const uint max_per_int)
{
    const size_t row = get_global_id(0) + offset;
    const size_t fetch_count = f_int[row];

    ftype sum[FARRAY_SIZE];
    ftype sum_twin[FARRAY_SIZE];

    for (size_t chunk = 0; chunk < max_per_int; chunk += FARRAY_SIZE) {
        // Start every chunk from zeroed accumulators.
        for (size_t slot = 0; slot < FARRAY_SIZE; slot++) {
            sum[slot] = 0.f;
            sum_twin[slot] = 0.f;
        }

        // Accumulate fetch by fetch; the same base offset is used for both
        // source arrays.
        for (size_t fetch = 0; fetch < fetch_count; fetch++) {
            const size_t base = offsets[fetch * stride + row];

            for (size_t slot = 0; slot < FARRAY_SIZE; slot++) {
                // Column index converted to an offset in floats.
                const size_t shift = (chunk + slot) << FSHIFT;

                sum[slot] += FETCH(0, src + base + shift);
                sum_twin[slot] += FETCH(0, src_twin + base + shift);
            }
        }

        // Store the finished chunk into this work-item's destination row.
        for (size_t slot = 0; slot < FARRAY_SIZE; slot++) {
            const size_t out = row * FD + (chunk + slot);

            dest[out] = sum[slot];
            dest_twin[out] = sum_twin[slot];
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement