Advertisement
Guest User

Untitled

a guest
Apr 12th, 2014
69
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. //R: these 3 defines set vector size of fetch kernel
  2. typedef float4 ftype;
  3. #define FETCH vload4
  4. #define FSHIFT 2 // 0 - scalar; 2 - float4; 3 - float 8; 4 - float16
  5. #define FD (480) //(480<<2) - scalar/float; (480) - float4; (480>>1) - float8; (480>>2) - float16
  6.  
  7. #define FARRAY_SIZE 10
  8. __kernel void GPU_fetch_array_kernel_twin_1D_cl(__global float* src, __global float* src_twin,
  9. __global int* offsets, __global int* f_int,
  10. __global ftype* dest, __global ftype* dest_twin,
  11. const uint offset, const uint stride, const uint max_per_int){
  12. size_t threadID=get_global_id(0)+offset;
  13. // size_t j=get_global_id(1);
  14. // if(j>=fd)return; //R:matrix bounds guard
  15. size_t l;
  16. ftype acc[FARRAY_SIZE], acc_twin[FARRAY_SIZE];
  17. //__local src_cache[1024];
  18. //__local src_twin_cache[1024];
  19.  
  20. size_t n_per=f_int[threadID];
  21. for(size_t outer_loop=0;outer_loop<max_per_int;outer_loop+=FARRAY_SIZE){
  22. for(size_t j=0;j<FARRAY_SIZE;j++){acc[j]=0.f;acc_twin[j]=0.f;}
  23. for(size_t k=0;k<n_per;k++){
  24. l=offsets[k*stride+threadID];
  25. for(size_t j=0;j<FARRAY_SIZE;j++){
  26. __global float* s=src+l+((outer_loop+j)<<FSHIFT);
  27. acc[j]+=FETCH(0,s);
  28. s=src_twin+l+((outer_loop+j)<<FSHIFT);
  29. acc_twin[j]+=FETCH(0,s);
  30. }
  31. }
  32. for(size_t j=0;j<FARRAY_SIZE;j++){
  33. size_t index=threadID*FD+(outer_loop+j);
  34. dest[index]=acc[j];
  35. dest_twin[index]=acc_twin[j];
  36. }
  37. }
  38.  
  39. }
Advertisement
RAW Paste Data Copied
Advertisement