Advertisement
Guest User

failed kernel

a guest
Mar 25th, 2015
286
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.65 KB | None | 0 0
  1. __kernel __attribute__((vec_type_hint(float4)))
  2. void PC_find_triplets_avg_kernel_HD5_cl(int ul_FftLength, int len_power, float triplet_thresh_base, int AdvanceBy, int PoTLen,
  3.                                       __global float4* PoT,__global uint* result_flag,__global float4* PulsePoT_average,
  4.                                       __local float4* tmp) {
  5. //R: this one uses local memory to increase number of separate workitems hence number of workgroups for better load of
  6. // multi-CU devices
  7.                                          
  8. //R: difference from original kernel: this one just doing fast precheck and relies on CPU for real triplet analysis
  9. //R: this kernel does 4 PoT chunks at once.
  10. //R: cause workitems can write flag in arbitrary order it's not possible to set error code actually (original CUDA code
  11. // missed this fact and tries to return error code from kernel. That is, different variable should be used for setting
  12. // state flags.
  13.     int ul_PoT = get_global_id(0);//R: 4 PoTs at once!
  14.     int y = get_global_id(1);//R: index of offset chunk
  15.     int tid=get_local_id(2);
  16.     int fft_len4=ul_FftLength>>2;
  17.     int TOffset = y * AdvanceBy;
  18. //R: each wave of third index works on single PoT array
  19. //  local float4 local_sum[64/*can be get_local_size(2) if variable length allowed*/];
  20.     if(TOffset + len_power > PoTLen) {     
  21.         TOffset = PoTLen - len_power;
  22.     }
  23.     __global float4* fp_PulsePot= PoT + ul_PoT + TOffset * (fft_len4);
  24.     // Clear the result array
  25.     //int4 numBinsAboveThreshold_private=(int4)0;
  26.     float4 tmp_private=(float4)0.f,triplet_thresh=(float4)triplet_thresh_base,pp;
  27.     __local float4* tmp_local=tmp+get_local_size(2)*get_local_id(0);
  28.  
  29.     /* Get all the bins that are above the threshold, and find the power array mean value */
  30.     for( int i=tid;i<len_power;i+=get_local_size(2)/*can be get_local_size(2) if variable length allowed*/ ) {
  31.         tmp_private += fp_PulsePot[i*fft_len4];
  32.     }
  33.     //R: here can be one of new reduce operations but this will require higher CL version
  34.     tmp_local[tid]=tmp_private;
  35.     for(int i=(get_local_size(2)>>1); i>0;i>>=1){
  36.         barrier(CLK_LOCAL_MEM_FENCE);
  37.         if(tid<i){
  38.             tmp_local[tid]+=tmp_local[tid+i];
  39.         }
  40.     }
  41.     barrier(CLK_LOCAL_MEM_FENCE);
  42.     if(tid==0){
  43.         tmp_private=tmp_local[0];
  44.         tmp_private/= (float4)len_power;
  45.         PulsePoT_average[ul_PoT+y*fft_len4]=tmp_private;//R: this avg will be needed later, at pulse finding
  46.         tmp_local[0]=tmp_private;//R: to  share with other threads
  47.     }
  48.     barrier(CLK_LOCAL_MEM_FENCE);
  49.     tmp_private=tmp_local[0];//R: broadcast reduced value to all threads for further use
  50.     triplet_thresh*=tmp_private;
  51.     tmp_private=(float4)0.f;
  52.  
  53.     for( int i=tid;i<len_power;i+=get_local_size(2)) {
  54.         pp= fp_PulsePot[i*fft_len4];
  55.         if(  pp.x>= triplet_thresh.x ) {
  56.             tmp_private.x+=1.f;
  57.             printf("X BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
  58.                 get_global_id(0),get_global_id(1),get_global_id(2),
  59.             get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
  60.             printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
  61.         }
  62.         if(  pp.y>= triplet_thresh.y ) {
  63.             tmp_private.y+=1.f;
  64.             printf("Y BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
  65.                 get_global_id(0),get_global_id(1),get_global_id(2),
  66.                 get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
  67.             printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
  68.         }
  69.         if(  pp.z>= triplet_thresh.z ) {
  70.             tmp_private.z+=1.f;
  71.             printf("Z BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
  72.                 get_global_id(0),get_global_id(1),get_global_id(2),
  73.                 get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
  74.             printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
  75.         }
  76.         if(  pp.w>= triplet_thresh.w ) {
  77.             tmp_private.w+=1.f;
  78.             printf("W BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
  79.                 get_global_id(0),get_global_id(1),get_global_id(2),
  80.                 get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
  81.             printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
  82.         }
  83.     }
  84. //R: again need to reduce values
  85.     tmp_local[tid]=tmp_private;
  86.     for(int i=(get_local_size(2)>>1)/*can be get_local_size(2) if variable length allowed*/; i>0;i>>=1){
  87.         barrier(CLK_LOCAL_MEM_FENCE);
  88.         if(tid<i){
  89.             tmp_local[tid]+=tmp_local[tid+i];
  90.         }
  91.     }
  92.     barrier(CLK_LOCAL_MEM_FENCE);
  93.   if(tid==0){
  94.         tmp_private=tmp_local[0];
  95.     if(tmp_private.x>2.f || tmp_private.y>2.f || tmp_private.z>2.f || tmp_private.w>2.f){
  96. //R: global size is power of 2 so it should be safe to perform integer division here
  97. //      printf("Resulting numbers of peaks: (%v4g)\n",tmp_private);
  98.         int result_coordinate=(get_global_size(0)>RESULT_SIZE)?
  99.             ((RESULT_SIZE*get_global_id(0))/get_global_size(0)):get_global_id(0);
  100.         result_flag[result_coordinate]=1;
  101.     }
  102.   }
  103. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement