Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- __kernel __attribute__((vec_type_hint(float4)))
- void PC_find_triplets_avg_kernel_HD5_cl(int ul_FftLength, int len_power, float triplet_thresh_base, int AdvanceBy, int PoTLen,
- __global float4* PoT,__global uint* result_flag,__global float4* PulsePoT_average,
- __local float4* tmp) {
- //R: this one uses local memory to increase number of separate workitems hence number of workgroups for better load of
- // multi-CU devices
- //R: difference from original kernel: this one just doing fast precheck and relies on CPU for real triplet analysis
- //R: this kernel does 4 PoT chunks at once.
- //R: cause workitems can write flag in arbitrary order it's not possible to set error code actually (original CUDA code
- // missed this fact and tries to return error code from kernel. That is, different variable should be used for setting
- // state flags.
- int ul_PoT = get_global_id(0);//R: 4 PoTs at once!
- int y = get_global_id(1);//R: index of offset chunk
- int tid=get_local_id(2);
- int fft_len4=ul_FftLength>>2;
- int TOffset = y * AdvanceBy;
- //R: each wave of third index works on single PoT array
- // local float4 local_sum[64/*can be get_local_size(2) if variable length allowed*/];
- if(TOffset + len_power > PoTLen) {
- TOffset = PoTLen - len_power;
- }
- __global float4* fp_PulsePot= PoT + ul_PoT + TOffset * (fft_len4);
- // Clear the result array
- //int4 numBinsAboveThreshold_private=(int4)0;
- float4 tmp_private=(float4)0.f,triplet_thresh=(float4)triplet_thresh_base,pp;
- __local float4* tmp_local=tmp+get_local_size(2)*get_local_id(0);
- /* Get all the bins that are above the threshold, and find the power array mean value */
- for( int i=tid;i<len_power;i+=get_local_size(2)/*can be get_local_size(2) if variable length allowed*/ ) {
- tmp_private += fp_PulsePot[i*fft_len4];
- }
- //R: here can be one of new reduce operations but this will require higher CL version
- tmp_local[tid]=tmp_private;
- for(int i=(get_local_size(2)>>1); i>0;i>>=1){
- barrier(CLK_LOCAL_MEM_FENCE);
- if(tid<i){
- tmp_local[tid]+=tmp_local[tid+i];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(tid==0){
- tmp_private=tmp_local[0];
- tmp_private/= (float4)len_power;
- PulsePoT_average[ul_PoT+y*fft_len4]=tmp_private;//R: this avg will be needed later, at pulse finding
- tmp_local[0]=tmp_private;//R: to share with other threads
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- tmp_private=tmp_local[0];//R: broadcast reduced value to all threads for further use
- triplet_thresh*=tmp_private;
- tmp_private=(float4)0.f;
- for( int i=tid;i<len_power;i+=get_local_size(2)) {
- pp= fp_PulsePot[i*fft_len4];
- if( pp.x>= triplet_thresh.x ) {
- tmp_private.x+=1.f;
- printf("X BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
- get_global_id(0),get_global_id(1),get_global_id(2),
- get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
- printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
- }
- if( pp.y>= triplet_thresh.y ) {
- tmp_private.y+=1.f;
- printf("Y BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
- get_global_id(0),get_global_id(1),get_global_id(2),
- get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
- printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
- }
- if( pp.z>= triplet_thresh.z ) {
- tmp_private.z+=1.f;
- printf("Z BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
- get_global_id(0),get_global_id(1),get_global_id(2),
- get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
- printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
- }
- if( pp.w>= triplet_thresh.w ) {
- tmp_private.w+=1.f;
- printf("W BRANCH: global:(%d,%d,%d); local:(%d,%d,%d); tmp_private:(%v4g)\n",
- get_global_id(0),get_global_id(1),get_global_id(2),
- get_local_id(0),get_local_id(1),get_local_id(2),tmp_private);
- printf("pp:(%v4g); triplet_thresh:(%v4g)\n",pp,triplet_thresh);
- }
- }
- //R: again need to reduce values
- tmp_local[tid]=tmp_private;
- for(int i=(get_local_size(2)>>1)/*can be get_local_size(2) if variable length allowed*/; i>0;i>>=1){
- barrier(CLK_LOCAL_MEM_FENCE);
- if(tid<i){
- tmp_local[tid]+=tmp_local[tid+i];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(tid==0){
- tmp_private=tmp_local[0];
- if(tmp_private.x>2.f || tmp_private.y>2.f || tmp_private.z>2.f || tmp_private.w>2.f){
- //R: global size is power of 2 so it should be safe to perform integer division here
- // printf("Resulting numbers of peaks: (%v4g)\n",tmp_private);
- int result_coordinate=(get_global_size(0)>RESULT_SIZE)?
- ((RESULT_SIZE*get_global_id(0))/get_global_size(0)):get_global_id(0);
- result_flag[result_coordinate]=1;
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement