Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- bool sort(Context * c,int targetDevice,int n,cl_mem in,cl_mem out) const
- { /*
-   * Enqueues a multi-pass sort of n data_t items on targetDevice through Context c,
-   * using the PARALLEL_BITONIC_xxx kernels selected below.
-   *
-   * c            - context object; every copy, barrier and kernel launch goes through it.
-   * targetDevice - device index forwarded unchanged to each Context call.
-   * n            - item count; sizes the copy (n*sizeof(data_t)) and the thread counts (n >> k).
-   * in, out      - cl_mem buffers. One enqueueCopy involving both is issued up front
-   *                (presumably in -> out; confirm against Context::enqueueCopy), after
-   *                which only out is passed to the kernels.
-   *
-   * Outline: for length = 1, 2, 4, ... (while length < n) a list of step codes is
-   * planned, then one kernel is enqueued per code, each followed by a barrier, until
-   * inc has been shifted down to 0 or a -1 code ends the pass early.
-   *
-   * NOTE(review): the visible text stops before this function's closing brace, so
-   * the bool return value is not shown here.
-   */
- c->enqueueCopy(targetDevice,in,out,0,0,n*sizeof(data_t),EventVector()); // whole-buffer copy, both offsets 0, empty event list
- c->enqueueBarrier(targetDevice); // sync: barrier enqueued right after the copy
- cl_mem buffers[2]; // NOTE(review): filled in below but never read in the visible code
- buffers[0] = in;
- buffers[1] = out;
- for (int length=1;length<n;length<<=1) // one pass per length; length doubles each time
- {
- int inc = length; // starting INC of this pass; shifted down as kernels are enqueued
- std::list<int> strategy; // list of step codes for this pass: bits handled per launch, or -1
- {
- int ii = inc; // scratch copy so planning does not disturb inc
- while (ii>0)
- {
- if (ii==128 || ii==32 || ii==8) { strategy.push_back(-1); break; } // C kernel takes over from here; planning stops
- int d = 1; // default is 1 bit
- if (0) d = 1; // never taken; gives the else-if chain a head so the #if block can be switched off
- #if 1
- // Force jump to 128: each shift chosen here turns ii into 128, which the check at the top of this loop hands to the C kernel
- else if (ii==256) d = 1; // 256 >> 1 == 128
- else if (ii==512 && (ALLOWB & 4)) d = 2; // 512 >> 2 == 128, only if ALLOWB has mask 4 set
- else if (ii==1024 && (ALLOWB & 8)) d = 3; // 1024 >> 3 == 128, only if ALLOWB has mask 8 set
- else if (ii==2048 && (ALLOWB & 16)) d = 4; // 2048 >> 4 == 128, only if ALLOWB has mask 16 set
- #endif
- else if (ii>=8 && (ALLOWB & 16)) d = 4; // general case: widest step that ii and ALLOWB (defined outside this view) permit
- else if (ii>=4 && (ALLOWB & 8)) d = 3;
- else if (ii>=2 && (ALLOWB & 4)) d = 2;
- else d = 1; // fallback: single-bit step
- strategy.push_back(d);
- ii >>= d; // account for the d bits this step handles
- }
- }
- while (inc > 0) // run the planned steps in order
- {
- int ninc = 0; // amount inc is shifted by after this launch; -1 means the pass is finished
- int kid = -1; // kernel id, chosen by the switch
- int doLocal = 0; // when > 0, number of data_t values of local memory requested per thread
- int nThreads = 0; // global work size handed to enqueueKernel
- int d = strategy.front(); strategy.pop_front(); // take the next planned step code
- switch (d)
- {
- case -1: // the only case that sets doLocal and ends the pass
- kid = PARALLEL_BITONIC_C4_KERNEL;
- ninc = -1; // reduce all bits
- doLocal = 4; // sizes the local argument pushed below as 4*wg*sizeof(data_t)
- nThreads = n >> 2; // n/4 threads
- break;
- case 4: // cases 4..1 differ only in kernel id and shift amount
- kid = PARALLEL_BITONIC_B16_KERNEL;
- ninc = 4;
- nThreads = n >> ninc; // n/16 threads
- break;
- case 3:
- kid = PARALLEL_BITONIC_B8_KERNEL;
- ninc = 3;
- nThreads = n >> ninc; // n/8 threads
- break;
- case 2:
- kid = PARALLEL_BITONIC_B4_KERNEL;
- ninc = 2;
- nThreads = n >> ninc; // n/4 threads
- break;
- case 1:
- kid = PARALLEL_BITONIC_B2_KERNEL;
- ninc = 1;
- nThreads = n >> ninc; // n/2 threads
- break;
- default: // NOTE(review): kid stays -1, ninc and nThreads stay 0, yet execution continues into the launch code below
- printf("Strategy error!\n");
- break;
- }
- int wg = c->getMaxWorkgroupSize(targetDevice,kid); // start from the limit Context reports for this kernel on this device
- wg = std::min(wg,256); // cap at 256
- wg = std::min(wg,nThreads); // and never more than the total thread count
- c->clearArgs(kid); // reset, then push the kernel arguments in order
- c->pushArg(kid,out); // the kernels receive only the out buffer
- c->pushArg(kid,inc); // INC passed to kernel
- c->pushArg(kid,length<<1); // DIR passed to kernel
- if (doLocal>0) c->pushLocalArg(kid,doLocal*wg*sizeof(data_t)); // DOLOCAL values / thread
- c->enqueueKernel(targetDevice,kid,nThreads,1,wg,1,EventVector()); // sizes passed as nThreads,1 and wg,1; empty event list
- c->enqueueBarrier(targetDevice); // sync
- // if (mLastN != n) printf("LENGTH=%d INC=%d KID=%d\n",length,inc,kid); // DEBUG
- if (ninc < 0) break; // done: the -1 step ends this pass
- inc >>= ninc; // drop the bits just handled; the loop ends when inc reaches 0
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement