sosiris

collatz glide kernel with reduction

Jul 31st, 2014
//This kernel file should define LUTSTEP (see the example build options below)
//Count Collatz glide steps without actually comparing the value to its original (a log2-based score is tracked instead). With reduction within the work group.
//assuming SIEVESTEP = 32
//-cl-finite-math-only and -cl-no-signed-zeros may be used for faster floating-point calculations

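/* Example of how the host might build this file (not part of the original kernel;
   the value 20 is only an assumption and must match the 2^LUTSTEP length of the
   table[] and negNadir[] buffers passed to collatzGlide()):

     const char *opts = "-DLUTSTEP=20 -cl-finite-math-only -cl-no-signed-zeros";
     clBuildProgram(program, 1, &device, opts, NULL, NULL);
*/
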
//constants
#define MAXSTEP 0xfffffffu //cap on counted glide steps per number (2^28 - 1)
#define BITDIFF (32-LUTSTEP)
#define TABLEMASK ((1u << LUTSTEP) - 1)
#define ODDSCORE 1.5849625f //approximately log2(3)

//Table of 3^N (21 entries, so LUTSTEP can be at most 20)
__constant uint power3[21] =
{1u,3u,9u,27u,81u,243u,729u,2187u,6561u,19683u,
59049u,177147u,531441u,1594323u,4782969u,14348907u,43046721u,129140163u,387420489u,1162261467u,
3486784401u
};

//mul64() : multiplies two 32-bit uints and returns the full 64-bit product as a uint2 (s0 = low word, s1 = high word)
inline uint2 mul64(const uint a, const uint b){
  return (uint2)(a*b, mul_hi(a,b));
}

//collatzGlide() : count Collatz glide steps and keep track of the maximum glide.
//-----parameters-----
//maxStep : max glide step seen by each work group, merged across launches. Length = number of work groups
//totalOffset : 'offset' (offset32*2^32 + sieve value) of that max glide. Length = number of work groups
//sumStep : running sum of glide steps per work group (64-bit). Length = number of work groups
//sieve : the residues out of every 2^32 numbers that still need to be computed, one per work item. Length = global work size
//table : look-up table for jumping. s0 : exponent x of the 3^x multiplier. s1 : transformed low bits b. Length = 2^LUTSTEP
//negNadir : negated nadir (lowest running) score during a 'jump' by the look-up table. The score approximates log2(val' / val). Length = 2^LUTSTEP
//          (see the buildJumpTable() sketch after the kernel for one way table[] and negNadir[] could be filled)
//val : starting value (128-bit uint) as uint4; its low 32 bits are replaced by the sieve value
//l_maxStep : local scratch, length = local work size
//l_totalOffset : local scratch, length = local work size
//l_sumStep : local scratch, length = local work size
//offset32 : actual offset = offset32 * 2^32

__kernel void collatzGlide(
    __global uint *maxStep,
    __global ulong *totalOffset,
    __global ulong *sumStep,
    __global const uint *sieve,
    __global const uint2 *table,
    __global const float *negNadir,
    uint4 val,
    __local uint *l_maxStep,
    __local ulong *l_totalOffset,
    __local uint *l_sumStep,
    const uint offset32
){
    uint id = get_global_id(0);
    uint2 lut = val.s23; //lut : look-up table item; for now it temporarily holds the original high 64 bits of val for the carry checks below
    val.s0 = sieve[id]; //replace the low 32 bits with this work item's sieve residue
    val.s1 += offset32;
    val.s2 += (val.s1 < offset32); //carry propagation after adding offset32 at the 2^32 position
    val.s3 += (val.s2 < lut.s0);
    uint2 val_h = (uint2)(val.s3 < lut.s1 , 0u ); //val_h : bits 128-191 of val (headroom, since the value grows during the glide)

    //save total offset to shared memory
    id = get_local_id(0);
    l_totalOffset[id] = upsample(offset32, val.s0);

    uint carryIdx = val.s0 & TABLEMASK; //carryIdx : 1. carry bits, 2. look-up table index
    uint stepCnt = 0; //stepCnt : glide step counter
    uint mulVal = 0; //mulVal : multiply value
    float score = 0.0f; //score : approximation of log2(val' / val)
    //The main loop : jump LUTSTEP iterations at a time, until the score hits its nadir => the value would drop below its original value during the next jump
    //On the first iteration val is always greater than its original because the sieve is applied, so do{}while is used.
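    //(Jump identity used below : writing n = q*2^LUTSTEP + i, applying LUTSTEP steps of
    // T(n) = n/2 (n even) or (3n+1)/2 (n odd) gives n' = q*3^a + b, where a = table[i].s0 is
    // the number of odd steps met and b = table[i].s1 is what i itself is transformed into.
    // Each odd step counts as 2 Collatz steps, hence stepCnt += LUTSTEP + lut.s0.)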
    do{
        lut = table[carryIdx]; //retrieve 'jump' table item
        stepCnt += LUTSTEP + lut.s0; //accumulate glide steps
        mulVal = power3[lut.s0]; //multiply value = 3^(# of odd steps encountered in the jump)
        carryIdx = lut.s1;
        score += convert_float(lut.s0)* ODDSCORE - convert_float(LUTSTEP); //accumulate score, may use fma()

        //Compute n = (n >> LUTSTEP) * mulVal + b, 32 bits at a time
        lut = mul64((val.s0 >> LUTSTEP) | (val.s1 << BITDIFF), mulVal); //bit 0-31
        val.s0 = lut.s0 + carryIdx;
        carryIdx = lut.s1 + (val.s0 < lut.s0); //carry bits
        lut = mul64((val.s1 >> LUTSTEP) | (val.s2 << BITDIFF), mulVal);//bit 32-63
        val.s1 = lut.s0 + carryIdx;
        carryIdx = lut.s1 + (val.s1 < lut.s0);
        lut = mul64((val.s2 >> LUTSTEP) | (val.s3 << BITDIFF), mulVal);//bit 64-95
        val.s2 = lut.s0 + carryIdx;
        carryIdx = lut.s1 + (val.s2 < lut.s0);
        lut = mul64((val.s3 >> LUTSTEP) | (val_h.s0 << BITDIFF), mulVal);//bit 96-127
        val.s3 = lut.s0 + carryIdx;
        carryIdx = lut.s1 + (val.s3 < lut.s0);
        lut = mul64((val_h.s0 >> LUTSTEP) | (val_h.s1 << BITDIFF), mulVal);//bit 128-159
        val_h.s0 = lut.s0 + carryIdx;
        carryIdx = lut.s1 + (val_h.s0 < lut.s0);
        lut = mul64((val_h.s1 >> LUTSTEP), mulVal);//bit 160-191 and overflow detection
        val_h.s1 = lut.s0 + carryIdx;

        lut.s1 += (val_h.s1 < lut.s0); //lut.s1 acts as the overflow bits
        carryIdx = val.s0 & TABLEMASK; //next table index
        //the loop should continue while all of the below are true
        //1. next nadir + score > 0 (val stays greater than its original throughout the next jump, so we can safely jump LUTSTEP iterations)
        //2. no overflow
        //3. stepCnt < MAXSTEP (2^28 - 1)
        mulVal = (score > negNadir[carryIdx]) && (lut.s1 == 0) && (stepCnt < MAXSTEP);
    }while(mulVal);

    // Phase 2 : find the exact step where val falls below its original value.
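    //(Note : only carryIdx, the low LUTSTEP bits of val, is iterated here. The nadir test
    // above guarantees the crossing happens within at most LUTSTEP further steps, and over
    // that many steps the odd/even decisions depend only on these low bits, so the full
    // 128-bit value never needs to be updated.)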
    do{
        mulVal = carryIdx & 1u; //odd or even?
        stepCnt += 1 + mulVal;
        carryIdx += select((uint)0, 1 + (carryIdx << 1), mulVal);
        carryIdx >>= 1;
        score += convert_float(mulVal) * ODDSCORE - 1.0f;
    }while(signbit(score) == 0); //while(score >= 0)

    //Save step to shared memory
    l_maxStep[id] = stepCnt;
    l_sumStep[id] = stepCnt;

    //Do reduction within the work group
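    //(tree reduction : on every pass the lower half of the work items folds in the upper
    // half, halving the stride until index 0 holds the group's sum / max; the barrier makes
    // sure every work item has finished writing before its slot is read)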
    for(uint offset = get_local_size(0)/2; offset > 0; offset>>=1){
        barrier(CLK_LOCAL_MEM_FENCE);
        if(id < offset){
            uint idOther = id + offset;
            l_sumStep[id] += l_sumStep[idOther];

            stepCnt = l_maxStep[id];
            uint stepOther = l_maxStep[idOther];
            ulong offsetThis = l_totalOffset[id];
            ulong offsetOther = l_totalOffset[idOther];
            l_maxStep[id] = max(stepCnt, stepOther);
            l_totalOffset[id] = select(offsetOther, offsetThis, convert_ulong(stepCnt > stepOther));
        } //if(id < offset)
    } //for(offset)

    //save to output array (by the first thread in the work group)
    if(id == 0){
        uint wid = get_group_id(0);
        sumStep[wid] += l_sumStep[0]; //accumulate total steps
        uint stepOther = maxStep[wid];
        ulong offsetOther = totalOffset[wid];
        stepCnt = l_maxStep[0];
        ulong offsetThis = l_totalOffset[0];
        maxStep[wid] = max(stepCnt, stepOther);
        totalOffset[wid] = select( offsetOther, offsetThis, convert_ulong(stepCnt > stepOther));
    } //if(id == 0)
} //collatzGlide()
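
//--------------------------------------------------------------------------------------
//The paste does not include the code that fills table[] and negNadir[]. The kernel below
//is only a sketch of how they could be generated, inferred from the comments above :
//table[i].s0 = number of odd steps met while applying LUTSTEP steps of T to the residue i,
//table[i].s1 = the value i is transformed into, negNadir[i] = minus the lowest running
//score inside that jump. Run with global work size = 2^LUTSTEP, one work item per residue.
__kernel void buildJumpTable(__global uint2 *table, __global float *negNadir){
    uint i = get_global_id(0); //residue 0 .. 2^LUTSTEP - 1
    ulong b = i;
    uint odd = 0;
    float score = 0.0f;
    float nadir = 0.0f;
    for(uint j = 0; j < LUTSTEP; ++j){
        if(b & 1){ //odd : (3b+1)/2, one odd step (counts as 2 Collatz steps)
            b = (3*b + 1) >> 1;
            ++odd;
            score += ODDSCORE - 1.0f;
        }else{ //even : b/2
            b >>= 1;
            score -= 1.0f;
        }
        nadir = fmin(nadir, score); //lowest running score of the jump
    }
    table[i] = (uint2)(odd, (uint)b); //s0 : exponent of 3, s1 : transformed low bits (fits in 32 bits for LUTSTEP <= 20)
    negNadir[i] = -nadir; //collatzGlide() tests score > negNadir[index]
}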