Advertisement
sosiris

collatzGlide.cl

Jul 31st, 2014
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 7.12 KB | None | 0 0
  1. //This kernel file should define LUTSTEP
  2. //assuming SIEVESTEP = 32
  3. //-cl-finite-math-only and -cl-no-signed-zeros may be used for faster floating point calculations
  4.  
  5. //constants
  6. #define MAXSTEP 0xfffffffu
  7. #define BITDIFF (32-LUTSTEP)
  8. #define TABLEMASK ((1u << LUTSTEP) - 1)
  9. #define ODDSCORE 1.5849625f
  10.  
  11. //compareUint4() : compares two 128-bit unsigned integers in uint4
  12. //returns negative number if a < b, positive number if a > b, 0 if a==b;
  13. inline int compareUint4(uint4 a, uint4 b){
  14.     int4 comp = (a<b) - (a>b);
  15.     comp <<= (int4)(0, 1, 2, 3);
  16.     return (comp.s0 + comp.s1 + comp.s2 + comp.s3);
  17. } //compareUint4()
  18.  
  19. //compareUint() : compares two 32-bit unsigned integers
  20. //returns negative number if a < b, positive number if a > b, 0 if a==b;
  21. inline int compareUint(uint a, uint b){
  22.     return (a<b) - (a>b);
  23. } //compareUint()
  24.  
  25. //Table of 3^N
  26. __constant uint power3[21] =
  27. {1u,3u,9u,27u,81u,243u,729u,2187u,6561u,19683u,
  28. 59049u,177147u,531441u,1594323u,4782969u,14348907u,43046721u,129140163u,387420489u,1162261467u,
  29. 3486784401u
  30. };
  31.  
  32. //mul64() : takes two 32-bit uint and returns a uint2 as the multiply result
  33. inline uint2 mul64(const uint a, const uint b){
  34.   return (uint2)(a*b, mul_hi(a,b));
  35. }
  36.  
  37. //clearRes() : fill the result array zero; clEnqueueFillBuffer() can be used in openCL 1.2 instead.
  38. __kernel void clearRes(__global uint4 *results){
  39.     results[get_global_id(0)] = (uint4)0;
  40. }
  41. //collatzGlide() : count collatz glide and store the result of maximum glide.
  42. //-----parameters-----
  43. //result : s0 : max glide step. s1 : 'offset' of max glide. s23 : sum of steps (64-bit). Length = global work size
  44. //sieve : Numbers needed to compute for every 2^32 numbers. Length = global work size
  45. //table : look-up table for jumping. s0: x of 3^x mulVal. s1 : transformed b. Length = 2^ (LUTSTEP)
  46. //negNadir : negative nadir score during a 'jump' by look-up table. Score is an approximation of log2 (val' / val). Length = 2^ (LUTSTEP)
  47. //start : starting value (128-bit uint) as uint4
  48. //offset32 : actual offset = offset32 * (2^32)
  49.  
  50. __kernel void collatzGlide( __global uint4 *results,
  51.                             __global const uint *sieve,
  52.                             __global const uint2 *table,
  53.                             __global const float *negNadir,
  54.                             uint4 start,
  55.                             const uint offset32){
  56.     const uint gid = get_global_id(0);
  57.    
  58.     uint4 val = (uint4)(sieve[gid], offset32 + start.s1, start.s23); //val : first 128-bit of val
  59.     val.s2 += (val.s1 < start.s1); //carry operation of val
  60.     val.s3 += (val.s2 < start.s2);
  61.     uint2 val_h = (uint2)(val.s3 < start.s3 , 0u ); //val_h : higher 64-bit of val
  62.    
  63.     //record original value
  64.     start = val;
  65.     const uint originalVal_h = val_h.s0;
  66.    
  67.     uint carryIdx = val.s0 & TABLEMASK; //carryIdx : 1. carry bits, 2. look-up table index
  68.     uint stepCnt = 0; //stepCnt : count glide steps
  69.     uint mulVal = 0; //mulVal : multiply value
  70.     uint2 lut; //lut : look-up table item
  71.     float score = 0.0f; ////score : approximation of log2(val'/ val)
  72.     //The main loop, jump (LUTSTEP) iterations at a time, until the score hit nadir => the value will be less than its original value during next jump
  73.     //The first iteration, val will always greater than its original because the sieve is applied; do{}while is used.
  74.     do{
  75.         lut = table[carryIdx]; //retrieve 'jump' table item
  76.         stepCnt += LUTSTEP + lut.s0; //accumulate glide step
  77.         mulVal = power3[lut.s0]; //multiply value = 3^(# odd encountered the in jump table)
  78.         carryIdx = lut.s1;
  79.         score += convert_float(lut.s0)* ODDSCORE - convert_float(LUTSTEP); //accumulate score, may use fma()
  80.        
  81.         //Do n = (n>>LUTSTEP)*a + b
  82.         lut = mul64((val.s0 >> LUTSTEP) | (val.s1 << BITDIFF), mulVal); //bit 0-31
  83.         val.s0 = lut.s0 + carryIdx;
  84.         carryIdx = lut.s1 + (val.s0 < lut.s0); //carry bits
  85.         lut = mul64((val.s1 >> LUTSTEP) | (val.s2 << BITDIFF), mulVal);//bit 32-63
  86.         val.s1 = lut.s0 + carryIdx;
  87.         carryIdx = lut.s1 + (val.s1 < lut.s0);
  88.         lut = mul64((val.s2 >> LUTSTEP) | (val.s3 << BITDIFF), mulVal);//bit 64-95
  89.         val.s2 = lut.s0 + carryIdx;
  90.         carryIdx = lut.s1 + (val.s2 < lut.s0);
  91.         lut = mul64((val.s3 >> LUTSTEP) | (val_h.s0 << BITDIFF), mulVal);//bit 96-127
  92.         val.s3 = lut.s0 + carryIdx;
  93.         carryIdx = lut.s1 + (val.s3 < lut.s0);
  94.         lut = mul64((val_h.s0 >> LUTSTEP) | (val_h.s1 << BITDIFF), mulVal);//bit 128-159
  95.         val_h.s0 = lut.s0 + carryIdx;
  96.         carryIdx = lut.s1 + (val_h.s0 < lut.s0);
  97.         lut = mul64((val_h.s1 >> LUTSTEP), mulVal);//bit 160-191 and overflow detection
  98.         val_h.s1 = lut.s0 + carryIdx;
  99.        
  100.         lut.s1 += (val_h.s1 < lut.s0); //lut.s1 act as overflow bits
  101.         carryIdx = val.s0 & TABLEMASK; //next table index
  102.         //the loop should continue when all of the below are true
  103.         //1. next nadir + score > 0 (val will be always greater than its original in the next jump so we can safely jump 'LUTSTEP' iterations)
  104.         //2. no overflow
  105.         //3. step < maxStep
  106.         lut.s0 = (score > negNadir[carryIdx]) && (lut.s1 == 0) && (stepCnt < MAXSTEP);
  107.     }while(lut.s0);
  108.     //Phase2 : find the position where the val fall under its original.
  109.     uint iterations = 0; //count iterations where the fail position is
  110.     mulVal = 0; //mulVal : counting 3^x;
  111.     do{
  112.         const uint odd = carryIdx & 1u;
  113.         mulVal += odd;
  114.         carryIdx += select((uint)0, 1 + (carryIdx << 1), odd);
  115.         carryIdx >>= 1;
  116.         score += convert_float(odd) * ODDSCORE - 1.0f;
  117.         ++iterations;
  118.     }while(signbit(score) == 0); //while(score >= 0)
  119.    
  120.     stepCnt += iterations + mulVal;
  121.     mulVal = power3[mulVal];
  122.     const uint newBitDiff = 32 - iterations;
  123.    
  124.     //Do n = (n >> iterations)*a + b
  125.     lut = mul64((val.s0 >> iterations) | (val.s1 << newBitDiff), mulVal); //bit 0-31
  126.     val.s0 = lut.s0 + carryIdx;
  127.     carryIdx = lut.s1 + (val.s0 < lut.s0); //carry bits
  128.     lut = mul64((val.s1 >> iterations) | (val.s2 << newBitDiff), mulVal); //bit 32-63
  129.     val.s1 = lut.s0 + carryIdx;
  130.     carryIdx = lut.s1 + (val.s1 < lut.s0);
  131.     lut = mul64((val.s2 >> iterations) | (val.s3 << newBitDiff), mulVal); //bit 64-95
  132.     val.s2 = lut.s0 + carryIdx;
  133.     carryIdx = lut.s1 + (val.s2 < lut.s0);
  134.     lut = mul64((val.s3 >> iterations) | (val_h.s0 <<newBitDiff), mulVal); //bit 96-127
  135.     val.s3 = lut.s0 + carryIdx;
  136.     carryIdx = lut.s1 + (val.s3 < lut.s0);
  137.     lut = mul64((val_h.s0 >> iterations) | (val_h.s1 << newBitDiff), mulVal); //bit 128-159
  138.     val_h.s0 = lut.s0 + carryIdx;
  139.     carryIdx = lut.s1 + (val_h.s0 < lut.s0);
  140.     lut = mul64((val_h.s1 >> iterations), mulVal); //bit 160-191
  141.     val_h.s1 = lut.s0 + carryIdx;
  142.    
  143.     //if transformed value >= original value, raise an error in the steps (set it to maxStep). Right now compares lower 128-bit only
  144.     int comp = compareUint4(val, start)+ (compareUint(val_h.s0, originalVal_h) << 4) + ((val_h.s1>0) << 5);
  145.     stepCnt = select(stepCnt, MAXSTEP, comp >= 0);
  146.     //compare and store the results from this kernel to the previous results
  147.     val = results[gid]; //Use val as the result to save space as it's no longer used
  148.     //If stepCount > that of the buffer, replace the results of the buffer with that (stepCount& kNo) of this kernel.
  149.     comp = stepCnt > val.s0;
  150.     val.s0 = select(val.s0, stepCnt, comp);
  151.     val.s1 = select(val.s1, offset32, comp);
  152.     val.s2 += stepCnt; //accumulate the step
  153.     val.s3 += val.s2 < stepCnt; //Check if the sum overflows 32-bit
  154.     results[gid] = val; //save result to output array
  155. } //collatzGlide()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement