Advertisement
sosiris

collatz kernel with high step sieve

Aug 19th, 2015
224
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.42 KB | None | 0 0
  1. /*
  2. NOTICE: You must define the following:
  3.  
  4. LUTSIZEP2
  5. DELAYSIZEP2
  6. INITSTEP
  7.  
  8. via compile options, or it won't compile.
  9. */
  10.  
  11. /*
  12. **README
  13. Collatz Delay openCL kernel with 256-step sieve, by Sosiris, BOINC@Taiwan, 2015 C.E.
  14. How this thingy works :
  15. I Scan the numbers vertically instead of horizontally.
  16. Let n0 = a0 * (offset + id) + b0, where a0 = 2^256, b0 = one of the # in the 256-step sieve, offset = offset by kernel launch (kNo) + offset by initial conditions
  17. We can calculate it 256 steps ahead in the host, as all numbers share the same path until (256+1)th step:
  18. n256 = a256 * (offset + id) + b256, where a256 = 3^(odd b's encountered), b256 = b0 after 256 steps
  19. And n256 = (a256 * id) + (a256 * offset + b256), the former is computed by the openCL kernel, the latter (call it totalOffset) by the host
  20. Moreover, since offset = offset by kernel launch (kNo) + offset by initial conditions, and kNo increased by 1, offset increased by 'global threads' for every kernel launch,
  21. the increment of totalOffset = a256 * globalThreads, which can be computed in the device to decrease device-host communication
  22.  
  23.  
  24. **Work flow proposed for this app:
  25. kNo = lastkNo; //resume from last checkpoint
  26. totalOffset = getTotalOffset(a256, b256, offset + kNo * globalThreads);
  27. increment = a256 * globalThreads;
  28. for (k from 1 to range)
  29. {
  30.     kernel_clearRes (result[]);
  31.     for(i from 1 to kernelsPerReduction)
  32.     {
  33.         kernel_collatzDelay(result[], lut[], delay[], a256, totalOffset, kNo);
  34.         kernel_incTotalOffset(totalOffset, increment);
  35.         ++kNo;
  36.     }
  37.     readAndCompareResult (result[]);
  38.     checkPoint(kNo);
  39. }
  40.  
  41. */
  42.  
  43. //preprocessor defines
  44. #define LUTMASK ((1u<<LUTSIZEP2)-1)
  45. #define DELAYMASK ((1u<<DELAYSIZEP2)-1)
  46. #define NUMLENGTH 16
  47. #define MAXSTEP 0xfffffffu
  48.  
  49. //Power of 3
  50. __constant uint power3[21] = {
  51. 1u,3u,9u,27u,81u,243u,729u,2187u,6561u,19683u,
  52. 59049u,177147u,531441u,1594323u,4782969u,14348907u,43046721u,129140163u,387420489u,1162261467u,
  53. 3486784401u};
  54.  
  55. //returns a*b+c in uint2
  56. inline uint2 mad64(const uint a, const uint b, const uint c)
  57. {
  58.     uint2 res = (uint2)(a*b, mul_hi(a,b));
  59.     res.s0 += c;
  60.     res.s1 += res.s0 < c;
  61.     return res;
  62. } //mad64()
  63.  
  64. /*
  65. collatzDelay():
  66. g_maxStep : s0 = max collatz delay, s1 = corresponding kNo; length = global threads
  67. g_totalSteps : accumulated collatz delay steps; length = global threads
  68. g_lut : look-up table to 'jump', s0 : value should be multiplied (presented as 3^x), s1 : value should be added; length = 2^(LUTSIZEP2)
  69. g_delay : collatz delays for # under 2^(DELAYSIZEP2), length = 2^(DELAYSIZEP2)
  70. c_a : a256 stored in 512-bit, length = NUMLENGTH (=16)
  71. c_totalOffset : totalOffset stored in 512-bit, length = NUMLENGTH (=16)
  72. kNo : offset from each kernel launch, 1 kNo = offset of 'global threads'
  73. */
  74.  
  75. __kernel void collatzDelay
  76. (
  77.     __global uint2 *g_maxStep,
  78.     __global ulong *g_totalSteps,
  79.     __global const uint2 *g_lut,
  80.     __global const uint *g_delay,
  81.     __constant uint *c_a,
  82.     __constant uint *c_totalOffset,
  83.     const uint kNo
  84. )
  85. {
  86.     uint gid = get_global_id(0);
  87.     uint val[NUMLENGTH];
  88.     uint2 lut = (uint2)0;
  89.     ulong addRes = 0;
  90.     //val = a * gid
  91.     #pragma unroll
  92.     for(int i=0 ; i<NUMLENGTH ; ++i)
  93.     {
  94.         lut = mad64(c_a[i], gid, lut.s1);
  95.         val[i] = lut.s0;
  96.     } //for()
  97.     //val += totalOffset
  98.     #pragma unroll
  99.     for(int i=0 ; i<NUMLENGTH ; ++i)
  100.     {
  101.         addRes = (addRes>>32) + val[i] + c_totalOffset[i];
  102.         val[i] = convert_uint(addRes);
  103.     } //for()
  104.  
  105.     uint index = 0;
  106.     uint steps = INITSTEP;
  107.  
  108.     do
  109.     {
  110.         index = val[0] & LUTMASK;
  111.         lut = g_lut[index];
  112.         steps += LUTSIZEP2 + lut.s0;
  113.         index = power3[lut.s0]; //index as mulVal
  114.  
  115.         //val = val>>LUTSIZEP2
  116.         #pragma unroll
  117.         for(int i=0 ; i<NUMLENGTH-1; ++i)
  118.         {
  119.             val[i] = (val[i] >> LUTSIZEP2) | (val[i+1] << (32-LUTSIZEP2));
  120.         } //for()
  121.        
  122.         val[NUMLENGTH -1] >>= LUTSIZEP2;
  123.  
  124.         //val = val * 3^(lut.s0) + lut.s1
  125.         #pragma unroll
  126.         for(int i=0 ; i<NUMLENGTH; ++i)
  127.         {
  128.             lut = mad64(val[i], index, lut.s1);
  129.             val[i] = lut.s0;
  130.         }
  131.  
  132.         //compare val to DELAYMASK, if val > DELAYMASK, index is positive; else, 0
  133.         index = val[0] > DELAYMASK;
  134.         #pragma unroll
  135.         for(int i=1 ; i<NUMLENGTH; ++i)
  136.         {
  137.             index |= val[i];
  138.         }
  139.         //if step >= MAXSTEP or val overflowed, stop the loop
  140.         index = index && (steps < MAXSTEP) && (!lut.s1);
  141.     } while(index);
  142.  
  143.     steps += g_delay[val[0] & DELAYMASK];
  144.     steps = select(steps, MAXSTEP, lut.s1);
  145.     addRes = g_totalSteps[gid];
  146.     addRes += steps;
  147.     g_totalSteps[gid] = addRes;
  148.     lut = g_maxStep[gid];
  149.     index = -(steps > lut.s0);
  150.     lut = select(lut, (uint2)(steps, kNo), (uint2)index);
  151.     g_maxStep[gid] = lut;
  152. } //collatzDelay()
  153.  
  154. __kernel void clearRes
  155. (
  156.     __global uint2 *g_maxStep,
  157.     __global ulong *g_totalSteps
  158. )
  159. {
  160.     uint gid = get_global_id(0);
  161.     g_maxStep[gid] = (uint2)0;
  162.     g_totalSteps[gid] = 0;
  163. } //clearRes()
  164.  
  165.  
  166. __kernel void incTotalOffset
  167. (
  168.     __global uint *g_totalOffset,
  169.     __constant uint *c_increment
  170. )
  171. {
  172.     __local ulong l_addRes[NUMLENGTH];
  173.     __local uint l_res[NUMLENGTH];
  174.  
  175.     uint gid = get_global_id(0);
  176.     l_addRes[gid] = g_totalOffset[gid] + c_increment[gid];
  177.     barrier(CLK_LOCAL_MEM_FENCE); //actually not required right now in AMD GPUs, because its wavefront size is 64, bigger than NUMLENGTH (16)
  178.     if(gid == 0)
  179.     {  
  180.         l_res[0] = convert_uint(l_addRes[0]);
  181.         #pragma unroll
  182.         for(int i=1;i<NUMLENGTH;++i)
  183.         {
  184.             l_addRes[i] += (l_addRes[i-1]>>32); //carry
  185.             l_res[i] = convert_uint(l_addRes[i]);
  186.         }
  187.     }
  188.     barrier(CLK_LOCAL_MEM_FENCE);
  189.     g_totalOffset[gid] = l_res[gid];
  190. }//incTotalOffset()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement