Advertisement
Guest User

sorting kernels

a guest
Jun 29th, 2012
29
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.50 KB | None | 0 0
  1. #define FIELDS_PER_RECORD 1
  2. #define uint32 uint
  3. #define uint64 ulong
  4. #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
  5. #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
  6. typedef struct {
  7.   uint64 f[FIELDS_PER_RECORD];
  8. } record_t;
  9.  
  /*
   * ORDER(a, b): compare-exchange two record_t lvalues on their first field.
   *
   * Requires a `bool reverse` variable to be in scope where the macro is
   * expanded:
   *   - reverse == true  : afterwards a.f[0] <= b.f[0]
   *   - reverse == false : afterwards a.f[0] >= b.f[0]
   *
   * Both records are copied to temporaries first and then both are always
   * written back, so the exchange is expressed with selects instead of a
   * branch. The arguments are evaluated more than once; pass plain lvalues.
   */
  #define ORDER(a, b) \
    do { \
      bool swap = reverse ^ ((a).f[0] < (b).f[0]); \
      record_t auxa = (a); \
      record_t auxb = (b); \
      (a) = swap ? auxb : auxa; \
      (b) = swap ? auxa : auxb; \
    } while (0)

  13. __kernel void ParallelBitonic_B2(
  14.     volatile __global uint32 *result_count,  
  15.     volatile __global record_t* data,
  16.     volatile uint32 inc0,
  17.     volatile uint32 dir,
  18.     __local record_t* aux) {
  19.   uint32 t = get_global_id(0);
  20.   uint32 low = t & (inc0 - 1);
  21.   uint32 i = (t<<1) - low;
  22.   bool reverse = ((dir & i) == 0);
  23.   data += i;
  24.  
  25.   record_t x0 = data[0];
  26.   record_t x1 = data[inc0];
  27.  
  28.   ORDER(x0, x1);
  29.  
  30.   data[0] = x0;
  31.   data[inc0] = x1;
  32. }
  33.  
  34. __kernel void ParallelBitonic_B4(
  35.     volatile __global uint32 *result_count,  
  36.     volatile __global record_t* data,
  37.     volatile uint32 inc0,
  38.     volatile uint32 dir,
  39.     __local record_t* aux) {
  40.   int inc = inc0 >> 1;
  41.   int t = get_global_id(0);
  42.   int low = t & (inc - 1);
  43.   int i = ((t - low) << 2) + low;
  44.   bool reverse = ((dir & i) == 0);
  45.   data += i;
  46.  
  47.   record_t x0 = data[0];
  48.   record_t x1 = data[inc];
  49.   record_t x2 = data[2*inc];
  50.   record_t x3 = data[3*inc];
  51.  
  52.   ORDER(x0, x2);
  53.   ORDER(x1, x3);
  54.   ORDER(x0, x1);
  55.   ORDER(x2, x3);
  56.  
  57.   data[0] = x0;
  58.   data[inc] = x1;
  59.   data[2*inc] = x2;
  60.   data[3*inc] = x3;
  61. }
  62.  
  63.  
  /*
   * ORDERV(x, a, b): compare-exchange x[a] and x[b] (record_t elements) on
   * their first field. Requires `bool reverse` in scope at the expansion site:
   * reverse == true leaves x[a].f[0] <= x[b].f[0], reverse == false leaves
   * x[a].f[0] >= x[b].f[0]. The index arguments are evaluated several times.
   */
  #define ORDERV(x, a, b) \
    do { \
      bool swap = reverse ^ ((x)[(a)].f[0] < (x)[(b)].f[0]); \
      record_t auxa = (x)[(a)]; \
      record_t auxb = (x)[(b)]; \
      (x)[(a)] = swap ? auxb : auxa; \
      (x)[(b)] = swap ? auxa : auxb; \
    } while (0)

  /*
   * BnV(x, a): compare-exchange network over the n consecutive elements
   * x[a] .. x[a + n - 1]. Each level first pairs every element of the lower
   * half with the element n/2 positions above it, then applies the next
   * smaller network to each half.
   */
  #define B2V(x, a) \
    do { \
      ORDERV(x, (a), (a) + 1); \
    } while (0)

  #define B4V(x, a) \
    do { \
      ORDERV(x, (a) + 0, (a) + 0 + 2); \
      ORDERV(x, (a) + 1, (a) + 1 + 2); \
      B2V(x, (a)); \
      B2V(x, (a) + 2); \
    } while (0)

  #define B8V(x, a) \
    do { \
      ORDERV(x, (a) + 0, (a) + 0 + 4); \
      ORDERV(x, (a) + 1, (a) + 1 + 4); \
      ORDERV(x, (a) + 2, (a) + 2 + 4); \
      ORDERV(x, (a) + 3, (a) + 3 + 4); \
      B4V(x, (a)); \
      B4V(x, (a) + 4); \
    } while (0)

  #define B16V(x, a) \
    do { \
      ORDERV(x, (a) + 0, (a) + 0 + 8); \
      ORDERV(x, (a) + 1, (a) + 1 + 8); \
      ORDERV(x, (a) + 2, (a) + 2 + 8); \
      ORDERV(x, (a) + 3, (a) + 3 + 8); \
      ORDERV(x, (a) + 4, (a) + 4 + 8); \
      ORDERV(x, (a) + 5, (a) + 5 + 8); \
      ORDERV(x, (a) + 6, (a) + 6 + 8); \
      ORDERV(x, (a) + 7, (a) + 7 + 8); \
      B8V(x, (a)); \
      B8V(x, (a) + 8); \
    } while (0)

  71. __kernel void ParallelBitonic_B8(
  72.     volatile __global uint32 *result_count,  
  73.     volatile __global record_t* data,
  74.     volatile uint32 inc0,
  75.     volatile uint32 dir,
  76.     __local record_t* aux) {
  77.   int inc = inc0 >> 2;
  78.   int t = get_global_id(0);
  79.   int low = t & (inc - 1);
  80.   int i = ((t - low) << 3) + low;
  81.   bool reverse = ((dir & i) == 0);
  82.   data += i;
  83.  
  84.   record_t x[8];
  85.   x[0] = data[0 * inc];
  86. x[1] = data[1 * inc];
  87. x[2] = data[2 * inc];
  88. x[3] = data[3 * inc];
  89. x[4] = data[4 * inc];
  90. x[5] = data[5 * inc];
  91. x[6] = data[6 * inc];
  92. x[7] = data[7 * inc];
  93.  
  94.  
  95.   B8V(x, 0);
  96.  
  97.   data[0 * inc] = x[0];
  98. data[1 * inc] = x[1];
  99. data[2 * inc] = x[2];
  100. data[3 * inc] = x[3];
  101. data[4 * inc] = x[4];
  102. data[5 * inc] = x[5];
  103. data[6 * inc] = x[6];
  104. data[7 * inc] = x[7];
  105.  
  106. }
  107.  
  108. __kernel void ParallelBitonic_B16(
  109.     volatile __global uint32 *result_count,  
  110.     volatile __global record_t* data,
  111.     volatile uint32 inc0,
  112.     volatile uint32 dir,
  113.     __local record_t* aux) {
  114.   int inc = inc0 >> 3;
  115.   int t = get_global_id(0);
  116.   int low = t & (inc - 1);
  117.   int i = ((t - low) << 4) + low;
  118.   bool reverse = ((dir & i) == 0);
  119.   data += i;
  120.  
  121.   record_t x[16];
  122.   x[0] = data[0 * inc];
  123. x[1] = data[1 * inc];
  124. x[2] = data[2 * inc];
  125. x[3] = data[3 * inc];
  126. x[4] = data[4 * inc];
  127. x[5] = data[5 * inc];
  128. x[6] = data[6 * inc];
  129. x[7] = data[7 * inc];
  130. x[8] = data[8 * inc];
  131. x[9] = data[9 * inc];
  132. x[10] = data[10 * inc];
  133. x[11] = data[11 * inc];
  134. x[12] = data[12 * inc];
  135. x[13] = data[13 * inc];
  136. x[14] = data[14 * inc];
  137. x[15] = data[15 * inc];
  138.  
  139.   //for (int k = 0; k < 16; k++) x[k] = data[k * inc];
  140.  
  141.   B16V(x, 0);
  142.  
  143.   data[0 * inc] = x[0];
  144. data[1 * inc] = x[1];
  145. data[2 * inc] = x[2];
  146. data[3 * inc] = x[3];
  147. data[4 * inc] = x[4];
  148. data[5 * inc] = x[5];
  149. data[6 * inc] = x[6];
  150. data[7 * inc] = x[7];
  151. data[8 * inc] = x[8];
  152. data[9 * inc] = x[9];
  153. data[10 * inc] = x[10];
  154. data[11 * inc] = x[11];
  155. data[12 * inc] = x[12];
  156. data[13 * inc] = x[13];
  157. data[14 * inc] = x[14];
  158. data[15 * inc] = x[15];
  159.  
  160.   //for (int k = 0; k < 16; k++) data[k * inc] = x[k];
  161. }
  162.  
  163. __kernel void ParallelBitonic_C4(
  164.     volatile __global uint32 *result_count,  
  165.     volatile __global record_t* data,
  166.     volatile uint32 inc0,
  167.     volatile uint32 dir,
  168.     __local record_t* aux) {
  169.   int t = get_global_id(0);
  170.   int wg_bits = 4 * get_local_size(0) - 1;
  171.   int low, i;
  172.   bool reverse;
  173.  
  174.   record_t x[4];
  175.  
  176.   int inc = inc0 >> 1;
  177.   low = t & (inc - 1);
  178.   i = ((t - low) << 2) + low;
  179.   reverse = ((dir & i) == 0);
  180.   for (int k = 0; k < 4; k++) x[k] = data[i + k * inc];
  181.   B4V(x, 0);
  182.   for (int k = 0; k < 4; k++) aux[(i + k * inc) & wg_bits] = x[k];
  183.   barrier(CLK_LOCAL_MEM_FENCE);
  184.  
  185.   for (; inc > 1; inc >>= 2) {
  186.     low = t & (inc - 1);
  187.     i = ((t - low) << 2) + low;
  188.     reverse = ((dir & i) == 0);
  189.     for (int k = 0; k < 4; k++) x[k] = aux[(i + k * inc) & wg_bits];
  190.     B4V(x, 0);
  191.     barrier(CLK_LOCAL_MEM_FENCE);
  192.     for (int k = 0; k < 4; k++) aux[(i + k * inc) & wg_bits] = x[k];
  193.     barrier(CLK_LOCAL_MEM_FENCE);
  194.   }
  195.  
  196.   i = t << 2;
  197.   reverse = ((dir & i) == 0);
  198.   for (int k = 0; k < 4; k++) x[k] = aux[(i + k) & wg_bits];
  199.   B4V(x, 0);
  200.   for (int k = 0; k < 4; k++) data[i + k] = x[k];
  201. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement