Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/* Extensions requested by this program (32-bit global and 64-bit base atomics). */
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

/* Number of 64-bit fields stored in every record. */
#define FIELDS_PER_RECORD 1

/* Width-explicit spellings of the OpenCL built-in unsigned scalar types. */
#define uint32 uint
#define uint64 ulong

/*
 * One sortable record: FIELDS_PER_RECORD unsigned 64-bit fields.
 * The compare-exchange macros in this file look at f[0] only; records are
 * always copied as a whole.
 */
typedef struct {
    uint64 f[FIELDS_PER_RECORD];
} record_t;
/*
 * ORDER(a, b): compare-exchange two record_t lvalues on their first field.
 *
 * Reads a `bool reverse` that must be in scope where the macro is expanded:
 *   reverse == false  ->  afterwards a.f[0] >= b.f[0]
 *   reverse == true   ->  afterwards a.f[0] <= b.f[0]
 * Both assignments are always performed (select instead of branch).
 *
 * `a` and `b` are each evaluated several times, so pass side-effect-free
 * lvalues. The expansion is a single statement and needs a trailing ';'.
 *
 * A `#pragma unroll` used to precede this definition; it was not followed by
 * a loop, so it had nothing to apply to and has been dropped.
 */
#define ORDER(a, b)                                                     \
    do {                                                                \
        bool order_swap_ = reverse ^ ((a).f[0] < (b).f[0]);             \
        record_t order_a_ = (a);                                        \
        record_t order_b_ = (b);                                        \
        (a) = order_swap_ ? order_b_ : order_a_;                        \
        (b) = order_swap_ ? order_a_ : order_b_;                        \
    } while (0)
/*
 * ParallelBitonic_B2: one compare-exchange per work-item.
 *
 * Work-item t handles the pair data[i] and data[i + inc0], where
 * i = (t << 1) - (t & (inc0 - 1)). When inc0 is a power of two this is t with
 * a 0 bit inserted at the inc0 position.
 *
 *   result_count  not used by this kernel
 *   data          records to reorder in place
 *   inc0          distance between the two records of a pair
 *   dir           bit mask; the pair is ordered with reverse = ((dir & i) == 0)
 *   aux           not used by this kernel
 */
__kernel void ParallelBitonic_B2(
    volatile __global uint32 *result_count,
    volatile __global record_t* data,
    volatile uint32 inc0,
    volatile uint32 dir,
    __local record_t* aux) {
    const uint32 gid = get_global_id(0);
    const uint32 below = gid & (inc0 - 1);
    const uint32 first = (gid << 1) - below;
    /* The name is fixed: ORDER picks up `reverse` from the enclosing scope. */
    const bool reverse = (dir & first) == 0;

    volatile __global record_t *pair = data + first;

    record_t lo_rec = pair[0];
    record_t hi_rec = pair[inc0];

    ORDER(lo_rec, hi_rec);

    pair[0] = lo_rec;
    pair[inc0] = hi_rec;
}
/*
 * ParallelBitonic_B4: each work-item reorders four records that are
 * inc = inc0 >> 1 apart, performing the compare-exchanges at distance 2*inc
 * and then at distance inc.
 *
 * Work-item t starts at i = ((t - low) << 2) + low with low = t & (inc - 1).
 * When inc is a power of two this is t with two 0 bits inserted at the inc
 * position.
 *
 *   result_count  not used by this kernel
 *   data          records to reorder in place
 *   inc0          largest compare distance handled here (2 * inc)
 *   dir           bit mask; ordering uses reverse = ((dir & i) == 0)
 *   aux           not used by this kernel
 *
 * Index arithmetic is unsigned, matching ParallelBitonic_B2: the left shift
 * cannot overflow a signed int for large element indices.
 */
__kernel void ParallelBitonic_B4(
    volatile __global uint32 *result_count,
    volatile __global record_t* data,
    volatile uint32 inc0,
    volatile uint32 dir,
    __local record_t* aux) {
    const uint32 inc = inc0 >> 1;
    const uint32 t = get_global_id(0);
    const uint32 low = t & (inc - 1);
    const uint32 i = ((t - low) << 2) + low;
    /* ORDER reads `reverse` from this scope. */
    const bool reverse = ((dir & i) == 0);

    data += i;

    record_t x0 = data[0];
    record_t x1 = data[inc];
    record_t x2 = data[2 * inc];
    record_t x3 = data[3 * inc];

    /* Distance 2*inc first, then distance inc. */
    ORDER(x0, x2);
    ORDER(x1, x3);
    ORDER(x0, x1);
    ORDER(x2, x3);

    data[0] = x0;
    data[inc] = x1;
    data[2 * inc] = x2;
    data[3 * inc] = x3;
}
/*
 * Compare-exchange helpers operating on an array of record_t.
 *
 * All of them read a `bool reverse` that must be in scope at the expansion
 * site (false: larger f[0] ends up at the lower index; true: smaller f[0]
 * ends up at the lower index). Every macro expands to one statement and needs
 * a trailing ';'. Arguments are parenthesized, so index expressions such as
 * `i & 1` are safe, but they are evaluated more than once and must be free of
 * side effects.
 */

/* ORDERV(x, a, b): compare-exchange x[a] and x[b] on f[0]. */
#define ORDERV(x, a, b)                                                     \
    do {                                                                    \
        bool orderv_swap_ = reverse ^ ((x)[(a)].f[0] < (x)[(b)].f[0]);      \
        record_t orderv_a_ = (x)[(a)];                                      \
        record_t orderv_b_ = (x)[(b)];                                      \
        (x)[(a)] = orderv_swap_ ? orderv_b_ : orderv_a_;                    \
        (x)[(b)] = orderv_swap_ ? orderv_a_ : orderv_b_;                    \
    } while (0)

/* B2V(x, a): compare-exchange x[a] with x[a + 1]. */
#define B2V(x, a)                                                           \
    do {                                                                    \
        ORDERV(x, (a), (a) + 1);                                            \
    } while (0)

/* B4V(x, a): distance-2 exchanges over x[a .. a+3], then B2V on each half. */
#define B4V(x, a)                                                           \
    do {                                                                    \
        ORDERV(x, (a) + 0, (a) + 0 + 2);                                    \
        ORDERV(x, (a) + 1, (a) + 1 + 2);                                    \
        B2V(x, (a));                                                        \
        B2V(x, (a) + 2);                                                    \
    } while (0)

/* B8V(x, a): distance-4 exchanges over x[a .. a+7], then B4V on each half. */
#define B8V(x, a)                                                           \
    do {                                                                    \
        ORDERV(x, (a) + 0, (a) + 0 + 4);                                    \
        ORDERV(x, (a) + 1, (a) + 1 + 4);                                    \
        ORDERV(x, (a) + 2, (a) + 2 + 4);                                    \
        ORDERV(x, (a) + 3, (a) + 3 + 4);                                    \
        B4V(x, (a));                                                        \
        B4V(x, (a) + 4);                                                    \
    } while (0)

/* B16V(x, a): distance-8 exchanges over x[a .. a+15], then B8V on each half. */
#define B16V(x, a)                                                          \
    do {                                                                    \
        ORDERV(x, (a) + 0, (a) + 0 + 8);                                    \
        ORDERV(x, (a) + 1, (a) + 1 + 8);                                    \
        ORDERV(x, (a) + 2, (a) + 2 + 8);                                    \
        ORDERV(x, (a) + 3, (a) + 3 + 8);                                    \
        ORDERV(x, (a) + 4, (a) + 4 + 8);                                    \
        ORDERV(x, (a) + 5, (a) + 5 + 8);                                    \
        ORDERV(x, (a) + 6, (a) + 6 + 8);                                    \
        ORDERV(x, (a) + 7, (a) + 7 + 8);                                    \
        B8V(x, (a));                                                        \
        B8V(x, (a) + 8);                                                    \
    } while (0)
/*
 * ParallelBitonic_B8: each work-item reorders eight records that are
 * inc = inc0 >> 2 apart, using B8V (compare distances 4*inc, 2*inc, inc).
 *
 * Work-item t starts at i = ((t - low) << 3) + low with low = t & (inc - 1).
 * When inc is a power of two this is t with three 0 bits inserted at the inc
 * position.
 *
 *   result_count  not used by this kernel
 *   data          records to reorder in place
 *   inc0          largest compare distance handled here (4 * inc)
 *   dir           bit mask; ordering uses reverse = ((dir & i) == 0)
 *   aux           not used by this kernel
 *
 * Index arithmetic is unsigned, matching ParallelBitonic_B2: the left shift
 * cannot overflow a signed int for large element indices.
 */
__kernel void ParallelBitonic_B8(
    volatile __global uint32 *result_count,
    volatile __global record_t* data,
    volatile uint32 inc0,
    volatile uint32 dir,
    __local record_t* aux) {
    const uint32 inc = inc0 >> 2;
    const uint32 t = get_global_id(0);
    const uint32 low = t & (inc - 1);
    const uint32 i = ((t - low) << 3) + low;
    /* B8V reads `reverse` from this scope. */
    const bool reverse = ((dir & i) == 0);

    data += i;

    /* Loads and stores are written out so every index into x is a literal. */
    record_t x[8];
    x[0] = data[0 * inc];
    x[1] = data[1 * inc];
    x[2] = data[2 * inc];
    x[3] = data[3 * inc];
    x[4] = data[4 * inc];
    x[5] = data[5 * inc];
    x[6] = data[6 * inc];
    x[7] = data[7 * inc];

    B8V(x, 0);

    data[0 * inc] = x[0];
    data[1 * inc] = x[1];
    data[2 * inc] = x[2];
    data[3 * inc] = x[3];
    data[4 * inc] = x[4];
    data[5 * inc] = x[5];
    data[6 * inc] = x[6];
    data[7 * inc] = x[7];
}
/*
 * ParallelBitonic_B16: each work-item reorders sixteen records that are
 * inc = inc0 >> 3 apart, using B16V (compare distances 8*inc down to inc).
 *
 * Work-item t starts at i = ((t - low) << 4) + low with low = t & (inc - 1).
 * When inc is a power of two this is t with four 0 bits inserted at the inc
 * position.
 *
 *   result_count  not used by this kernel
 *   data          records to reorder in place
 *   inc0          largest compare distance handled here (8 * inc)
 *   dir           bit mask; ordering uses reverse = ((dir & i) == 0)
 *   aux           not used by this kernel
 *
 * Index arithmetic is unsigned, matching ParallelBitonic_B2: the left shift
 * cannot overflow a signed int for large element indices.
 */
__kernel void ParallelBitonic_B16(
    volatile __global uint32 *result_count,
    volatile __global record_t* data,
    volatile uint32 inc0,
    volatile uint32 dir,
    __local record_t* aux) {
    const uint32 inc = inc0 >> 3;
    const uint32 t = get_global_id(0);
    const uint32 low = t & (inc - 1);
    const uint32 i = ((t - low) << 4) + low;
    /* B16V reads `reverse` from this scope. */
    const bool reverse = ((dir & i) == 0);

    data += i;

    /*
     * x[k] = data[k * inc] for k = 0..15, written out one by one so every
     * index into x is a literal.
     */
    record_t x[16];
    x[0] = data[0 * inc];
    x[1] = data[1 * inc];
    x[2] = data[2 * inc];
    x[3] = data[3 * inc];
    x[4] = data[4 * inc];
    x[5] = data[5 * inc];
    x[6] = data[6 * inc];
    x[7] = data[7 * inc];
    x[8] = data[8 * inc];
    x[9] = data[9 * inc];
    x[10] = data[10 * inc];
    x[11] = data[11 * inc];
    x[12] = data[12 * inc];
    x[13] = data[13 * inc];
    x[14] = data[14 * inc];
    x[15] = data[15 * inc];

    B16V(x, 0);

    /* data[k * inc] = x[k] for k = 0..15, same explicit form as the loads. */
    data[0 * inc] = x[0];
    data[1 * inc] = x[1];
    data[2 * inc] = x[2];
    data[3 * inc] = x[3];
    data[4 * inc] = x[4];
    data[5 * inc] = x[5];
    data[6 * inc] = x[6];
    data[7 * inc] = x[7];
    data[8 * inc] = x[8];
    data[9 * inc] = x[9];
    data[10 * inc] = x[10];
    data[11 * inc] = x[11];
    data[12 * inc] = x[12];
    data[13 * inc] = x[13];
    data[14 * inc] = x[14];
    data[15 * inc] = x[15];
}
/*
 * ParallelBitonic_C4: runs several B4V stages in one launch, keeping the
 * intermediate records in local memory.
 *
 *   1. Load four records inc = inc0 >> 1 apart from data, apply B4V, store
 *      the result in aux.
 *   2. For inc/4, inc/16, ... while the stride is > 1: load from aux, apply
 *      B4V, store back to aux.
 *   3. Load four consecutive records from aux, apply B4V, store them to
 *      data[4*t .. 4*t + 3].
 *
 *   result_count  not used by this kernel
 *   data          records to reorder in place
 *   inc0          largest compare distance handled by step 1 (2 * inc)
 *   dir           bit mask; every stage uses reverse = ((dir & i) == 0)
 *   aux           local scratch, indexed with (index & wg_bits) where
 *                 wg_bits = 4 * get_local_size(0) - 1
 *                 NOTE(review): this acts as a wrap-around mask only when
 *                 4 * local size is a power of two, and presumably aux must
 *                 hold that many records - confirm against the host code.
 *
 * Index arithmetic is unsigned, matching ParallelBitonic_B2.
 */
__kernel void ParallelBitonic_C4(
    volatile __global uint32 *result_count,
    volatile __global record_t* data,
    volatile uint32 inc0,
    volatile uint32 dir,
    __local record_t* aux) {
    const uint32 t = get_global_id(0);
    const uint32 wg_bits = 4 * (uint32)get_local_size(0) - 1;
    uint32 inc = inc0 >> 1;
    uint32 low, i;
    bool reverse; /* read by B4V */
    record_t x[4];

    /* Step 1: global memory in, local memory out. */
    low = t & (inc - 1);
    i = ((t - low) << 2) + low;
    reverse = ((dir & i) == 0);
    for (int k = 0; k < 4; k++) x[k] = data[i + k * inc];
    B4V(x, 0);
    for (int k = 0; k < 4; k++) aux[(i + k * inc) & wg_bits] = x[k];
    barrier(CLK_LOCAL_MEM_FENCE);

    /*
     * Step 2: local memory in and out.
     * The loop starts at inc >> 2. Starting at inc would re-read the four
     * values this work-item just wrote and apply the same B4V with the same
     * `reverse`; their keys are already in the order B4V produces, so that
     * pass (and its two barriers) could not change the sorted result.
     */
    for (inc >>= 2; inc > 1; inc >>= 2) {
        low = t & (inc - 1);
        i = ((t - low) << 2) + low;
        reverse = ((dir & i) == 0);
        for (int k = 0; k < 4; k++) x[k] = aux[(i + k * inc) & wg_bits];
        B4V(x, 0);
        /* Everyone must finish reading aux before anyone overwrites it. */
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < 4; k++) aux[(i + k * inc) & wg_bits] = x[k];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Step 3: stride 1, local memory in, global memory out. */
    i = t << 2;
    reverse = ((dir & i) == 0);
    for (int k = 0; k < 4; k++) x[k] = aux[(i + k) & wg_bits];
    B4V(x, 0);
    for (int k = 0; k < 4; k++) data[i + k] = x[k];
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement