Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // take an array of interleaved (x,y) pairs and computes fast_atan2(y,x) estimate on them.
- // approximately 15-40x faster than a simple loop with atan2, depending on
- // input buffer size. Generated from vectorized output of fast_atan function.
- //
- // @param out array to write output to
- // @param in input array containing interleaved pairs
- // @param npair number of input pairs to process
- //
- static inline void vatan2_avx(float* __restrict__ out, const float* __restrict__ in, ssize_t npair) {
- // compute how many iterations to do and remainder of pairs left to do manually
- size_t iters = npair/8;
- size_t rem = npair-iters*8;
- // constants
- static const uint32_t signbit = 0x80000000;
- static const uint32_t posnan = 0x7fffffff;
- static const float one = 1.0f;
- static const float mpi_2 = M_PI_2;
- static const float mpi = M_PI;
- static const float coefa = -0.0464964749f;
- static const float coefb = +0.15931422f;
- static const float coefc = -0.327622764f;
- __asm__(
- // load constants
- " vxorps %%ymm8, %%ymm8, %%ymm8\n\t" // ymm8 = 0
- " vbroadcastss %[posnan], %%ymm9 \n\t" // abs() mask
- " vbroadcastss %[mpi], %%ymm10\n\t"
- " vbroadcastss %[mpi_2], %%ymm11\n\t"
- " vbroadcastss %[one], %%ymm12\n\t"
- " vbroadcastss %[coefa], %%ymm15\n\t"
- " vbroadcastss %[coefc], %%ymm13\n\t"
- " vbroadcastss %[coefb], %%ymm14\n\t"
- // setup indices, pointers
- " mov %[in], %%rax\n\t" // input pointer
- " mov %[out], %%rcx\n\t" // output pointer
- " xor %%r8d, %%r8d\n\t" // r8 = 0
- ".p2align 4\n\t"
- ".LOOP%=:\n\t"
- // load bottom part of ymm0 and ymm1
- " vmovups (%%rax), %%ymm0\n\t"
- " vmovups 0x20(%%rax), %%ymm1\n\t"
- // increment loop variables
- " add $0x01, %%r8\n\t" // r8 += 1
- " add $0x40, %%rax\n\t" // in += 16
- " add $0x20, %%rcx\n\t" // out += 8
- // de-interleave x,y pairs into separate registers
- " vshufps $0x88, %%ymm1, %%ymm0, %%ymm3\n\t"
- " vshufps $0xdd, %%ymm1, %%ymm0, %%ymm0\n\t"
- " vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm2\n\t"
- " vperm2f128 $0x03, %%ymm0, %%ymm0, %%ymm1\n\t"
- " vshufps $0x44, %%ymm2, %%ymm3, %%ymm4\n\t"
- " vshufps $0xee, %%ymm2, %%ymm3, %%ymm2\n\t"
- " vshufps $0x44, %%ymm1, %%ymm0, %%ymm3\n\t"
- " vshufps $0xee, %%ymm1, %%ymm0, %%ymm1\n\t"
- " vinsertf128 $0x01, %%xmm2, %%ymm4, %%ymm2\n\t"
- " vinsertf128 $0x01, %%xmm1, %%ymm3, %%ymm3\n\t"
- // absolute values and zero check
- " vandps %%ymm9, %%ymm2, %%ymm4\n\t" // abs(x)
- " vcmpeqps %%ymm8, %%ymm2, %%ymm0\n\t" // x == 0?
- " vandps %%ymm9, %%ymm3, %%ymm6\n\t" // abs(y)
- " vcmpeqps %%ymm8, %%ymm3, %%ymm1\n\t" // y == 0?
- // compute argument a to polynomial
- " vmaxps %%ymm4, %%ymm6, %%ymm5\n\t" // max(abs(x), abs(y))
- " vandps %%ymm0, %%ymm1, %%ymm1\n\t" // x == 0 && y == 0
- " vminps %%ymm4, %%ymm6, %%ymm0\n\t" // min(abs(x), abs(y))
- " vcmpltps %%ymm6, %%ymm4, %%ymm4\n\t" // abs(x) < abs(y)
- " vrcpps %%ymm5, %%ymm7 \n\t" // compute 1/max(abs(x), abs(y))
- " vmulps %%ymm5, %%ymm7, %%ymm5\n\t"
- " vcmpltps %%ymm8, %%ymm2, %%ymm2\n\t" // x < 0
- // compute polynomial
- " vmulps %%ymm5, %%ymm7, %%ymm5\n\t"
- " vaddps %%ymm7, %%ymm7, %%ymm7\n\t"
- " vsubps %%ymm5, %%ymm7, %%ymm7\n\t"
- " vmulps %%ymm7, %%ymm0, %%ymm5\n\t"
- " vmulps %%ymm5, %%ymm5, %%ymm7\n\t"
- " vmulps %%ymm15,%%ymm7, %%ymm0\n\t"
- " vaddps %%ymm14,%%ymm0, %%ymm0\n\t"
- " vmulps %%ymm7, %%ymm0, %%ymm0\n\t"
- " vaddps %%ymm13,%%ymm0, %%ymm0\n\t"
- " vmulps %%ymm7, %%ymm0, %%ymm0\n\t"
- // finish up
- " vcmpneqps %%ymm8, %%ymm3, %%ymm7\n\t"
- " vaddps %%ymm12,%%ymm0, %%ymm0\n\t"
- " vandps %%ymm4, %%ymm7, %%ymm4\n\t"
- " vandps %%ymm2, %%ymm7, %%ymm2\n\t"
- " vmulps %%ymm5, %%ymm0, %%ymm0\n\t"
- " vsubps %%ymm0, %%ymm11,%%ymm5\n\t"
- " vblendvps %%ymm4, %%ymm5, %%ymm0, %%ymm0\n\t"
- " vsubps %%ymm0, %%ymm10,%%ymm5\n\t"
- " vblendvps %%ymm2, %%ymm5, %%ymm0, %%ymm0\n\t"
- " vcmpleps %%ymm3, %%ymm8, %%ymm2\n\t"
- " vcmpltps %%ymm8, %%ymm3, %%ymm3\n\t"
- " vbroadcastss %[signbit], %%ymm8\n\t"
- " vxorps %%ymm8, %%ymm0, %%ymm4\n\t"
- " vandps %%ymm2, %%ymm7, %%ymm2\n\t"
- " vandps %%ymm3, %%ymm7, %%ymm7\n\t"
- " vblendvps %%ymm1, %%ymm8, %%ymm4, %%ymm1\n\t"
- " vblendvps %%ymm7, %%ymm4, %%ymm1, %%ymm1\n\t"
- " vblendvps %%ymm2, %%ymm0, %%ymm1, %%ymm1\n\t"
- // store to result
- " vmovups %%ymm1,-0x20(%%rcx)\n\t"
- // are we done?
- " cmp %[iters],%%r8\n\t"
- " jb .LOOP%=\n\t"
- VZU
- :
- : [posnan] "m" (posnan), [coefa] "m" (coefa), [coefb] "m" (coefb),
- [coefc] "m" (coefc), [one] "m" (one), [mpi_2] "m" (mpi_2), [mpi] "m" (mpi),
- [signbit] "m" (signbit), [in] "r" (in), [out] "r" (out), [iters] "er" (iters)
- : MMREG(0), MMREG(1), MMREG(2), MMREG(3), MMREG(4), MMREG(5), MMREG(6), MMREG(7),
- MMREG(8), MMREG(9), MMREG(10), MMREG(11), MMREG(12), MMREG(13), MMREG(14), MMREG(15),
- "rax", "rcx", "r8", "memory"
- );
- // finish remainder
- if (rem > 0) {
- in += iters*16;
- out += iters*8;
- for (size_t ii=0; ii < rem; ii++) {
- out[ii] = fast_atan2(in[2*ii+1], in[2*ii+0]);
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement