Untitled

// take an array of interleaved (x,y) pairs and computes fast_atan2(y,x) estimate on them.
// approximately 15-40x faster than a simple loop with atan2, depending on
// input buffer size.  Generated from vectorized output of fast_atan function.
//
// @param out    array to write output to
// @param in     input array containing interleaved pairs
// @param npair  number of input pairs to process
//
static inline void vatan2_avx(float* __restrict__ out, const float* __restrict__ in, ssize_t npair) {
    // compute how many iterations to do and remainder of pairs left to do manually
    size_t iters = npair/8;
    size_t rem   = npair-iters*8;

    // constants
    static const uint32_t signbit = 0x80000000;
    static const uint32_t posnan  = 0x7fffffff;
    static const float    one     = 1.0f;
    static const float    mpi_2   = M_PI_2;
    static const float    mpi     = M_PI;
    static const float    coefa   = -0.0464964749f;
    static const float    coefb   = +0.15931422f;
    static const float    coefc   = -0.327622764f;

    __asm__(
        // load constants
        "    vxorps  %%ymm8, %%ymm8, %%ymm8\n\t"  // ymm8 = 0
        "    vbroadcastss %[posnan], %%ymm9 \n\t" // abs() mask
        "    vbroadcastss %[mpi],    %%ymm10\n\t"
        "    vbroadcastss %[mpi_2],  %%ymm11\n\t"
        "    vbroadcastss %[one],    %%ymm12\n\t"
        "    vbroadcastss %[coefa],  %%ymm15\n\t"
        "    vbroadcastss %[coefc],  %%ymm13\n\t"
        "    vbroadcastss %[coefb],  %%ymm14\n\t"

        // setup indices, pointers
        "    mov %[in],  %%rax\n\t" // input pointer
        "    mov %[out], %%rcx\n\t" // output pointer
        "    xor %%r8d,  %%r8d\n\t" // r8 = 0

        ".p2align 4\n\t"
        ".LOOP%=:\n\t"
        // load bottom part of ymm0 and ymm1
        "    vmovups     (%%rax), %%ymm0\n\t"
        "    vmovups 0x20(%%rax), %%ymm1\n\t"

        // increment loop variables
        "    add     $0x01,  %%r8\n\t"  // r8  +=  1
        "    add     $0x40,  %%rax\n\t" // in  += 16
        "    add     $0x20,  %%rcx\n\t" // out +=  8

        // de-interleave x,y pairs into separate registers
        "    vshufps      $0x88, %%ymm1, %%ymm0, %%ymm3\n\t"
        "    vshufps      $0xdd, %%ymm1, %%ymm0, %%ymm0\n\t"
        "    vperm2f128   $0x03, %%ymm3, %%ymm3, %%ymm2\n\t"
        "    vperm2f128   $0x03, %%ymm0, %%ymm0, %%ymm1\n\t"
        "    vshufps      $0x44, %%ymm2, %%ymm3, %%ymm4\n\t"
        "    vshufps      $0xee, %%ymm2, %%ymm3, %%ymm2\n\t"
        "    vshufps      $0x44, %%ymm1, %%ymm0, %%ymm3\n\t"
        "    vshufps      $0xee, %%ymm1, %%ymm0, %%ymm1\n\t"
        "    vinsertf128  $0x01, %%xmm2, %%ymm4, %%ymm2\n\t"
        "    vinsertf128  $0x01, %%xmm1, %%ymm3, %%ymm3\n\t"

        // absolute values and zero check
        "    vandps       %%ymm9, %%ymm2, %%ymm4\n\t" // abs(x)
        "    vcmpeqps     %%ymm8, %%ymm2, %%ymm0\n\t" // x == 0?
        "    vandps       %%ymm9, %%ymm3, %%ymm6\n\t" // abs(y)
        "    vcmpeqps     %%ymm8, %%ymm3, %%ymm1\n\t" // y == 0?

        // compute argument a to polynomial
        "    vmaxps       %%ymm4, %%ymm6, %%ymm5\n\t" // max(abs(x), abs(y))
        "    vandps       %%ymm0, %%ymm1, %%ymm1\n\t" // x == 0 && y == 0
        "    vminps       %%ymm4, %%ymm6, %%ymm0\n\t" // min(abs(x), abs(y))
        "    vcmpltps     %%ymm6, %%ymm4, %%ymm4\n\t" // abs(x) < abs(y)
        "    vrcpps       %%ymm5, %%ymm7        \n\t" // compute 1/max(abs(x), abs(y))
        "    vmulps       %%ymm5, %%ymm7, %%ymm5\n\t"
        "    vcmpltps     %%ymm8, %%ymm2, %%ymm2\n\t" // x < 0

        // compute polynomial
        "    vmulps       %%ymm5, %%ymm7, %%ymm5\n\t"
        "    vaddps       %%ymm7, %%ymm7, %%ymm7\n\t"
        "    vsubps       %%ymm5, %%ymm7, %%ymm7\n\t"
        "    vmulps       %%ymm7, %%ymm0, %%ymm5\n\t"
        "    vmulps       %%ymm5, %%ymm5, %%ymm7\n\t"
        "    vmulps       %%ymm15,%%ymm7, %%ymm0\n\t"
        "    vaddps       %%ymm14,%%ymm0, %%ymm0\n\t"
        "    vmulps       %%ymm7, %%ymm0, %%ymm0\n\t"
        "    vaddps       %%ymm13,%%ymm0, %%ymm0\n\t"
        "    vmulps       %%ymm7, %%ymm0, %%ymm0\n\t"

        // finish up
        "    vcmpneqps    %%ymm8, %%ymm3, %%ymm7\n\t"
        "    vaddps       %%ymm12,%%ymm0, %%ymm0\n\t"
        "    vandps       %%ymm4, %%ymm7, %%ymm4\n\t"
        "    vandps       %%ymm2, %%ymm7, %%ymm2\n\t"
        "    vmulps       %%ymm5, %%ymm0, %%ymm0\n\t"
        "    vsubps       %%ymm0, %%ymm11,%%ymm5\n\t"
        "    vblendvps    %%ymm4, %%ymm5, %%ymm0, %%ymm0\n\t"
        "    vsubps       %%ymm0, %%ymm10,%%ymm5\n\t"
        "    vblendvps    %%ymm2, %%ymm5, %%ymm0, %%ymm0\n\t"
        "    vcmpleps     %%ymm3, %%ymm8, %%ymm2\n\t"
        "    vcmpltps     %%ymm8, %%ymm3, %%ymm3\n\t"
        "    vbroadcastss %[signbit], %%ymm8\n\t"
        "    vxorps       %%ymm8, %%ymm0, %%ymm4\n\t"
        "    vandps       %%ymm2, %%ymm7, %%ymm2\n\t"
        "    vandps       %%ymm3, %%ymm7, %%ymm7\n\t"
        "    vblendvps    %%ymm1, %%ymm8, %%ymm4, %%ymm1\n\t"
        "    vblendvps    %%ymm7, %%ymm4, %%ymm1, %%ymm1\n\t"
        "    vblendvps    %%ymm2, %%ymm0, %%ymm1, %%ymm1\n\t"

        // store to result
        "    vmovups     %%ymm1,-0x20(%%rcx)\n\t"

        // are we done?
        "    cmp    %[iters],%%r8\n\t"
        "    jb     .LOOP%=\n\t"
        VZU
        :
        : [posnan]  "m" (posnan),  [coefa] "m" (coefa),  [coefb] "m"  (coefb),
          [coefc]   "m" (coefc),   [one]   "m" (one),    [mpi_2] "m" (mpi_2),  [mpi]   "m"  (mpi),
          [signbit] "m" (signbit), [in]    "r" (in),     [out]   "r" (out),    [iters] "er" (iters)
        : MMREG(0), MMREG(1), MMREG(2),  MMREG(3),  MMREG(4),  MMREG(5),  MMREG(6),  MMREG(7),
          MMREG(8), MMREG(9), MMREG(10), MMREG(11), MMREG(12), MMREG(13), MMREG(14), MMREG(15),
          "rax", "rcx", "r8", "memory"
    );

    // finish remainder
    if (rem > 0) {
        in  += iters*16;
        out += iters*8;

        for (size_t ii=0; ii < rem; ii++) {
            out[ii] = fast_atan2(in[2*ii+1], in[2*ii+0]);
        }
    }
}