Untitled

uint64_t div(uint64_t a_lo, uint64_t a_hi, uint64_t b, uint64_t &r)
{
    uint64_t p_lo;
    uint64_t p_hi;
    uint64_t q = 0;

    auto r_hi = a_hi;
    auto r_lo = a_lo;

    int s = 0;
    if(0 == (b >> 63)){

        // Normalize so quotient estimates are
        // no more than 2 in error.

        // Note: If any bits get shifted out of
        // r_hi at this point, the result would
        // overflow.

        s = 63 - bsr(b);
        const auto t = 64 - s;

        b <<= s;
        r_hi = (r_hi << s)|(r_lo >> t);
        r_lo <<= t;
    }

    const auto b_hi = b >> 32;

    /*
    The first full-by-half division places b
    across r_hi and r_lo, making the reduction
    step a little complicated.

    To make this easier, u_hi and u_lo will hold
    a shifted image of the remainder.

    [u_hi||    ][u_lo||    ]
          [r_hi||    ][r_lo||    ]
                [ b  ||    ]
    [p_hi||    ][p_lo||    ]
                  |
                  V
                [q_hi||    ]
    */

    auto q_hat = r_hi / b_hi;

    p_lo = mul(b, q_hat, p_hi);

    const auto u_hi = r_hi >> 32;
    const auto u_lo = (r_hi << 32)|(r_lo >> 32);

    // r -= b*q_hat
    //
    // At most 2 iterations of this...
    while(
        (p_hi > u_hi) ||
        ((p_hi == u_hi) && (p_lo > u_lo))
        )
    {
        if(p_lo < b){
            --p_hi;
        }
        p_lo -= b;
        --q_hat;
    }

    auto w_lo = (p_lo << 32);
    auto w_hi = (p_hi << 32)|(p_lo >> 32);

    if(w_lo > r_lo){
        ++w_hi;
    }

    r_lo -= w_lo;
    r_hi -= w_hi;

    q = q_hat << 32;

    /*
    The lower half of the quotient is easier,
    as b is now aligned with r_lo.

          |r_hi][r_lo||    ]
                [ b  ||    ]
    [p_hi||    ][p_lo||    ]
                        |
                        V
                [q_hi||q_lo]
    */

    q_hat = ((r_hi << 32)|(r_lo >> 32)) / b_hi;

    p_lo = mul(b, q_hat, p_hi);

    // r -= b*q_hat
    //
    // ...and at most 2 iterations of this.
    while(
        (p_hi > r_hi) ||
        ((p_hi == r_hi) && (p_lo > r_lo))
        )
    {
        if(p_lo < b){
            --p_hi;
        }
        p_lo -= b;
        --q_hat;
    }

    r_lo -= p_lo;

    q |= q_hat;

    r = r_lo >> s;

    return q;
}

int bsr(uint64_t x)
{
    uint64_t y;
    uint64_t r;

    r = (x > 0xFFFFFFFF) << 5; x >>= r;
    y = (x > 0xFFFF    ) << 4; x >>= y; r |= y;
    y = (x > 0xFF      ) << 3; x >>= y; r |= y;
    y = (x > 0xF       ) << 2; x >>= y; r |= y;
    y = (x > 0x3       ) << 1; x >>= y; r |= y;

    return static_cast<int>(r | (x >> 1));
}

uint64_t mul(uint64_t a, uint64_t b, uint64_t &y)
{
    auto a_lo = a & 0x00000000FFFFFFFF;
    auto a_hi = a >> 32;

    auto b_lo = b & 0x00000000FFFFFFFF;
    auto b_hi = b >> 32;

    auto c0 = a_lo * b_lo;
    auto c1 = a_hi * b_lo;
    auto c2 = a_hi * b_hi;

    auto u1 = c1 + (a_lo * b_hi);
    if(u1 < _c1){
        c2 += 1LL << 32;
    }

    auto u0 = c0 + (u1 << 32);
    if(u0 < c0){
        ++c2;
    }

    y = c2 + (u1 >> 32);

    return u0;
}