Untitled

#include <array>
#include <cstdint>
#include <limits>
#include <cstddef>
#include <iostream>

//generic addition and subracton with carry / borrow used for example. Real version uses intrinsics if available.
unsigned char HP_Add(const unsigned char carry_in, const std::uint32_t a, const std::uint32_t b, std::uint32_t& result) noexcept
{
    std::uint64_t sum = static_cast<std::uint64_t>(a) + b + carry_in;
    result = static_cast<T>(sum);
    return static_cast<unsigned char>(sum >> 32);
}

unsigned char HP_Sub(const unsigned char borrow, const std::uint32_t a, const std::uint32_t b, std::uint32_t& result) noexcept
{
    result = a - b + borrow * std::numeric_limits<std::uint32_t>::max();
    return (a < b) || (a == b && static_cast<bool>(borrow));
}

//This is the single-precision version of the function I'm trying to implement.
std::uint32_t REDC(const std::uint32_t a, const std::uint32_t b, const std::uint32_t n, const std::uint32_t n_inv) noexcept
{
    assert(n % 2 == 1);
    using std::uint64_t;
    using std::uint32_t;
    const uint64_t ab = static_cast<uint64_t>(a) * b;
    const uint32_t t = static_cast<uint32_t>(ab);           //ab % 2^32
    const uint32_t m = t * n_inv;                           //(t * n_inv) % 2^32
    const uint64_t mn = static_cast<uint64_t>(m) * n;
    uint32_t u = static_cast<uint32_t>((ab - mn) >> 32);    //(ab - mn) / 2^32
    u += n * (ab < mn);

    return u;
}

//This is a multiprecision version of REDC found above^^^. It works but there is a key difference. The input n_inv
//must be different and instead of implementing (ab - mn) as seen above this version performs (ab + mn).
//I can't get a multiprecision version that performs (ab - mn) to make the proper output.
template <std::size_t S>
std::array<std::uint32_t, S> REDC(const std::array<std::uint32_t, S>& a, const std::array<std::uint32_t, S>& b, const std::array<std::uint32_t, S>& n, const std::uint32_t n_inv) noexcept
{
    using std::size_t; using std::uint32_t; using std::uint64_t;
    std::array<std::uint32_t, S> t = {};
    uint64_t buffer;
    uint32_t carry;
    uint32_t t1;
    uint32_t t2;

    for (size_t i = 0; i < S; ++i)
    {
        carry = 0;

        for (size_t j = 0; j < S; ++j)
        {
            buffer = static_cast<uint64_t>(a[j]) * b[i] + t[j] + carry;
            t[j] = static_cast<uint32_t>(buffer);
            carry = static_cast<uint32_t>(buffer >> 32);
        }

        buffer = static_cast<uint64_t>(t1) + carry;
        t1 = static_cast<uint32_t>(buffer);
        t2 = static_cast<uint32_t>(buffer >> 32);
        const uint32_t m = t[0] * n_inv;

        buffer = static_cast<uint64_t>(m) * n[0] + t[0];
        carry = static_cast<uint32_t>(buffer >> 32);

        for (size_t j = 1; j < S; ++j)
        {
            buffer = static_cast<uint64_t>(m) * n[j] + t[j] + carry;
            t[j - 1] = static_cast<uint32_t>(buffer);
            carry = static_cast<uint32_t>(buffer >> 32);
        }

        buffer = static_cast<uint64_t>(t1) + carry;
        t[S - 1] = static_cast<uint32_t>(buffer);
        t1 = t2 + static_cast<uint32_t>(buffer >> 32);
    }

    std::array<std::uint32_t, S> u;
    unsigned char borrow = 0;

    for (std::size_t i = 0; i < S; ++i)
        borrow = jaml::detail::HP_Sub(borrow, t[i], n[i], u[i]);
    borrow = jaml::detail::HP_Sub(borrow, t1, 0, t1);

    if (borrow)
        return t;
    else
        return u;
}

//This version attempts to implement (ab - mn). The output is close but wrong.
template <std::size_t S>
std::array<std::uint32_t, S> REDC2(const std::array<std::uint32_t, S>& a, const std::array<std::uint32_t, S>& b, const std::array<std::uint32_t, S>& n, const std::uint32_t n_inv) noexcept
{
    using std::size_t; using std::uint32_t; using std::uint64_t;
    std::array<std::uint32_t, S> t = {};
    uint64_t buffer;
    uint32_t carry;
    uint32_t t1;
    uint32_t t2;

    for (size_t i = 0; i < S; ++i)
    {
        carry = 0;

        for (size_t j = 0; j < S; ++j)
        {
            buffer = static_cast<uint64_t>(a[j]) * b[i] + t[j] + carry;
            t[j] = static_cast<uint32_t>(buffer);
            carry = static_cast<uint32_t>(buffer >> 32);
        }

        buffer = static_cast<uint64_t>(t1) + carry;
        t1 = static_cast<uint32_t>(buffer);
        t2 = static_cast<uint32_t>(buffer >> 32);
        const uint32_t m = t[0] * n_inv;

        buffer = static_cast<uint64_t>(m) * n[0];
        uint32_t low = static_cast<uint32_t>(buffer);
        unsigned char borrow = jaml::detail::HP_Sub(0, t[0], low, t[0]);
        carry = static_cast<uint32_t>(buffer >> 32);

        for (size_t j = 1; j < S; ++j)
        {
            buffer = static_cast<uint64_t>(m) * n[j] + carry;
            low = static_cast<uint32_t>(buffer);
            borrow = jaml::detail::HP_Sub(borrow, t[j], low, t[j - 1]);
            carry = static_cast<uint32_t>(buffer >> 32);
        }

        borrow = jaml::detail::HP_Sub(borrow, t1, carry, t[S - 1]);
        t1 = t2 - borrow;
    }

    std::array<std::uint32_t, S> u;
    unsigned char carry2 = 0;

    for (std::size_t i = 0; i < S; ++i)
        carry2 = jaml::detail::HP_Add(carry2, t[i], n[i], u[i]);
    carry2 = jaml::detail::HP_Add(carry2, t1, 0, t1);

    if (!carry2)
        return t;
    else
        return u;
}

int main()
{
    //test values
    std::array<std::uint32_t, 4> a = { 544677573, 0, 4454356, 0 };
    std::array<std::uint32_t, 4> b = { 355334, 0, 88196580, 0 };
    std::array<std::uint32_t, 4> c = { 696957799, 0, 427290, 0 };
    std::uint32_t n_inv1 = 3223639;
    std::uint32_t n_inv2 = 4291743657;

    //REDC2 output is close but wrong. Help me find the mistake?
     std::array<std::uint32_t> arr1 = REDC(a, b, c, n_inv2);
     std::array<std::uint32_t> arr2 = REDC2(a, b, c, n_inv1);
}