ethash.cl nvidia 3050 LHR possible unlock

/*
Let me know if it works
Happy to accept donations to 0x5a5db084cc2ac543d39961a12926be7cfa02913a
(Nicehash ethereum wallet) 36,000 recommended gas limit.
I'm getting improvements on my Radeon rt 5500 xt sapphire pulse,
Went up from 24Mh/s to 39-42MH/s, But don't have a LHR card to test on.

Cheers,
image28/imagekiwi

p.s could improve dag generation speed but wouldn't be worth the effort right now
Dag compression would be more interesting. The unlock works by removing unions which use the
largest variable type in the union 64bit, which has the speed clamped on LHR gpu's, some other LHR
gpu's also clamp 16bit performance, so I adjusted most of the code to work on 32bit values.
Still room for more improvements.
*/

#define OPENCL_PLATFORM_UNKNOWN 0
#define OPENCL_PLATFORM_AMD 1
#define OPENCL_PLATFORM_CLOVER 2
#define OPENCL_PLATFORM_NVIDIA 3
#define OPENCL_PLATFORM_INTEL 4
/*
#define WORKSIZE 256
#define MAX_OUTPUTS 4
#define ACCESSES 64
#define PLATFORM 1
#define COMPUTE 1
*/

#define CHUNKSIZE 64

#ifdef cl_clang_storage_class_specifiers
#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif

#if defined(cl_amd_media_ops)
    #if PLATFORM == OPENCL_PLATFORM_CLOVER
        uint2 amd_bitalign(uint2 src0, uint2 src1, uint2 src2)
        {
            uint2 dst;
            __asm(
                "v_alignbit_b32 %0, %2, %3, %4\n"
                "v_alignbit_b32 %1, %5, %6, %7"
                : "=v"(dst.x), "=v"(dst.y)
                : "v"(src0.x), "v"(src1.x), "v"(src2.x), "v"(src0.y), "v"(src1.y), "v"(src2.y));
            return dst;
        }
    #endif

    #pragma OPENCL EXTENSION cl_amd_media_ops : enable
#elif defined(cl_nv_pragma_unroll)
    uint amd_bitalign(uint src0, uint src1, uint src2)
    {
        uint dest;
        asm("shf.r.wrap.b32 %0, %2, %1, %3;" : "=r"(dest) : "r"(src0), "r"(src1), "r"(src2));
        return dest;
    }
#else
    #define amd_bitalign(src0, src1, src2) \
        ((uint)(((((ulong)(src0)) << 32) | (ulong)(src1)) >> ((src2)&31)))
#endif

#define EndianSwap(n) (rotate(n & 0x00FF00FF, 24U)|(rotate(n, 8U) & 0x00FF00FF)

#if WORKSIZE % 4 != 0
    #error "WORKSIZE has to be a multiple of 4"
#endif

#define FNV_PRIME 0x01000193U // 2^24+403

static __constant uint2 const Keccak_f1600_RC[24] = {
    (uint2)(0x00000001, 0x00000000),
    (uint2)(0x00008082, 0x00000000),
    (uint2)(0x0000808a, 0x80000000),
    (uint2)(0x80008000, 0x80000000),
    (uint2)(0x0000808b, 0x00000000),
    (uint2)(0x80000001, 0x00000000),
    (uint2)(0x80008081, 0x80000000),
    (uint2)(0x00008009, 0x80000000),
    (uint2)(0x0000008a, 0x00000000),
    (uint2)(0x00000088, 0x00000000),
    (uint2)(0x80008009, 0x00000000),
    (uint2)(0x8000000a, 0x00000000),
    (uint2)(0x8000808b, 0x00000000),
    (uint2)(0x0000008b, 0x80000000),
    (uint2)(0x00008089, 0x80000000),
    (uint2)(0x00008003, 0x80000000),
    (uint2)(0x00008002, 0x80000000),
    (uint2)(0x00000080, 0x80000000),
    (uint2)(0x0000800a, 0x00000000),
    (uint2)(0x8000000a, 0x80000000),
    (uint2)(0x80008081, 0x80000000),
    (uint2)(0x00008080, 0x80000000),
    (uint2)(0x80000001, 0x00000000),
    (uint2)(0x80008008, 0x80000000),
};

#ifdef cl_amd_media_ops
    #define ROTL32_1(x, y) amd_bitalign((x), (x).s10, 32 - (y))
    #define ROTL32_2(x, y) amd_bitalign((x).s10, (x), 32 - (y))
#else
    #define ROTL32_1(x, y) rotate(x, y)
    #define ROTL32_2(x, y) ROTL32_1(x, (y) + 32)
#endif

#define KECCAKF_1600_RND(a, i, outsz)                                      \
    do                                                                     \
    {                                                                      \
        const uint2 m0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] ^             \
                         ROTL32_1(a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22], 1); \
        const uint2 m1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] ^             \
                         ROTL32_1(a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23], 1); \
        const uint2 m2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] ^             \
                         ROTL32_1(a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24], 1); \
        const uint2 m3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] ^             \
                         ROTL32_1(a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20], 1); \
        const uint2 m4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] ^             \
                         ROTL32_1(a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21], 1); \
                                                                           \
        const uint2 tmp = a[1] ^ m0;                                       \
                                                                           \
        a[0] ^= m4;                                                        \
        a[5] ^= m4;                                                        \
        a[10] ^= m4;                                                       \
        a[15] ^= m4;                                                       \
        a[20] ^= m4;                                                       \
                                                                           \
        a[6] ^= m0;                                                        \
        a[11] ^= m0;                                                       \
        a[16] ^= m0;                                                       \
        a[21] ^= m0;                                                       \
                                                                           \
        a[2] ^= m1;                                                        \
        a[7] ^= m1;                                                        \
        a[12] ^= m1;                                                       \
        a[17] ^= m1;                                                       \
        a[22] ^= m1;                                                       \
                                                                           \
        a[3] ^= m2;                                                        \
        a[8] ^= m2;                                                        \
        a[13] ^= m2;                                                       \
        a[18] ^= m2;                                                       \
        a[23] ^= m2;                                                       \
                                                                           \
        a[4] ^= m3;                                                        \
        a[9] ^= m3;                                                        \
        a[14] ^= m3;                                                       \
        a[19] ^= m3;                                                       \
        a[24] ^= m3;                                                       \
                                                                           \
        a[1] = ROTL32_2(a[6], 12);                                         \
        a[6] = ROTL32_1(a[9], 20);                                         \
        a[9] = ROTL32_2(a[22], 29);                                        \
        a[22] = ROTL32_2(a[14], 7);                                        \
        a[14] = ROTL32_1(a[20], 18);                                       \
        a[20] = ROTL32_2(a[2], 30);                                        \
        a[2] = ROTL32_2(a[12], 11);                                        \
        a[12] = ROTL32_1(a[13], 25);                                       \
        a[13] = ROTL32_1(a[19], 8);                                        \
        a[19] = ROTL32_2(a[23], 24);                                       \
        a[23] = ROTL32_2(a[15], 9);                                        \
        a[15] = ROTL32_1(a[4], 27);                                        \
        a[4] = ROTL32_1(a[24], 14);                                        \
        a[24] = ROTL32_1(a[21], 2);                                        \
        a[21] = ROTL32_2(a[8], 23);                                        \
        a[8] = ROTL32_2(a[16], 13);                                        \
        a[16] = ROTL32_2(a[5], 4);                                         \
        a[5] = ROTL32_1(a[3], 28);                                         \
        a[3] = ROTL32_1(a[18], 21);                                        \
        a[18] = ROTL32_1(a[17], 15);                                       \
        a[17] = ROTL32_1(a[11], 10);                                       \
        a[11] = ROTL32_1(a[7], 6);                                         \
        a[7] = ROTL32_1(a[10], 3);                                         \
        a[10] = ROTL32_1(tmp, 1);                                          \
                                                                           \
        uint2 m5 = a[0];                                                   \
        uint2 m6 = a[1];                                                   \
        a[0] = bitselect(a[0] ^ a[2], a[0], a[1]);                         \
        a[0] ^= as_uint2(Keccak_f1600_RC[i]);                              \
        if (outsz > 1)                                                     \
        {                                                                  \
            a[1] = bitselect(a[1] ^ a[3], a[1], a[2]);                     \
            a[2] = bitselect(a[2] ^ a[4], a[2], a[3]);                     \
            a[3] = bitselect(a[3] ^ m5, a[3], a[4]);                       \
            a[4] = bitselect(a[4] ^ m6, a[4], m5);                         \
            if (outsz > 4)                                                 \
            {                                                              \
                m5 = a[5];                                                 \
                m6 = a[6];                                                 \
                a[5] = bitselect(a[5] ^ a[7], a[5], a[6]);                 \
                a[6] = bitselect(a[6] ^ a[8], a[6], a[7]);                 \
                a[7] = bitselect(a[7] ^ a[9], a[7], a[8]);                 \
                a[8] = bitselect(a[8] ^ m5, a[8], a[9]);                   \
                a[9] = bitselect(a[9] ^ m6, a[9], m5);                     \
                if (outsz > 8)                                             \
                {                                                          \
                    m5 = a[10];                                            \
                    m6 = a[11];                                            \
                    a[10] = bitselect(a[10] ^ a[12], a[10], a[11]);        \
                    a[11] = bitselect(a[11] ^ a[13], a[11], a[12]);        \
                    a[12] = bitselect(a[12] ^ a[14], a[12], a[13]);        \
                    a[13] = bitselect(a[13] ^ m5, a[13], a[14]);           \
                    a[14] = bitselect(a[14] ^ m6, a[14], m5);              \
                    m5 = a[15];                                            \
                    m6 = a[16];                                            \
                    a[15] = bitselect(a[15] ^ a[17], a[15], a[16]);        \
                    a[16] = bitselect(a[16] ^ a[18], a[16], a[17]);        \
                    a[17] = bitselect(a[17] ^ a[19], a[17], a[18]);        \
                    a[18] = bitselect(a[18] ^ m5, a[18], a[19]);           \
                    a[19] = bitselect(a[19] ^ m6, a[19], m5);              \
                    m5 = a[20];                                            \
                    m6 = a[21];                                            \
                    a[20] = bitselect(a[20] ^ a[22], a[20], a[21]);        \
                    a[21] = bitselect(a[21] ^ a[23], a[21], a[22]);        \
                    a[22] = bitselect(a[22] ^ a[24], a[22], a[23]);        \
                    a[23] = bitselect(a[23] ^ m5, a[23], a[24]);           \
                    a[24] = bitselect(a[24] ^ m6, a[24], m5);              \
                }                                                          \
            }                                                              \
        }                                                                  \
    } while (0)


#define KECCAK_PROCESS(st, in_size, out_size)    \
    do                                           \
    {                                            \
        short r=0;                               \
        short os=0;                              \
                                                 \
        do                                       \
        {                                        \
            os=(r < 23 ? 25 : (out_size));       \
            KECCAKF_1600_RND(st, r, os);         \
            r++;                                 \
        }while(r<24);                            \
    } while (0)

#define fnv(x, y) ((x)*FNV_PRIME ^ (y))
#define fnv_reduce(v) fnv(fnv(fnv(v.x, v.y), v.z), v.w)

#define MIX(x)                                                          \
    do                                                                  \
    {                                                                   \
        *(local_buffer) = fnv(init0 ^ (a + x), *(imix+x)) % dag_size;   \
        mix = fnv(mix, g_dag_uint[(buffer[lane_idx]*4)+ids[1]]);        \
        mem_fence(CLK_LOCAL_MEM_FENCE);                                 \
    } while (0)

// NOTE: This struct must match the one defined in CLMiner.cpp
struct __attribute__((packed)) __attribute__((aligned(128))) SearchResults
{
    uint count;
    uint hashCount;
    volatile uint abort;
    uint gid[MAX_OUTPUTS];
};

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(
    __global struct SearchResults* g_output, __constant uint2 const* g_header,
    __global ulong8 const* _g_dag0, __global ulong8 const* _g_dag1, uint dag_size,
    ulong start_nonce, ulong target)
{
    if (g_output->abort)
        return;

    const ushort ids[] = {          \
    (ushort) get_local_id(0),       \
    (ushort)(get_local_id(0) & 3),      \
    (ushort)(get_local_id(0) >> 2),     \
    (ushort)(get_local_id(0) >> 2 << 2)};
    const uint gid = get_global_id(0);
    __global uint8 const* g_dag_uint = (__global uint8 const*)_g_dag0;

    __local uint sharebuf[1024];
    __local uint buffer[256];
    __local uint *local_buffer=&buffer[ids[0]];
    __local ulong8 *ulong8_buffer=&sharebuf[ids[2]*16]; // (write buffer 64 bytes )
    __local ulong4 *ulong4_buffer=&sharebuf[ids[2]*16]; // (read buffer 32 bytes )
    __local uint8 *uint8_buffer=(uint)&sharebuf[ids[2]*16]; // (read buffer 32 bytes )
    __local uint2 *uint2_buffer=(uint)&sharebuf[ids[2]*16]; // (write buffer 16 bytes )
    __local uint *uint_buffer=(uint)&sharebuf[ids[2]*16]; // ( read buffer )

    uint2 state[25];
    ulong8 *convert=&state;
    ulong4 *convert2=&state;
    uchar a,x,lane;
    char tid=0;
    uint init0;
    uint8 mix;
    uint *imix=&mix;

    *(convert)=(ulong8)(0);
    *(convert+1)=*(convert);
    *(convert+2)=*(convert);
    state[0] = g_header[0];
    state[1] = g_header[1];
    state[2] = g_header[2];
    state[3] = g_header[3];
    state[4] = as_uint2(start_nonce + gid);
    state[5] = as_uint2(0x0000000000000001UL);
    state[8] = as_uint2(0x8000000000000000UL);
    state[24] = state[23];

    KECCAK_PROCESS(state,5, 8);

    for(tid=0; tid < 4; tid++)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        if ( ids[1] == tid-1 )
            *(convert2+2) = *(ulong4_buffer);

        if ( tid == ids[1])
            *(ulong8_buffer)=*(convert);
        barrier(CLK_LOCAL_MEM_FENCE);

        mix = *(uint8_buffer+(ids[1]&1));
        init0 = *(uint_buffer);

        a=0;lane=0;
        barrier(CLK_LOCAL_MEM_FENCE);
        #pragma unroll 1
        do
        {
            const uchar lane_idx = ids[3] + lane;
            #pragma unroll 8
            for (x = 0; x < 8; ++x)
                MIX(x);

            lane=(lane+1)&3;
            a += 8;
        }while(a < ACCESSES);

        barrier(CLK_LOCAL_MEM_FENCE);
        *(uint2_buffer+ids[1]) = (uint2)(fnv_reduce(mix.lo), fnv_reduce(mix.hi));
    }

    if ( ids[1] == 3 )
        *(convert2+2) = *(ulong4_buffer);

    *(convert2+3)=(ulong4)(0);
    *(convert2+4)=*(convert2+3);
    *(convert2+5)=*(convert2+3);
    state[12] = as_uint2(0x0000000000000001UL);
    state[16] = as_uint2(0x8000000000000000UL);
    state[24] = state[23];

    KECCAK_PROCESS(state,12, 1);

    if (get_local_id(0) == 0)
    {
        atomic_inc(&g_output->hashCount);
    }

    // weird
    if (as_ulong(as_uchar8(state[0]).s76543210) <= target)
    {
        atomic_inc(&g_output->abort);
        uint slot = min(MAX_OUTPUTS - 1u, atomic_inc(&g_output->count));
        g_output->gid[slot] = gid;
    }
}

typedef union _Node
{
    uint dwords[16];
    uint2 qwords[8];
    uint4 dqwords[4];
} Node;

static void SHA3_512(uint2* s)
{
    uint2 state[25];
    ulong8 *convert=&state;
    ulong8 *result=s;

    *(convert)=*(result);
    *(convert+1)=(ulong8)(0);
    *(convert+2)=(ulong8)(0);
    state[8] = (uint2)(0x00000001, 0x80000000);
    state[24] = (uint2)(0);

    KECCAK_PROCESS(state,8, 8);

    *(result)=*(convert);
}

__kernel void GenerateDAG(uint start, __global const uint16* _Cache, __global uint16* _DAG0,
    __global uint16* _DAG1, uint light_size)
{
    __global const Node* Cache = (__global const Node*)_Cache;
    const uint gid = get_global_id(0);
    uint NodeIdx = start + gid;
    const uint thread_id = gid & 3;

    __local Node sharebuf[WORKSIZE];
    __local uint indexbuf[WORKSIZE];
    __local Node* dagNode = sharebuf + (get_local_id(0) / 4) * 4;
    __local uint* indexes = indexbuf + (get_local_id(0) / 4) * 4;
    __global const Node* parentNode;

    Node DAGNode = Cache[NodeIdx % light_size];

    DAGNode.dwords[0] ^= NodeIdx;
    SHA3_512(DAGNode.qwords);

    dagNode[thread_id] = DAGNode;
    barrier(CLK_LOCAL_MEM_FENCE);

    for (uint i = 0; i < 256; ++i)
    {
        uint ParentIdx = fnv(NodeIdx ^ i, dagNode[thread_id].dwords[i & 15]) % light_size;
        indexes[thread_id] = ParentIdx;
        barrier(CLK_LOCAL_MEM_FENCE);

        for (uint t = 0; t < 4; ++t)
        {
            uint parentIndex = indexes[t];
            parentNode = Cache + parentIndex;

            dagNode[t].dqwords[thread_id] = fnv(dagNode[t].dqwords[thread_id], parentNode->dqwords[thread_id]);
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }
    DAGNode = dagNode[thread_id];

    SHA3_512(DAGNode.qwords);

    __global Node* DAG;
    DAG = (__global Node *) _DAG0;
    DAG[NodeIdx] = DAGNode;
}