Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda.h" // check for dups ...
- #include "cuda_runtime.h"
- #include <iostream>
- #include "device_launch_parameters.h"
- #include <stdint.h> // check for dup in cpp, see if it matters or is necessary
- #include <inttypes.h> // same check for dups
- #include <stdio.h> // ditto
- #include <time.h>
- #include <sys/timeb.h> // TODO: double-check linux compatibility for future port/build, should be fine
- #include "Windows.h"
- #include <string>
#define CUDA_HOWMANY_DEVICES 2   // number of devices actually driven (hard-coded for now)
#define CUDA_MAX_DEVICES 16      // upper bound used to size every per-device array below
using namespace std;
extern bool gSolnFound[CUDA_MAX_DEVICES];       // set when device i finds a solution
extern string gStr_Hashrate[CUDA_MAX_DEVICES];  // per-device hashrate string for HUD display
// TODO: initialization, including nullptr ?
bool gSolving = false; // if set to false, solving stops, then CUDA shutdown
// Per-device device-side solution slot (one uint64_t each, cudaMalloc'd in
// cuda_allocate_memory()). BUG FIX: this was declared as a single pointer
// (`uint64_t* d_solution;`), but every use site indexes it per device
// (d_solution[0], d_solution[1] in allocation, kernel launch and free), so it
// must be an array of per-device pointers.
uint64_t* d_solution[CUDA_MAX_DEVICES] = { nullptr };
uint8_t* d_challenge;    // NOTE(review): unused in this file -- presumably set up elsewhere; confirm
uint8_t* d_hash_prefix;  // NOTE(review): unused in this file -- leftover from 0xbitcoin-miner?
__constant__ uint64_t d_mid[CUDA_MAX_DEVICES][25]; // per-device Keccak midstate (constant memory, set via cudaMemcpyToSymbol)
__constant__ uint64_t d_target[CUDA_MAX_DEVICES];  // per-device difficulty target
__constant__ uint32_t threads[CUDA_MAX_DEVICES];   // NOTE(review): shadowed by a local `threads` in handle_kernels(); never read here
extern uint8_t gSolution[CUDA_MAX_DEVICES][32]; // stores sol'n after a successful solve. (TODO: sol'ns queue)
// see equivalent (actual) declaration in main_includes.h
extern double cuda_solvetime[CUDA_MAX_DEVICES]; // stores time taken this solve
uint64_t printable_hashrate_cnt[CUDA_MAX_DEVICES] = { 0 }; // hashes attempted since last reset (per device)
// function prototypes
extern void print_bytes(uint8_t inArray[], uint8_t len); // defined in C/C++ code compiled by non-CUDA compiler
extern void EnqueueSolution(); // defined in comms_pool.h
int cuda_shutdown(); // later in this file
// 64-bit rotations (XOR of the two shifted halves is equivalent to OR here since the bit sets are disjoint).
// NOTE(review): undefined behavior if y == 0 (shift by 64) -- all current call sites use y in [1,63].
#define ROTL64(x, y) (((x) << (y)) ^ ((x) >> (64 - (y))))
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#define DEF_CUDA_DEVICE_INDEX_DEFAULT 0
#define DEF_CUDA_DEVICE_INTENSITY 24
cudaDeviceProp cdp_chosenDevice[CUDA_MAX_DEVICES]; // TODO: initialize with appropriate null
uint8_t gCudaDeviceIntensities[CUDA_MAX_DEVICES] = { 0 };
int device_count = 0; // number of CUDA devices reported by the runtime (see cuda_device_init)
extern string gpuName[CUDA_MAX_DEVICES];
extern string gStr_HashCount[CUDA_MAX_DEVICES];
int gCuda_Device0Intensity = DEF_CUDA_DEVICE_INTENSITY; // Should be overwritten by real setting (config file, OptionsForm.)
uint64_t cudaDeviceClockSpeed[CUDA_MAX_DEVICES] = { 0 }; // TODO: an array of devices
uint8_t cudaDeviceComputeCapability[CUDA_MAX_DEVICES] = { 0 }; // NOTE(review): major*100+minor*10 (e.g. 610) overflows uint8_t -- see cuda_device_init
bool cudaDeviceIsInitialized[CUDA_MAX_DEVICES] = { 0 }; // true once cuda_device_init has set up device i
uint32_t numThreads[CUDA_MAX_DEVICES] = { 0 };
uint64_t theCounter = 0; // global nonce counter (TODO: multiple devices, independent or unified counter?)
uint64_t printableHashrateCounter[CUDA_MAX_DEVICES] = { 0 }; // ^^
uint64_t printableCounter[CUDA_MAX_DEVICES] = { 0 }; // ^^
uint64_t print_timer[CUDA_MAX_DEVICES] = { 0 }; // TODO: rename these- they're too confusingly similar ;)
extern uint8_t solution[CUDA_MAX_DEVICES][32];
// TODO: ** we need to reset to UINT64_MAX each time we get a sol'n! IMPT Reminder !! **
extern uint64_t* h_message[CUDA_MAX_DEVICES]; // pinned host mirror of d_solution (cudaMallocHost'd in cuda_allocate_memory)
struct timeb tStart[CUDA_MAX_DEVICES], tEnd[CUDA_MAX_DEVICES]; // per-device round timing
// TODO/FIXME: This shouldn't be necessary- work out make options which don't require it
//#include "cuda_funcs.h"
//
// CUDACALLCHECKERROR: Inspects the cudaError returned by a CUDA runtime call.
// Prints the outcome and returns 0 on success or 1 on failure, so callers can
// accumulate the return values into an error counter.
uint8_t cudaCallCheckError(cudaError theResult)
{
	if (theResult != cudaSuccess)
	{
		printf("Error.\nCUDA Result: %s \n", cudaGetErrorString(theResult));
		return 1; // for error counters
	}
	printf("Successful!\n");
	return 0; // no error
}
// BSWAP_64: Reverses the byte order of a 64-bit value (endianness flip) in PTX.
// Each prmt.b32 with selector 0x0123 reverses the 4 bytes of one 32-bit half;
// repacking the halves in swapped order ({ol,oh}) completes the 8-byte swap.
// NOTE(review): the register names are misleading -- `mov.b64 {oh,ol}, %0`
// unpacks the LOW word into `oh` and the HIGH word into `ol` (the first brace
// element is the low half). The net result is still a correct full byte swap.
__device__ __forceinline__
uint64_t bswap_64(uint64_t input)
{
	asm("{"
		"  .reg .u32 oh, ol;"
		"  mov.b64 {oh,ol}, %0;"
		"  prmt.b32 oh, oh, 0, 0x0123;"
		"  prmt.b32 ol, ol, 0, 0x0123;"
		"  mov.b64 %0, {ol,oh};"
		"}" : "+l"(input));
	return input;
}
// ROTL64ASM: 64-bit rotate-left using the sm_32+ funnel-shift instruction
// (shf.l.wrap), falling back to the shift-based ROTL64 macro on older arches.
// The two shf.l.wrap instructions each produce one 32-bit half of the rotated
// result from the concatenated halves of the input.
// NOTE(review): register names `hi`/`lo` are swapped relative to what they hold
// (mov.b64 {hi,lo} puts the LOW word in `hi`); the instruction pairing
// compensates, so the result is a correct rotate. Appears to assume
// 0 < magnitude < 32 -- all call sites in this file comply; TODO confirm.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTL64asm(uint64_t input, uint32_t magnitude)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, mag, scr;"
		"mov.b32 mag, %1;"
		"mov.b64 {hi,lo}, %0;"
		"shf.l.wrap.b32 scr, lo, hi, mag;"
		"shf.l.wrap.b32 lo, hi, lo, mag;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input) : "r"(magnitude));
	return input;
#else
	return ROTL64(input, magnitude);
#endif
}
// ROTR64ASM: 64-bit rotate-right via sm_32+ funnel shift (shf.r.wrap), with a
// shift-based ROTR64 fallback for older architectures. Mirror image of
// ROTL64asm; the same register-naming caveat applies, and it likewise appears
// to assume 0 < magnitude < 32 (all call sites comply; TODO confirm).
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTR64asm(uint64_t input, uint32_t magnitude)
{
	// TODO/FIXME: verify correct version is running on multiple arches
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, mag, scr;"
		"mov.b32 mag, %1;"
		"mov.b64 {hi,lo}, %0;"
		"shf.r.wrap.b32 scr, hi, lo, mag;"
		"shf.r.wrap.b32 lo, lo, hi, mag;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input) : "r"(magnitude));
	return input;
#else
	return ROTR64(input, magnitude);
#endif
}
// XOR5: Five-way 64-bit XOR (the Keccak theta column parity), expressed as four
// chained xor.b64 instructions accumulating b..e into a.
__device__ __forceinline__
uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
{
	// a ^= b; a ^= c; a ^= d; a ^= e;
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(b));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(c));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(d));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(e));
	return a;
}
// ROTLFROM32: Computes ROTL64(input, 32 + magnitude) in one step: unpacking the
// halves in swapped order ({lo,hi}) performs the free rotate-by-32, then the
// funnel-shift pair rotates left by `magnitude`. Used in the keccak() rho step
// where the total rotation exceeds 32 (e.g. rot 36 = 32 + 4).
// shortcut to rotation by 32 (flip halves), then rotate left by `mag`
__device__ __forceinline__
uint64_t ROTLfrom32(uint64_t rtdby32, uint32_t magnitude)
{
	asm("{"
		"  .reg .b32 hi, lo, scr, mag; "
		"  mov.b64 {lo,hi}, %0; "    // halves reversed since rotl'd by 32
		"  mov.b32 mag, %1; "
		"  shf.l.wrap.b32 scr, lo, hi, mag; "
		"  shf.l.wrap.b32 lo, hi, lo, mag; "
		"  mov.b64 %0, {scr,lo}; "
		"}" : "+l"(rtdby32) : "r"(magnitude)); // see if this is faster w/ uint2 .x and .y
	// for saving shf results out
	return rtdby32; // return rotation from the rotation by 32
}
// ROTRFROM32: Computes ROTR64(input, 32 + magnitude): the swapped-half unpack
// is the free rotate-by-32, then the shf.r.wrap pair rotates right by
// `magnitude`. Used in keccak() where a rho rotation exceeds 32
// (e.g. rot right 37 = 32 + 5).
// shortcut to rotation by 32 (flip halves), then rotate right by `mag`
__device__ __forceinline__
uint64_t ROTRfrom32(uint64_t rtdby32, uint32_t magnitude)
{
	asm("{"
		"  .reg .b32 hi, lo, scr, mag; "
		"  mov.b64 {lo,hi}, %0; "    // halves reversed since rotl'd by 32
		"  mov.b32 mag, %1; "
		"  shf.r.wrap.b32 scr, hi, lo, mag; "
		"  shf.r.wrap.b32 lo, lo, hi, mag; "
		"  mov.b64 %0, {scr,lo}; "
		"}" : "+l"(rtdby32) : "r"(magnitude)); // see if this is faster w/ uint2 .x and .y
	// for saving shf results out
	return rtdby32; // return rotation from the rotation by 32
}
// ROTLBY16: 64-bit rotate-left by exactly 16 bits, done with two byte-permute
// (prmt.b32) instructions instead of shifts -- each prmt builds one half of the
// result by picking bytes across the two input words.
__device__ __forceinline__
uint64_t ROTLby16(uint64_t input)
{
	asm("{"
		"  .reg .b32 hi, lo, scr;"
		"  mov.b64 {hi,lo}, %0;"
		"  prmt.b32 scr, hi, lo, 0x5432;"
		"  prmt.b32 lo, hi, lo, 0x1076;"
		"  mov.b64 %0, {lo,scr};"
		" }" : "+l"(input));
	return input;
}
// ROTLBY8: 64-bit rotate-left by exactly 8 bits using two prmt.b32 byte
// permutes (a 1-byte rotation is just a byte shuffle, so no shifts are needed).
// see if this is faster using x and y vectors, no extra regs
__device__ __forceinline__
uint64_t ROTLby8(uint64_t input)
{
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"prmt.b32 scr, hi, lo, 0x2107;"
		"prmt.b32 lo, hi, lo, 0x6543;"
		"mov.b64 %0, {scr,lo};"
		"}"
		: "+l"(input));
	return input;
}
// ROTRBY8: 64-bit rotate-right by exactly 8 bits using two prmt.b32 byte
// permutes (mirror of ROTLby8).
__device__ __forceinline__
uint64_t ROTRby8(uint64_t input)
{
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"prmt.b32 scr, lo, hi, 0x0765;"
		"prmt.b32 lo, lo, hi, 0x4321;"
		"mov.b64 %0, {scr,lo};"
		"}"
		: "+l"(input));
	return input;
}
// ROTLBY1: 64-bit rotate-left by 1, using the funnel-shift pair with the
// magnitude folded in as an immediate (saves the mov of a magnitude register
// versus ROTL64asm). Falls back to the ROTL64 macro below sm_32.
// TODO: Look for a snappier way to do this. Should still be slightly
// faster than the variable-magnitude version above.
__device__ __forceinline__
uint64_t ROTLby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"shf.l.wrap.b32 scr, lo, hi, 1;" // magnitude replaced w/ immediate
		"shf.l.wrap.b32 lo, hi, lo, 1;"  // magnitude replaced w/ immediate
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input));
	return input;
#else
	return ROTL64(input, 1);
#endif
}
// ROTRBY1: 64-bit rotate-right by 1 with an immediate funnel-shift magnitude
// (mirror of ROTLby1). Falls back to the ROTR64 macro below sm_32.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTRby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"shf.r.wrap.b32 scr, hi, lo, 1;"
		"shf.r.wrap.b32 lo, lo, hi, 1;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input));
	return input;
#else
	return ROTR64(input, 1);
#endif
}
// XOR3: Three-way 64-bit XOR (a ^ b ^ c) via two xor.b64 instructions into a
// scratch output register. Used by the keccak() theta step.
__device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t output{ 0 }; // initialized because the asm uses a "+l" (read-write) constraint
	asm("{"
		"  xor.b64 %0, %1, %2;"
		"  xor.b64 %0, %0, %3;"
		"}" : "+l"(output) : "l"(a), "l"(b), "l"(c));
	return output;
}
// LOP3_0XD2: Computes a ^ ((~b) & c) -- the Keccak chi-step operation -- in a
// single lop3.b32 per 32-bit half on SM 5.0+ (immediate 0xD2 is the truth
// table of that expression), with a plain C fallback for older architectures.
// NOTE(review): here too the {xh,xl} names hold the opposite halves of each
// operand, but since both halves get the identical lop3 the result is correct.
// FIXME: Assuming SM 5.x+
__device__ __forceinline__
uint64_t lop3_0xD2(uint64_t a, uint64_t b, uint64_t c)
{ // FIXME/TODO: make SURE that the correct version is running on Maxwell Gen2, Pascal!
#if __CUDA_ARCH__ >= 500
	asm("{"
		"  .reg .b32 ah, al, bh, bl, ch, cl;"
		"  mov.b64 {ah,al}, %0;"
		"  mov.b64 {bh,bl}, %1;"
		"  mov.b64 {ch,cl}, %2;"
		"  lop3.b32 ah, ah, bh, ch, 0xD2;"
		"  lop3.b32 al, al, bl, cl, 0xD2;"
		"  mov.b64 %0, {ah,al};"
		"}" : "+l"(a) : "l"(b), "l"(c));
	return a;
#else
	return a ^ ((~b) & c);
#endif
}
// KECCAK: Evaluates the midstate-optimized Keccak permutation for one candidate
// nonce and returns true if the (byte-swapped) first state word meets the
// per-device target. Round 0 is fully unrolled against the precomputed
// midstate d_mid[deviceNum] (built host-side in send_to_device()), rounds
// 1..22 run in the generic loop below, and round 23 is truncated to just the
// lanes needed for state[0]. `thread` is only used by the commented-out
// debug probes.
__device__
bool keccak(uint8_t deviceNum, uint64_t nonce, uint32_t thread) // had 'passing optimizations' and 'passing mid' in 3.4t
{
	uint64_t state[25], C[5], D[5], scratch;
	//if (thread == 543210)
	//	PermTest (0);
	// Keccak round constants, kept thread-local. (Elements 1, 4-5, 8, 9-12, 18,
	// 22 fit in 32 bits -- relevant to the HB_SELECTIVE_32BIT_XOR idea below.)
	uint64_t RClocal[24] =
	{
		/* Element (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22) */
		/* ------- ------------------ ------------------ ------------------ */
		/* 00..02 */ 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
		/* 03..05 */ 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
		/* 06..08 */ 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
		/* 09..11 */ 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
		/* 12..14 */ 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
		/* 15..17 */ 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
		/* 18..20 */ 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
		/* 21..23 */ 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
		/* ------- ------------------ ------------------ ------------------ */
	};
	// TODO: consider defining these conditionally, preprocessor
	//if (HB_SELECTIVE_32BIT_XOR)
	//uint2* RCvec = (uint2*)&RClocal; // vectorized access to RClocal[]
	//uint2* stateVec = (uint2*)&state; // vectorized access to state[]
	//if (thread == 543210)
	//	PermTest (0);
	// ---- Round 0, unrolled: the nonce only touches a few lanes, so each lane
	// ---- is the midstate XOR a rotation of the nonce, then chi via lop3_0xD2.
	C[0] = d_mid[deviceNum][2] ^ ROTR64asm(nonce, 20);
	C[1] = d_mid[deviceNum][4] ^ ROTL64asm(nonce, 14);
	state[0] = lop3_0xD2(d_mid[deviceNum][0], d_mid[deviceNum][1], C[0]); //^ 0x0000000000000001;
#if HB_SELECTIVE_32BIT_XOR // NOTE(review): stateVec is commented out above, so enabling this macro would not compile
	//stateVec[0].x = stateVec[0].x ^ 0x00000001; // because the rest is leading zeroes :)
	asm("xor.b32 %0, %0, 0x00000001;" : "+r"(stateVec[0].x));
#else
	state[0] = state[0] ^ 0x0000000000000001; // was RC[0]
#endif
	state[1] = lop3_0xD2(d_mid[deviceNum][1], C[0], d_mid[deviceNum][3]);
	state[2] = lop3_0xD2(C[0], d_mid[deviceNum][3], C[1]);
	state[3] = lop3_0xD2(d_mid[deviceNum][3], C[1], d_mid[deviceNum][0]);
	state[4] = lop3_0xD2(C[1], d_mid[deviceNum][0], d_mid[deviceNum][1]);
	C[0] = d_mid[deviceNum][6] ^ ROTL64asm(nonce, 20); // nonce*1048576;
	C[1] = d_mid[deviceNum][9] ^ ROTR64(nonce, 2);
	state[5] = lop3_0xD2(d_mid[deviceNum][5], C[0], d_mid[deviceNum][7]);
	state[6] = lop3_0xD2(C[0], d_mid[deviceNum][7], d_mid[deviceNum][8]);
	state[7] = lop3_0xD2(d_mid[deviceNum][7], d_mid[deviceNum][8], C[1]);
	state[8] = lop3_0xD2(d_mid[deviceNum][8], C[1], d_mid[deviceNum][5]);
	state[9] = lop3_0xD2(C[1], d_mid[deviceNum][5], C[0]);
	// experimental rotation replacement
	// NOTE(review): nonce*128 is a left SHIFT, not a true ROTL by 7 -- it drops
	// the top bits of the nonce. Presumably acceptable for the nonce ranges
	// actually searched; TODO confirm against a reference implementation.
	scratch = nonce * 128;
	C[0] = d_mid[deviceNum][11] ^ scratch; // ROTL by 7
	C[1] = d_mid[deviceNum][13] ^ scratch * 2; // ROTL by 8
	state[10] = lop3_0xD2(d_mid[deviceNum][10], C[0], d_mid[deviceNum][12]);
	state[11] = lop3_0xD2(C[0], d_mid[deviceNum][12], C[1]);
	state[12] = lop3_0xD2(d_mid[deviceNum][12], C[1], d_mid[deviceNum][14]);
	state[13] = lop3_0xD2(C[1], d_mid[deviceNum][14], d_mid[deviceNum][10]);
	state[14] = lop3_0xD2(d_mid[deviceNum][14], d_mid[deviceNum][10], C[0]);
	C[0] = d_mid[deviceNum][15] ^ ROTL64asm(nonce, 27); //nonce*134217728;
	C[1] = d_mid[deviceNum][18] ^ ROTLby16(nonce);
	state[15] = lop3_0xD2(C[0], d_mid[deviceNum][16], d_mid[deviceNum][17]);
	state[16] = lop3_0xD2(d_mid[deviceNum][16], d_mid[deviceNum][17], C[1]);
	state[17] = lop3_0xD2(d_mid[deviceNum][17], C[1], d_mid[deviceNum][19]);
	state[18] = lop3_0xD2(C[1], d_mid[deviceNum][19], C[0]);
	state[19] = lop3_0xD2(d_mid[deviceNum][19], C[0], d_mid[deviceNum][16]);
	C[0] = d_mid[deviceNum][20] ^ ROTRby1(nonce);
	C[1] = d_mid[deviceNum][21] ^ ROTR64(nonce, 9); //idea: ROTRby1(ROTRby8(nonce));
	C[2] = d_mid[deviceNum][22] ^ ROTR64(nonce, 25);
	state[20] = lop3_0xD2(C[0], C[1], C[2]);
	state[21] = lop3_0xD2(C[1], C[2], d_mid[deviceNum][23]);
	state[22] = lop3_0xD2(C[2], d_mid[deviceNum][23], d_mid[deviceNum][24]);
	state[23] = lop3_0xD2(d_mid[deviceNum][23], d_mid[deviceNum][24], C[0]);
	state[24] = lop3_0xD2(d_mid[deviceNum][24], C[0], C[1]);
	// ---- Rounds 1..22: full theta / rho-pi / chi / iota per iteration ----
#if __CUDA_ARCH__ >= 350
#	pragma unroll
#endif
	for (int32_t i{ 1 }; i < 23; ++i)
	{
		// Theta: column parities. (x + 6) % 5 is just (x + 1) % 5.
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
		}
#if __CUDA_ARCH__ >= 350
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			D[x] = ROTL64(C[(x + 2) % 5], 1);
			state[x] = xor3(state[x], D[x], C[x]);
			state[x + 5] = xor3(state[x + 5], D[x], C[x]);
			state[x + 10] = xor3(state[x + 10], D[x], C[x]);
			state[x + 15] = xor3(state[x + 15], D[x], C[x]);
			state[x + 20] = xor3(state[x + 20], D[x], C[x]);
		}
#else
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
			state[x] = state[x] ^ D[x];
			state[x + 5] = state[x + 5] ^ D[x];
			state[x + 10] = state[x + 10] ^ D[x];
			state[x + 15] = state[x + 15] ^ D[x];
			state[x + 20] = state[x + 20] ^ D[x];
		}
#endif
		// Rho Pi: lane permutation + per-lane rotations, with the rotation
		// primitive chosen per magnitude (byte/halfword permutes, funnel
		// shifts, or the half-swap shortcuts for rotations > 32).
		C[0] = state[1];
		state[1] = ROTR64asm(state[6], 20);
		state[6] = ROTL64asm(state[9], 20);
		state[9] = ROTR64(state[22], 3);
		state[22] = ROTR64asm(state[14], 25);
		state[14] = ROTL64asm(state[20], 18);
		state[20] = ROTR64(state[2], 2);
		state[2] = ROTR64asm(state[12], 21);
		state[12] = ROTL64asm(state[13], 25);
		state[13] = ROTLby8(state[19]);
		state[19] = ROTRby8(state[23]);
		state[23] = ROTR64asm(state[15], 23);
		// FIXME: watch out for thread divergence within warp. But this should be fine.
		//if (optimizations[NUM_HBOPT_ROTLBACK32])
#if __CUDA_ARCH__ >= 320
		state[15] = ROTRfrom32(state[4], 5); // ROTR by 37 == half-swap + ROTR 5 (make toggleable- forced on for now)
#else
		state[15] = ROTL64(state[4], 27);
#endif
		/*if (thread==1236667)
		{
		printf ("st15: %" PRIx64 " \n", state[15]);
		printf ("tstR: %" PRIx64 " \n", ROTRfrom32(state[4], 5) );
		printf ("tstL: %" PRIx64 " \n\n", ROTLfrom32(state[4], 5) );
		}*/
		state[4] = ROTL64asm(state[24], 14);
		state[24] = ROTL64(state[21], 2);
		state[21] = ROTR64(state[8], 9); // R9
		state[8] = ROTR64asm(state[16], 19);
		// FIXME: watch out for thread divergence. Slowdown here. Figure out why
		//if (optimizations[NUM_HBOPT_ROTLBACK32]) // forced on
		//{
#if __CUDA_ARCH__ >= 320
		state[16] = ROTLfrom32(state[5], 4);  // ROTL by 36
		state[5] = ROTRfrom32(state[3], 4);   // ROTR by 36 == ROTL by 28
#else
		state[16] = ROTR64(state[5], 28);
		state[5] = ROTL64(state[3], 28);
#endif
		state[3] = ROTL64asm(state[18], 21);
		state[18] = ROTL64asm(state[17], 15);
		state[17] = ROTL64asm(state[11], 10);
		state[11] = ROTL64(state[7], 6);
		state[7] = ROTL64(state[10], 3);
		state[10] = ROTLby1(C[0]);
		// Chi: state[x] ^= ~state[x+1] & state[x+2] row-wise, via lop3_0xD2
		for (uint32_t x{ 0 }; x < 25; x += 5)
		{
			C[0] = state[x];
			C[1] = state[x + 1];
			C[2] = state[x + 2];
			C[3] = state[x + 3];
			C[4] = state[x + 4];
			state[x] = lop3_0xD2(C[0], C[1], C[2]);
			state[x + 1] = lop3_0xD2(C[1], C[2], C[3]);
			state[x + 2] = lop3_0xD2(C[2], C[3], C[4]);
			state[x + 3] = lop3_0xD2(C[3], C[4], C[0]);
			state[x + 4] = lop3_0xD2(C[4], C[0], C[1]);
		}
		// Iota
		state[0] = state[0] ^ RClocal[i];
	}
	// ---- Round 23, truncated: only the lanes feeding state[0] are computed ----
	for (uint32_t x{ 0 }; x < 5; ++x)
	{
		C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
	}
	D[0] = ROTLby1(C[2]);
	D[1] = ROTLby1(C[3]);
	D[2] = ROTLby1(C[4]);
	state[0] = xor3(state[0], D[0], C[0]);
	state[6] = ROTR64asm(xor3(state[6], D[1], C[1]), 20);
	state[12] = ROTR64(xor3(state[12], D[2], C[2]), 21);
	state[0] = lop3_0xD2(state[0], state[6], state[12]) ^ 0x8000000080008008; // was RC[23];
	// Byte-swap to big-endian before the difficulty comparison.
	return bswap_64(state[0]) <= d_target[deviceNum];
}
//
// CUDA_DEVICE_INIT: Queries the CUDA runtime for devices and initializes the
// first CUDA_HOWMANY_DEVICES of them: device reset, scheduling flags, cache
// config, and the per-device bookkeeping globals. Returns 0 (devices that fail
// are skipped with `continue`; per-call errors are reported by
// cudaCallCheckError).
// TODO: This function is too big and multifunction: let's break it up into
// convenient pieces and only run the parts we need to, when we need to.
int cuda_device_init()
{
	string scratchStr = "";
	uint8_t i = 0;
	// BUG FIX: cudaGetDeviceCount() returns a cudaError_t and writes the count
	// through its pointer argument. The original assigned the returned error
	// code to device_count, clobbering the count it had just retrieved.
	if (cudaGetDeviceCount(&device_count) != cudaSuccess)
		device_count = 0;
	printf( "cuda_init(): found %d CUDA devices.\n", device_count );
	for (i = 0; i < CUDA_HOWMANY_DEVICES; ++i)
	{
		gCudaDeviceIntensities[i] = 24; // HAX! (TODO: read per-device intensity from config)
		printf("Initializing CUDA device with index # %d ...\n", i);
		if (cudaGetDeviceProperties(&cdp_chosenDevice[i], i) == cudaSuccess)
			printf("Successfully got CUDA device # %d! \n", i);
		else
		{
			printf("cuda_init(): Couldn't get CUDA device (# %d ). (Busy? Too high intensity setting?\n", i);
			//TODO: display chosen CUDA device index # in Win32 dialog, too
			continue; // skip this device, try the next one
		}
		// FIXME: Is this better just before cudaGetDeviceProperties() above?
		printf( "Setting device #%d ...\n", i );
		cudaCallCheckError( cudaSetDevice(i) );
		if (!cudaDeviceIsInitialized[i])
		{
			// BUG FIX: the original format string had %d but no argument (UB).
			printf(" Resetting Device #%d ...\n", i );
			if (cudaCallCheckError( cudaDeviceReset() ) != 0) // In 0xBitcoin-Miner, without this reset, CPU usage was too high.
				continue; // run for() loop for next device(s)
			printf("Setting device %d flags.\n", i);
			cudaCallCheckError( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax) ); // FIXME: ideal selection (see arches)
			printf("Setting device %d cache configuration.\n", i);
			cudaCallCheckError( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) ); // FIXME: ideal configuration (see arches)
			printf("\nInitialized CUDA device %d : %s", i, cdp_chosenDevice[i].name);
			printf("\nAvailable compute capability: %d.%d \n\n", cdp_chosenDevice[i].major, cdp_chosenDevice[i].minor);
			gpuName[i] = cdp_chosenDevice[i].name;
			cudaDeviceIsInitialized[i] = true;
			// NOTE(review): stored into a uint8_t, so e.g. major 6 / minor 1
			// yields 610, which truncates -- confirm the intended encoding.
			cudaDeviceComputeCapability[i] = cdp_chosenDevice[i].major * 100 + cdp_chosenDevice[i].minor * 10;
			// cudaDeviceProp::memoryClockRate is reported in kHz.
			cudaDeviceClockSpeed[i] = (int64_t)(cdp_chosenDevice[i].memoryClockRate * 1000 * 1000);
			printableHashrateCounter[i] = 0;
			printableCounter[i] = 0;
			ftime(&tStart[i]); // start of this device's timing window
		}
	}
	// BUG FIX: these two statements were inside the for loop, so the function
	// returned after touching only the first device (and the function's closing
	// brace was lost).
	theCounter = 0; // TODO: multiple devices, independent or unified counter?
	return 0; // no error. 1 is fatal error (see above.)
}
- // TODO/FIXME: consider alternate compute capability check(m.arch check). Forced TPB50
- /*#if //__CUDA_ARCH__ > 500
- # define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB50, 1)
- #else
- # define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB35, 2)
- #endif*/
- // REF: uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
- #define TPB50 1024u // compute_50 and up
- #define TPB35 384u // compute_35 and down
// MINING_KERNEL: Each thread tests one candidate nonce (theCounter + its global
// thread id) with keccak(); a winning nonce is published to *possSolution for
// the host-side copy-back in handle_kernels().
__global__ __launch_bounds__(TPB50, 1) // Enforce maximum threads! (TODO/FIXME)
void mining_kernel( uint8_t deviceNum, uint64_t* possSolution, uint64_t theCounter )
{
	const uint64_t globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
	const uint64_t candidate = theCounter + globalIdx;
	// TODO: while(1 && pauseVar|abortVar) faster?
	if ( !keccak(deviceNum, candidate, globalIdx) )
		return;
	// candidate meets the target: publish it for the host
	*possSolution = candidate;
	printf("\n\n* keccak returning true on device #%d with cnt: %" PRIu64 ", nonce: %" PRIx64 " ! * \n\n", deviceNum, theCounter, candidate); // no newline needed (debuggery)
}
//
// HANDLE_KERNELS: Host function that launches one mining pass on every CUDA
// device, copies back any solution, and updates the per-device hashrate
// strings. Returns 1 if at least one device found a solution this pass,
// otherwise -1.
__host__
int handle_kernels ()
{
	cudaError cudaResult = cudaSuccess;
	uint64_t scratch = 0;
	double scr_meter = 0;
	double t = 0;
	int solutionFound = -1; // -1 until some device produces a candidate
	// BUG FIX: the loop ran `i < CUDA_HOWMANY_DEVICES - 1`, skipping the last device.
	for (uint8_t i = 0; i < CUDA_HOWMANY_DEVICES; ++i)
	{
		// TODO: compute once per device, not every call; per-device intensities
		uint32_t threads = (1u << gCuda_Device0Intensity);
		int threads_per_block = TPB50; // FIXME: for specific arches!
		dim3 grid = (threads + threads_per_block - 1) / threads_per_block;
		dim3 block = threads_per_block;
		cudaResult = cudaSetDevice(i);
		if (cudaResult != cudaSuccess)
			printf( "** cudaSetDevice(%d) failed: %s \n\n", i, cudaGetErrorString( cudaResult ) );
		// seed this device's solution slot (UINT64_MAX = "no solution")
		cudaMemcpy( d_solution[i], h_message[i], 8, cudaMemcpyHostToDevice ); // 8 bytes
		// BUG FIX: pass THIS device's solution pointer, not the whole array
		mining_kernel <<< grid, block >>> (i, d_solution[i], theCounter);
		cudaResult = cudaDeviceSynchronize();
		if (cudaResult != cudaSuccess )
			printf( "Kernel launch error (device #%d): %s \n\n", i, cudaGetErrorString(cudaResult) );
		cudaMemcpy( h_message[i], d_solution[i], 8, cudaMemcpyDeviceToHost ); // 8 bytes
		// if a solution was found by device `i`
		if (*h_message[i] != UINT64_MAX)
		{
			printf( "\n\n** manage_kernels() for device #%d: Possible Solution Found ** \n\n", i );
			printf( "*h_message[%d] after solving is: %" PRIx64 " (expressed as uint64_t, hex): \n", i, *h_message[i] );
			printf( "Copying into solution[%d][len:32] starting at element 12...\n", i );
			memcpy( &solution[i][12], h_message[i], 8 ); // 8 bytes
			// BUG FIX: format string previously had no %d for the `i` argument
			printf( "Setting gSolnFound[%d] to `true`. (TODO: sol'n queue) ... \n", i );
			gSolnFound[i] = true;
			solutionFound = 1;
		}
		// bookkeeping runs whether or not a solution was found this call
		ftime( &tEnd[i] );
		theCounter += threads;
		printable_hashrate_cnt[i] += threads;
		t = (double)((tEnd[i].time * 1000 + tEnd[i].millitm) - (tStart[i].time * 1000 + tStart[i].millitm)) / 1000;
		cuda_solvetime[i] = t; // seconds elapsed, for UI display and statistics functions
		scr_meter = (double)(printable_hashrate_cnt[i] / t / 1000000); // MH/s
		scratch = (uint64_t)scr_meter;
		// copy hashrate to global strings for HUD display
		gStr_Hashrate[i] = to_string(scratch);
		// BUG FIX: was to_string(printable_hashrate_cnt) -- the whole array
		gStr_HashCount[i] = to_string(printable_hashrate_cnt[i]);
	}
	// BUG FIX: std::string cannot be passed to printf %s directly; use c_str()
	printf( "GPU0 HR: %s, HCount: %s | GPU1 HR: %s, HCount: %s",
	        gStr_Hashrate[0].c_str(), gStr_HashCount[0].c_str(),
	        gStr_Hashrate[1].c_str(), gStr_HashCount[1].c_str() );
	return solutionFound; // 1 = solution found, -1 = none this pass
}
//
// CUDA_SOLVE: CPU-thread entry point for the solving loop. Repeatedly calls
// handle_kernels() until gSolving is cleared (see StopCudaSolver()), enqueues
// any found solution for the network thread, then shuts CUDA down.
// BUG FIXES vs. original: mismatched braces; `ftime(&tStart[i])` used an
// out-of-scope `i`; `printable_hashrate_cnt = 0` assigned to an array;
// the loop condition compared a pointer (`*h_message`) against UINT64_MAX;
// handle_kernels() was called a second time inside the success branch; and
// errorCount was printed without ever being assigned.
void CUDA_Solve()
{
	printf("CUDA_Solve CPU thread starting.\n");
	uint8_t errorCount = 0;
	// guard: cuda_allocate_memory() must have succeeded first
	if (h_message[0] == nullptr || h_message[1] == nullptr)
	{
		cout << "CUDA_Solve(): h_message not allocated, aborting solve thread" << endl;
		return;
	}
	// reset the "no solution yet" sentinels
	*h_message[0] = UINT64_MAX;
	*h_message[1] = UINT64_MAX;
	gSolving = true;
	// TODO: rewrite this loop. Push sol'ns to queue where net thread can handle them.
	do
	{
		if (gSolving == false)
		{
			cout << "in do-while loop: gSolving==false or h_message==nullptr, breaking out" << endl;
			break;
		}
		if (handle_kernels() == 1) // 1 => at least one device found a solution
		{
			printf("Candidate Solution: %" PRIx64 " \n", *h_message[0]);
			EnqueueSolution();
			// reset per-device state for the next round. The h_message reset to
			// UINT64_MAX is REQUIRED so the next pass doesn't resubmit.
			for (uint8_t d = 0; d < CUDA_HOWMANY_DEVICES; ++d)
			{
				printable_hashrate_cnt[d] = 0;
				*h_message[d] = UINT64_MAX;
				ftime(&tStart[d]); // store time when we did this
			}
		}
	} while (gSolving);
	// run cuda_shutdown(), report its error count
	errorCount = (uint8_t)cuda_shutdown();
	if (errorCount > 0)
		printf("%d Error(s) while shutting down CUDA \n", errorCount);
	printf("CUDA_Solve CPU thread (ending).\n");
}
//
// STOPCUDASOLVER: Signals the CUDA_Solve() loop to stop by clearing the global
// `gSolving` flag; the solve loop exits on its next iteration and then runs
// cuda_shutdown().
// NOTE(review): plain bool write, presumably from another thread -- no
// synchronization; consider std::atomic<bool> if this is called cross-thread.
void StopCudaSolver()
{
	gSolving = false;
}
//
// SEND_TO_DEVICE: Builds a Keccak midstate from the 84-byte init message and
// uploads the midstate and target into the __constant__ arrays of device 0 and
// device 1. (Both devices intentionally receive identical work -- testing
// setup; see the printf notice below.) `message` must point to at least
// 11 uint64_t words (only words 0..10 are read).
__host__
void send_to_device(uint64_t target, uint64_t* message)
{
	uint64_t C[4], D[5], mid[25];
	printf("send_to_device: got 'target' (expressed as uint64_t, hex): %" PRIx64 " \n", target);
	printf("send_to_device(): got 'message' (expressed as byte array): \n");
	print_bytes((uint8_t*)message, 84);
	printf("\n\n");
	printf("Building the Midstate ...\n");
	// Theta-style column parities over the first block; the two constants fold
	// in the padding words (0x…0001 00000000 and the final 0x8000… bit).
	C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
	C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
	C[2] = message[2] ^ message[7];
	C[3] = message[4] ^ message[9];
	D[0] = ROTL64(C[1], 1) ^ C[3];
	D[1] = ROTL64(C[2], 1) ^ C[0];
	D[2] = ROTL64(message[3], 1) ^ C[1];
	D[3] = ROTL64(C[3], 1) ^ C[2];
	D[4] = ROTL64(C[0], 1) ^ message[3];
	// Theta + rho applied to the first-round lanes (pre-rotated for keccak()).
	mid[0] = message[0] ^ D[0];
	mid[1] = ROTL64(message[6] ^ D[1], 44);
	mid[2] = ROTL64(D[2], 43);
	mid[3] = ROTL64(D[3], 21);
	mid[4] = ROTL64(D[4], 14);
	mid[5] = ROTL64(message[3] ^ D[3], 28);
	mid[6] = ROTL64(message[9] ^ D[4], 20);
	mid[7] = ROTL64(message[10] ^ D[0] ^ 0x100000000ull, 3);
	mid[8] = ROTL64(0x8000000000000000ull ^ D[1], 45);
	mid[9] = ROTL64(D[2], 61);
	mid[10] = ROTL64(message[1] ^ D[1], 1);
	mid[11] = ROTL64(message[7] ^ D[2], 6);
	mid[12] = ROTL64(D[3], 25);
	mid[13] = ROTL64(D[4], 8);
	mid[14] = ROTL64(D[0], 18);
	mid[15] = ROTL64(message[4] ^ D[4], 27);
	mid[16] = ROTL64(message[5] ^ D[0], 36);
	mid[17] = ROTL64(D[1], 10);
	mid[18] = ROTL64(D[2], 15);
	mid[19] = ROTL64(D[3], 56);
	mid[20] = ROTL64(message[2] ^ D[2], 62);
	mid[21] = ROTL64(D[3], 55);
	mid[22] = ROTL64(D[4], 39);
	mid[23] = ROTL64(D[0], 41);
	mid[24] = ROTL64(D[1], 2);
	printf( "Done. The generated Midstate: \n" );
	// Debug dump of the 200-byte midstate. (TODO: remove this debuggery)
	// BUG FIX: the initializer had been mangled into an HTML entity
	// ("(uint8_t*)∣" -- i.e. "&mid;"); restored to taking the address of mid.
	uint8_t *myPointer = (uint8_t*)&mid;
	uint8_t mid_bytes[200] = { 0 };
	for (uint8_t n = 0; n < 200; ++n)
	{
		mid_bytes[n] = myPointer[n];
		printf("%02x ", myPointer[n]);
	}
	// TODO/FIXME: cudaMAlloc these first? Doesn't seem done in 0xbitcoin-miner.
	// Check CUDA best practices.
	// probably unnecessary
	printf("send_to_device(): setting device # 0 ... ");
	cudaCallCheckError( cudaSetDevice(0) );
	printf("\n* Note: We are sending the SAME midstate and using the SAME solution byte array \n");
	printf("        to give GPU0/GPU1 identical work for testing purposes.\n\n");
	// third-to-last argument (0) is the byte offset into the symbol, not a device #
	printf("send_to_device(): copying midstate to d_mid[0] ... ");
	cudaCallCheckError ( cudaMemcpyToSymbol(d_mid[0], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) ); // gpu 0
	printf("send_to_device(): copying target to d_target[0] ...");
	cudaCallCheckError ( cudaMemcpyToSymbol(d_target[0], &target, sizeof(target), 0, cudaMemcpyHostToDevice) ); // gpu 0
	// probably unnecessary
	printf("send_to_device(): setting device # 1 ... ");
	cudaCallCheckError( cudaSetDevice(1) );
	printf("send_to_device(): copying midstate to d_mid[1] ... ");
	cudaCallCheckError( cudaMemcpyToSymbol(d_mid[1], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) ); // gpu 1
	printf("send_to_device(): copying target to d_target[1] ... ");
	cudaCallCheckError( cudaMemcpyToSymbol(d_target[1], &target, sizeof(target), 0, cudaMemcpyHostToDevice) ); // gpu 1
	// at this time, the set device is GPU # 1.
}
// CUDA_ALLOCATE_MEMORY: Allocates, per device, the 8-byte device-side solution
// slot (d_solution[dev]) and its pinned host mirror (h_message[dev]), and seeds
// the host mirrors with UINT64_MAX ("no solution"). Returns 0 on success or
// the number of failed CUDA calls.
// BUG FIXES vs. original: (1) d_mid / d_target are __constant__ symbols -- they
// live in constant memory already and must NOT be cudaMalloc'd (they are filled
// by cudaMemcpyToSymbol in send_to_device(), and FreeDeviceVariables()
// deliberately does not free them); the invalid cudaMalloc calls on them were
// removed. (2) The per-call error counts returned by cudaCallCheckError() were
// discarded, so errorCount was always 0. (3) d_solution[0] was allocated while
// device 1 was current; allocations are now grouped per device.
uint8_t cuda_allocate_memory()
{
	uint8_t errorCount = 0;
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		printf("cuda_allocate_memory: setting device # %d ...", dev);
		errorCount += cudaCallCheckError( cudaSetDevice(dev) );
		printf("Allocating device memory for d_solution[%d] (size 8 bytes) ... ", dev);
		errorCount += cudaCallCheckError( cudaMalloc( (void**)&d_solution[dev], sizeof(uint64_t) ) ); // 8 bytes
		printf("Allocating host memory for h_message[%d] (size 8 bytes) ... ", dev);
		errorCount += cudaCallCheckError( cudaMallocHost( (void**)&h_message[dev], sizeof(uint64_t) ) ); // pinned, 8 bytes
	}
	if (errorCount == 0)
	{
		// seed the "no solution yet" sentinels
		*h_message[0] = UINT64_MAX;
		*h_message[1] = UINT64_MAX;
		return 0; // errorCount=0
	}
	else
		return errorCount;
	/*REF: move this to where we send the device its vars !!
	*h_message = UINT64_MAX;
	cudaMemcpy( d_solution, h_message, 8, cudaMemcpyHostToDevice ); // sizeof(uint64_t), 8 bytes
	*/
	//uint8_t* d_hash_prefix; // leftover from 0xbitcoin-miner. purpose?
	//numThreads = (1u << theIntensity);
	//cudaMemcpyToSymbol(threads, &h_threads, sizeof(h_threads), 0, cudaMemcpyHostToDevice);
}
// FREEDEVICEVARIABLES: Frees the per-device cudaMalloc'd solution buffers.
// d_mid and d_target are __constant__ symbols and are intentionally not freed.
// Returns the number of errors encountered (0 = clean).
// BUG FIX vs. original: cudaFree was called once on `d_solution` although one
// buffer is allocated per device -- free each d_solution[dev] instead.
__host__
uint8_t FreeDeviceVariables()
{
	cudaError theResult;
	uint8_t errorCount = 0;
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		theResult = cudaFree(d_solution[dev]);
		if (theResult != cudaSuccess)
		{
			++errorCount;
			printf("Error in FreeDeviceVariables (freeing d_solution): %s \n", cudaGetErrorString(theResult));
		}
		d_solution[dev] = nullptr; // avoid dangling pointer / double free
	}
	cout << "Note: Not freeing d_target as it is a __constant__ \n" << endl;
	cout << "Note: Not freeing d_mid as it is a __constant__ \n" << endl;
	return errorCount; // 0 if no errors
}
// CUDA_SHUTDOWN: Frees device and pinned-host buffers, clears the per-device
// bookkeeping, and resets every device (required for profilers such as Nsight
// to emit complete traces). Returns the number of errors (0 = clean).
// BUG FIXES vs. original: `cudaDeviceIsInitialized = false;` and
// `printable_hashrate_cnt = 0;` assigned to whole arrays (ill-formed);
// cudaSetDevice() used an undeclared `numChosenDevice`; cudaFreeHost() was
// passed the h_message array name instead of each pinned buffer; and a stale
// `theResult` was re-checked after FreeDeviceVariables() under a misleading
// "reset device" message.
int cuda_shutdown()
{
	cudaError theResult;
	uint8_t errorCount = 0;
	printf("\n\n** Shutting down CUDA ...\n\n");
	// free the device-side solution buffers; accumulate its error count
	errorCount += FreeDeviceVariables();
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		cudaDeviceIsInitialized[dev] = false;
		printable_hashrate_cnt[dev] = 0; // reset counter
		theResult = cudaSetDevice(dev);
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (selecting device): %s \n", cudaGetErrorString(theResult));
			++errorCount;
			continue; // can't free/reset a device we couldn't select
		}
		// free this device's pinned host buffer
		theResult = cudaFreeHost(h_message[dev]);
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (free h_message): %s \n", cudaGetErrorString(theResult));
			++errorCount;
		}
		h_message[dev] = nullptr; // CUDA_Solve() checks for nullptr
		// reset the device
		theResult = cudaDeviceReset();
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (reset device): %s \n", cudaGetErrorString(theResult));
			++errorCount;
		}
	}
	printf("Finished freeing CUDA variables w/ %d errors.\n\n", errorCount);
	return errorCount; // 0 if no errors
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement