Advertisement
Guest User

Untitled

a guest
Apr 24th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 22.59 KB | None | 0 0
// Compile-time feature toggles for the HashBurner kernel optimizations.
// TODO: these are not used yet !!
// (NOTE(review): HB_SELECTIVE_32BIT_XOR *is* referenced inside keccak() below;
//  the others are currently unreferenced in this file.)
#define HB_PREFER_PRMT_OVER_SHIFT     0     // prefer permutation functions over funnel shift-based functions
#define HB_RESET_COUNTER              1     // reset `cnt` each solution/new challenge (req'd for HB_CNT0_SHORTCUT)
#define HB_CNT0_SHORTCUT              1     // apply shortcut when `cnt` is 0 at start of keccak()
#define HB_SELECTIVE_32BIT_XOR        1     // where only half a 64bit var is nonzero, only XOR the relevant half
  6.  
  7. //
  8. // HashBurner Optimizations by LtTofu/Mag517
  9. // Date: 22 APR 2018
  10. //
  11.  
  12. /*
  13. Author: Mikers
  14. date march 4, 2018 for 0xbitcoin dev
  15.  
  16. based off of https://github.com/Dunhili/SHA3-gpu-brute-force-cracker/blob/master/sha3.cu
  17.  
  18.  * Author: Brian Bowden
  19.  * Date: 5/12/14
  20.  *
  21.  * This is the parallel version of SHA-3.
  22.  */
  23.  
  24. #include "cuda_sha3.h"
  25.  
// ---- host-side miner configuration/state -----------------------------------
int32_t intensity;          // log2 of threads per kernel launch (threads = 1u << intensity)
int32_t cuda_device;        // CUDA device ordinal to mine on
int32_t clock_speed;        // derived from device_prop.memoryClockRate in gpu_init()
int32_t compute_version;    // major*100 + minor*10 (e.g. 610 for SM 6.1); see gpu_init()
struct timeb start, end;    // wall-clock timestamps for the hashrate display

uint64_t cnt;                     // base nounce for the next launch (total nounces dispatched)
uint64_t printable_hashrate_cnt;  // hashes since the last counter reset (rate display)
uint64_t print_counter;           // throttles status printing in find_message()

bool gpu_initialized;       // one-time setup in gpu_init() completed
bool new_input;             // cleared at the end of gpu_init()

uint8_t solution[32] = { 0 };   // solution message; winning nounce is spliced in at offset 12

uint64_t* h_message;        // pinned host buffer receiving the candidate nounce from the device
uint8_t init_message[84];   // 52-byte hash prefix + 32-byte initial solution tail

uint64_t* d_solution;       // device buffer gpu_mine() writes a winning nounce into

uint8_t* d_challenge;       // presumably device copy of the challenge — not used in this file
uint8_t* d_hash_prefix;     // presumably device copy of the hash prefix — not used in this file
__constant__ uint64_t d_mid[25];  // precomputed Keccak midstate, uploaded by send_to_device()
__constant__ uint64_t d_target;   // difficulty target; byte-swapped digest must be <= this
__constant__ uint32_t threads;    // threads per launch (1u << intensity), set in gpu_init()
  51.  
  52. /* Round Constants
  53. - - - - - -
  54. __device__ __constant__ const uint64_t RC[24] = {
  55.   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
  56.   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
  57.   0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
  58.   0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
  59.   0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
  60.   0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
  61.   0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
  62.   0x8000000000008080, 0x0000000080000001, 0x8000000080008008
  63. };*/
  64.  
// Reverse the byte order of a 64-bit word (full endianness swap).
// Used to compare the Keccak digest (big-endian) against d_target.
// NOTE: in `mov.b64 {a,b}, x` the FIRST unpacked register receives the LOW
// 32 bits, so `oh` holds the low word here; each prmt 0x0123 byte-reverses
// one half, and the final repack {ol,oh} swaps the halves.
__device__ __forceinline__
uint64_t bswap_64( uint64_t input )
{
  asm( "{"
       "  .reg .u32 oh, ol;"
       "  mov.b64 {oh,ol}, %0;"
       "  prmt.b32 oh, oh, 0, 0x0123;"
       "  prmt.b32 ol, ol, 0, 0x0123;"
       "  mov.b64 %0, {ol,oh};"
       "}" : "+l"(input) );
  return input;
}
  77.  
// 64-bit rotate-left by a runtime magnitude, built from a pair of 32-bit
// funnel shifts (shf.l.wrap) on the two halves of the word.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTL64asm (uint64_t input, uint32_t magnitude)
{
    asm ("{"
         ".reg .b32 hi, lo, mag, scr;"
         "mov.b32 mag, %1;"
         "mov.b64 {hi,lo}, %0;"
         "shf.l.wrap.b32 scr, lo, hi, mag;"
         "shf.l.wrap.b32 lo, hi, lo, mag;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) : "r"(magnitude) );
    return input;
}
  92.  
// 64-bit rotate-right by a runtime magnitude — mirror image of ROTL64asm,
// using right funnel shifts (shf.r.wrap) with the half order swapped.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTR64asm (uint64_t input, uint32_t magnitude)
{
    asm ("{"
         ".reg .b32 hi, lo, mag, scr;"
         "mov.b32 mag, %1;"
         "mov.b64 {hi,lo}, %0;"
         "shf.r.wrap.b32 scr, hi, lo, mag;"
         "shf.r.wrap.b32 lo, lo, hi, mag;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) : "r"(magnitude) );
    return input;
}
  107.  
// XOR-reduce five 64-bit words: returns a ^ b ^ c ^ d ^ e.
// Used for the Keccak theta step (column parity).
__device__ __forceinline__
uint64_t xor5( uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e )
{
  // try writing into a scratch register

  asm("{"
      "   .reg .b64 pad;        "
      "   xor.b64 pad, %0, %1;  "
      "   xor.b64 pad, pad, %2; "
      "   xor.b64 pad, pad, %3; "
      "   xor.b64 %0, pad, %4;  "
      "}" : "+l"(a) : "l"(b), "l"(c), "l"(d), "l"(e) );

  return a;
}
  123.  
  124. // Rotation Test for 32- but which direction? Use this later.
  125. __device__ __forceinline__
  126. uint64_t ROTby32_Flip (uint64_t input)
  127. {
  128.     asm ("{"
  129.          ".reg .b32 hi, lo;"
  130.          "mov.b64 {lo,hi}, %0;"
  131.          "mov.b64 %0, {hi,lo};"
  132.          "}" : "+l"(input) );
  133.     return input;
  134. }
  135.  
  136. // Rotation Test for 32- but which direction? Use this later.
  137. __device__ __forceinline__
  138. uint64_t ROTby32_Prmt (uint64_t input)
  139. {
  140.     // FIXME: out of order
  141.     asm ( "{"
  142.           ".reg .b32 hi, lo, scr;"
  143.           "mov.b64 {hi,lo}, %0;"
  144.           "prmt.b32 scr, hi, lo, 0x7654;"
  145.           "prmt.b32 lo, hi, lo, 0x3210;"
  146.           "mov.b64 %0, {lo,scr};"
  147.           "}"
  148.           : "+l"(input) );
  149.  
  150.     /* Alternate Version: should provide same result: ROTR64==ROTL64 by 32 */
  151.     /*     asm ( "{"
  152.           ".reg .b32 hi, lo, scr;"
  153.           "mov.b64 {hi,lo}, %0;"
  154.           "prmt.b32 scr, hi, lo, 0x1076;"
  155.           "prmt.b32 lo, hi, lo, 0x5432;"
  156.           "mov.b64 %0, {scr,lo};"
  157.           "}"
  158.           : "+l"(input) );
  159.     printf ("PermB: %" PRIx64 " \n\n", input);*/
  160.  
  161.     return input;
  162. }
  163.  
// 64-bit rotate-left by 24, done entirely with byte permutations (prmt)
// instead of shifts — a rotation by a multiple of 8 is a pure byte shuffle.
__device__ __forceinline__
uint64_t ROTLby24 (uint64_t input)
{
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, lo, 0x0765;"
          "prmt.b32 lo, hi, lo, 0x4321;"
          "mov.b64 %0, {scr,lo};"
          "}" : "+l"(input) );
    return input;
}
  176. // TODO: ROTRby24!
  177.  
// 64-bit rotate-left by 16 via byte permutation (prmt) — note the repack
// here is {lo,scr}, i.e. reversed relative to ROTLby24's {scr,lo}.
__device__ __forceinline__
uint64_t ROTLby16 (uint64_t input)
{
    asm ( "{"
          "   .reg .b32 hi, lo, scr;"
          "   mov.b64 {hi,lo}, %0;"
          "   prmt.b32 scr, hi, lo, 0x5432;"
          "   prmt.b32 lo, hi, lo, 0x1076;"
          "   mov.b64 %0, {lo,scr};"
          " }" : "+l"(input) );
    return input;
}
  190.  
// 64-bit rotate-left by 8 via byte permutation (prmt).
// see if this is faster using x and y vectors, no extra regs
__device__ __forceinline__
uint64_t ROTLby8 (uint64_t input)
{
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, lo, 0x2107;"
          "prmt.b32 lo, hi, lo, 0x6543;"
          "mov.b64 %0, {scr,lo};"
          "}"
          : "+l"(input) );
    return input;
}
  205.  
// 64-bit rotate-right by 8 via byte permutation (prmt) — the prmt source
// operands are swapped (lo,hi) relative to ROTLby8 to reverse the direction.
__device__ __forceinline__
uint64_t ROTRby8 (uint64_t input)
{
      asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, lo, hi, 0x0765;"
          "prmt.b32 lo, lo, hi, 0x4321;"
          "mov.b64 %0, {scr,lo};"
          "}"
          : "+l"(input) );
    return input;
}
  219.  
// 64-bit rotate-left by exactly 1, with the funnel-shift magnitude baked in
// as an immediate (saves the mov of a runtime magnitude vs ROTL64asm).
// TODO: Look for a snappier way to do this. Should still be slightly
//       faster than the variable-magnitude version above.
__device__ __forceinline__
uint64_t ROTLby1 (uint64_t input)
{
    asm ("{"
         ".reg .b32 hi, lo, scr;"
         "mov.b64 {hi,lo}, %0;"
         "shf.l.wrap.b32 scr, lo, hi, 1;"   // magnitude replaced w/ immediate
         "shf.l.wrap.b32 lo, hi, lo, 1;"    // magnitude replaced w/ immediate
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) );
    return input;
}
  234.  
// 64-bit rotate-right by exactly 1 — immediate-magnitude mirror of ROTLby1.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTRby1 (uint64_t input)
{
    asm ("{"
         ".reg .b32 hi, lo, scr;"
         "mov.b64 {hi,lo}, %0;"
         "shf.r.wrap.b32 scr, hi, lo, 1;"
         "shf.r.wrap.b32 lo, lo, hi, 1;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) );
    return input;
}
  248.  
// XOR-reduce three 64-bit words: returns a ^ b ^ c (Keccak theta application).
__device__ __forceinline__
uint64_t xor3( uint64_t a, uint64_t b, uint64_t c )
{
  asm( "{"
       ".reg .b64 pad;"
       "  xor.b64 pad, %0, %1;"
       "  xor.b64 %0, pad, %2;"
       "}" : "+l"(a) : "l"(b), "l"(c) );
  return a;
}
  259.  
  260.  
// Keccak chi step: a ^ (~b & c).
// On SM 5.x+ this maps to a single lop3 per 32-bit half — the immediate 0xD2
// is the 3-input truth table of a ^ (~b & c). The unpack order of the halves
// doesn't matter since the same lop3 is applied to both.
// FIXME: Assuming SM 5.x+
__device__ __forceinline__
uint64_t chi( uint64_t a, uint64_t b, uint64_t c )
{
#if __CUDA_ARCH__ >= 500
  asm( "{"
       "  .reg .b32 ah, al, bh, bl, ch, cl;"
       "  mov.b64 {ah,al}, %0;"
       "  mov.b64 {bh,bl}, %1;"
       "  mov.b64 {ch,cl}, %2;"

       "  lop3.b32 ah, ah, bh, ch, 0xD2;"
       "  lop3.b32 al, al, bl, cl, 0xD2;"
       "  mov.b64 %0, {ah,al};"
       "}" : "+l"(a) : "l"(b), "l"(c) );
  return a;
#else
  // Pre-Maxwell fallback: plain C formulation of the same expression.
  return a ^ ((~b) & c);
#endif
}
  281.  
// Permutations Testing Function
// Debug-only scaffold for experimenting with prmt-based rotations: it
// OVERWRITES its argument with a fixed test pattern, applies two candidate
// permutation sequences, and prints intermediates via device printf.
// Not called from the mining path (the call site in keccak() is commented out).
__device__ __forceinline__
uint64_t PermTest ( uint64_t input )
{
    input = 0x11223344AABBCCDD;             // 16L and 16R rotations equal ??

    printf ("Input   : %" PRIx64 " \n", input);
    //printf ("Goal_08L: %" PRIx64 " \n", ROTL64(input, 8));
    //printf ("Goal_16L: %" PRIx64 " \n", ROTL64(input, 16));
    //printf ("Goal_24L: %" PRIx64 " \n", ROTL64(input, 24));
    //printf ("Goal_32L: %" PRIx64 " \n\n", ROTL64(input, 32));
    //printf ("Goal_08R: %" PRIx64 " \n", ROTR64(input, 8));
    //printf ("Goal_16R: %" PRIx64 " \n", ROTR64(input, 16));
    printf ("Goal_24R: %" PRIx64 " \n", ROTR64(input, 24));
    //printf ("Goal_32R: %" PRIx64 " \n\n", ROTR64(input, 32));

    // Candidate A: two prmt instructions mixing both halves.
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, lo, lo, 0x1320;"
          "prmt.b32 lo, hi, lo, 0x0321;"
          "mov.b64 %0, {scr,lo};"
          "}" : "+l"(input) );

    printf ("PermA: %" PRIx64 " \n", input);

    // Candidate B: per-half byte shuffles against zero.
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, 0, 0x3120;"
          "prmt.b32 lo, lo, 0, 0x3120;"
          "mov.b64 %0, {scr, lo};"
          "}"
          : "+l"(input) );
    printf ("PermB: %" PRIx64 " \n\n", input);

    return input;
}
  320.  
  321. __device__
  322. bool keccak( uint64_t nounce, uint32_t thread, uint64_t i_mid[] )
  323. {
  324.   uint64_t state[25], C[5], D[5];
  325.  
  326.   // Round Constants
  327.   uint64_t RClocal[24] =
  328.   {
  329.     /* Element     (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22)      */
  330.     /* -------     ------------------  ------------------  ------------------ */
  331.     /* 00..02  */  0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
  332.     /* 03..05  */  0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
  333.     /* 06..08  */  0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
  334.     /* 09..11  */  0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
  335.     /* 12..14  */  0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
  336.     /* 15..17  */  0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
  337.     /* 18..20  */  0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
  338.     /* 21..23  */  0x8000000000008080, 0x0000000080000001, 0x8000000080008008
  339.     /* -------     ------------------  ------------------  ------------------ */
  340.   }; uint2* RCvec = (uint2*)&RClocal;  // vectorized access to RClocal[]
  341.  
  342.   uint2* stateVec = (uint2*)&state;    // vectorized access to state[]
  343.  
  344.   //if (thread == 543210)
  345.   //    PermTest (0);
  346.  
  347.   C[0] = i_mid[ 2] ^ ROTR64asm(nounce, 20);
  348.   C[1] = i_mid[ 4] ^ ROTL64asm(nounce, 14);
  349.  
  350.   state[ 0] = chi( i_mid[ 0], i_mid[ 1], C[ 0] );
  351. #if HB_SELECTIVE_32BIT_XOR
  352.   stateVec[0].x = stateVec[0].x ^ 1;                // only the relevant half (see: RClocal[0])
  353. #else state[0] = state[0] ^ 0x0000000000000001;     // formerly RC[0], now an immediate
  354. #endif
  355.  
  356.   state[ 1] = chi( i_mid[ 1], C[ 0], i_mid[ 3] );
  357.   state[ 2] = chi( C[ 0], i_mid[ 3], C[ 1] );
  358.   state[ 3] = chi( i_mid[ 3], C[ 1], i_mid[ 0] );
  359.   state[ 4] = chi( C[ 1], i_mid[ 0], i_mid[ 1] );
  360.  
  361.   C[0] = i_mid[ 6] ^ ROTL64asm(nounce, 20);
  362.   C[1] = i_mid[ 9] ^ ROTR64(nounce,  2);
  363.   state[ 5] = chi( i_mid[ 5], C[ 0], i_mid[7] );
  364.   state[ 6] = chi( C[0], i_mid[ 7], i_mid[8] );
  365.   state[ 7] = chi( i_mid[ 7], i_mid[ 8], C[1] );
  366.   state[ 8] = chi( i_mid[ 8], C[1], i_mid[5] );
  367.   state[ 9] = chi( C[1], i_mid[ 5], C[0] );
  368.  
  369.   C[0] = i_mid[11] ^ nounce*128;                    //ROTL64(nounce, 7);
  370.   C[1] = i_mid[13] ^ ROTLby8(nounce);
  371.   state[10] = chi( i_mid[10], C[0], i_mid[12] );
  372.   state[11] = chi( C[0], i_mid[12], C[1] );
  373.   state[12] = chi( i_mid[12], C[1], i_mid[14] );
  374.   state[13] = chi( C[1], i_mid[14], i_mid[10] );
  375.   state[14] = chi( i_mid[14], i_mid[10], C[0] );
  376.  
  377.   C[0] = i_mid[15] ^ ROTL64asm(nounce, 27);
  378.   C[1] = i_mid[18] ^ ROTLby16 (nounce);
  379.   state[15] = chi( C[0], i_mid[16], i_mid[17] );
  380.   state[16] = chi( i_mid[16], i_mid[17], C[1] );
  381.   state[17] = chi( i_mid[17], C[1], i_mid[19] );
  382.   state[18] = chi( C[1], i_mid[19], C[0] );
  383.   state[19] = chi( i_mid[19], C[0], i_mid[16] );
  384.  
  385.   C[0] = i_mid[20] ^ ROTRby1(nounce);
  386.   C[1] = i_mid[21] ^ ROTRby1(ROTRby8(nounce));      //ROTRby1(ROTRby8(nounce)); Fixme? EXPERIMENTAL :)
  387.   C[2] = i_mid[22] ^ ROTR64(nounce, 25);
  388.   state[20] = chi( C[0], C[1], C[2] );
  389.   state[21] = chi( C[1], C[2], i_mid[23] );
  390.   state[22] = chi( C[2], i_mid[23], i_mid[24] );
  391.   state[23] = chi( i_mid[23], i_mid[24], C[0] );
  392.   state[24] = chi( i_mid[24], C[0], C[1] );
  393.  
  394. #if __CUDA_ARCH__ >= 350
  395. #  pragma unroll
  396. #endif
  397.   for( int32_t i{ 1 }; i < 23; ++i )
  398.   {
  399.       // Theta
  400.       for( uint32_t x{ 0 }; x < 5; ++x )
  401.       {
  402.         C[(x + 6) % 5] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
  403.       }
  404.  
  405.   #if __CUDA_ARCH__ >= 350
  406.       for( uint32_t x{ 0 }; x < 5; ++x )
  407.       {
  408.         D[x] = ROTL64(C[(x + 2) % 5], 1);
  409.         state[x]      = xor3( state[x]     , D[x], C[x] );
  410.         state[x +  5] = xor3( state[x +  5], D[x], C[x] );
  411.         state[x + 10] = xor3( state[x + 10], D[x], C[x] );
  412.         state[x + 15] = xor3( state[x + 15], D[x], C[x] );
  413.         state[x + 20] = xor3( state[x + 20], D[x], C[x] );
  414.       }
  415.   #else
  416.       for( uint32_t x{ 0 }; x < 5; ++x )
  417.       {
  418.         D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
  419.         state[x]      = state[x]      ^ D[x];
  420.         state[x +  5] = state[x +  5] ^ D[x];
  421.         state[x + 10] = state[x + 10] ^ D[x];
  422.         state[x + 15] = state[x + 15] ^ D[x];
  423.         state[x + 20] = state[x + 20] ^ D[x];
  424.       }
  425.   #endif
  426.  
  427.       // Rho Pi
  428.       C[0] = state[1];
  429.       state[ 1] = ROTR64asm( state[ 6], 20 );           //ROTR64asm( state[ 6], 20 );
  430.       state[ 6] = ROTL64asm( state[ 9], 20 );
  431.       state[ 9] = ROTR64( state[22],  3 );
  432.       state[22] = ROTR64asm( state[14], 25 );
  433.       state[14] = ROTL64asm( state[20], 18 );           //ROTL64asm( ROTLby16(state[20]), 2 );
  434.       state[20] = ROTR64( state[ 2],  2 );
  435.       state[ 2] = ROTR64asm( state[12], 21 );
  436.       state[12] = ROTL64asm( state[13], 25 );
  437.       state[13] = ROTLby8( state[19] );
  438.       state[19] = ROTRby8( state[23] );
  439.       state[23] = ROTR64asm( state[15], 23 );
  440.       state[15] = ROTL64asm( state[ 4], 27 );
  441.       state[ 4] = ROTL64asm( state[24], 14 );
  442.       state[24] = ROTL64( state[21],  2 );
  443.       state[21] = ROTR64asm( state[ 8], 9 );            // R9
  444.       state[ 8] = ROTR64asm( state[16], 19 );
  445.       state[16] = ROTR64asm (state[5], 28);     // R28
  446.       state[ 5] = ROTL64asm(ROTLby24(state[ 3]), 4);     // L28
  447.       state[ 3] = ROTL64asm( state[18], 21 );
  448.       state[18] = ROTL64asm( state[17], 15 );
  449.       state[17] = ROTL64asm( state[11], 10 );
  450.       state[11] = ROTL64asm( state[ 7],  6 );
  451.       state[ 7] = ROTL64asm( state[10],  3 );
  452.       state[10] = ROTLby1( C[0] );
  453.  
  454.       // Chi
  455.       for( uint32_t x{ 0 }; x < 25; x += 5 )
  456.       {
  457.         C[0] = state[x];
  458.         C[1] = state[x + 1];
  459.         C[2] = state[x + 2];
  460.         C[3] = state[x + 3];
  461.         C[4] = state[x + 4];
  462.         state[x]     = chi( C[0], C[1], C[2] );
  463.         state[x + 1] = chi( C[1], C[2], C[3] );
  464.         state[x + 2] = chi( C[2], C[3], C[4] );
  465.         state[x + 3] = chi( C[3], C[4], C[0] );
  466.         state[x + 4] = chi( C[4], C[0], C[1] );
  467.       }
  468.  
  469.       // Iota
  470.       // FIXME/TODO: If you get a lot of bad shares, consider commenting this out (leave the else{} part)
  471.       // RClocal[1,4,5,8..12,18,22] are half-zeroes: do a 32-bit XOR of the relevant bits only.
  472.       if (i == 1 || i == 4 || i == 5 || i == 8 || i == 9 || i == 10 || i == 11 || i == 12 || i == 18 || i == 22)
  473.         //asm ("xor.b32 %0, %0, %1;" : "+r"(stateVec[0].x) : "r"(RCvec[i].x) );
  474.         stateVec[0].x = stateVec[0].x ^ RCvec[i].x;
  475.       else
  476.           state[0] = state[0] ^ RClocal[i];
  477.  
  478.   } // end of big-ass for() loop
  479.  
  480.   for( uint32_t x{ 0 }; x < 5; ++x )
  481.   {
  482.     C[(x + 6) % 5 ] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
  483.   }
  484.  
  485.   D[0] = ROTL64(C[2], 1);
  486.   D[1] = ROTL64(C[3], 1);
  487.   D[2] = ROTL64(C[4], 1);
  488.  
  489.   state[ 0] = xor3( state[ 0], D[0], C[0] );
  490.   state[ 6] = ROTR64asm(xor3( state[ 6], D[1], C[1] ), 20);
  491.   state[12] = ROTR64asm(xor3( state[12], D[2], C[2] ), 21);
  492.  
  493.   state[ 0] = chi( state[ 0], state[ 6], state[12] ) ^ 0x8000000080008008; // RC[23];
  494.  
  495.   return bswap_64( state[0] ) <= d_target;
  496. }
  497.  
  498. KERNEL_LAUNCH_PARAMS
  499. void gpu_mine( uint64_t* solution, uint64_t cnt )
  500. {
  501.   uint64_t thread = blockDim.x * blockIdx.x + threadIdx.x;
  502.   uint64_t nounce{ cnt + thread };
  503.   uint64_t passingMid[25];
  504.  
  505.   for (uint8_t i=0; i<25; ++i)
  506.       passingMid[i] = d_mid[i];
  507.  
  508.   if( keccak(nounce, thread, passingMid) )
  509.   {
  510.     *solution = nounce;
  511.     return;
  512.   }
  513. }
  514.  
// Intended to signal the miner to abandon the current search.
// Currently a no-op: the h_done flag mechanism it would set is commented out.
__host__
void stop_solving()
{
  // h_done[0] = -2;
}
  520.  
// Returns the running total of nounces dispatched to the GPU so far
// (advanced by `threads` per launch in find_message()).
__host__
uint64_t getHashCount()
{
  return cnt;
}
  526.  
// Resets the per-challenge counters: writes the UINT64_MAX "no solution yet"
// sentinel into the device solution buffer, zeroes the hashrate counters, and
// restarts the wall-clock timer used by the rate display.
__host__
void resetHashCount()
{
  cudaSetDevice( cuda_device );

  *h_message = UINT64_MAX;
  cudaMemcpy( d_solution, h_message, sizeof( uint64_t ), cudaMemcpyHostToDevice );

  printable_hashrate_cnt = 0;
  print_counter = 0;

  ftime( &start );
}
  540.  
// Precomputes the Keccak midstate on the host and uploads it (plus the
// difficulty target) to __constant__ memory. `message` is the 52-byte hash
// prefix viewed as 64-bit words; the XORed constants 0x100000000 and
// 0x8000...000 are the Keccak padding bits for this fixed message length.
// The mid[] lanes are the theta/rho results that do NOT depend on the nounce
// — keccak() on the device finishes the nounce-dependent lanes per attempt.
__host__
void send_to_device( uint64_t target, uint64_t* message )
{
  cudaSetDevice( cuda_device );

  uint64_t C[4], D[5], mid[25];
  C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
  C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
  C[2] = message[2] ^ message[7];
  C[3] = message[4] ^ message[9];

  D[0] = ROTL64(C[1], 1) ^ C[3];
  D[1] = ROTL64(C[2], 1) ^ C[0];
  D[2] = ROTL64(message[3], 1) ^ C[1];
  D[3] = ROTL64(C[3], 1) ^ C[2];
  D[4] = ROTL64(C[0], 1) ^ message[3];

  mid[ 0] = message[ 0] ^ D[0];
  mid[ 1] = ROTL64( message[6] ^ D[1], 44 );
  mid[ 2] = ROTL64(D[2], 43);
  mid[ 3] = ROTL64(D[3], 21);
  mid[ 4] = ROTL64(D[4], 14);
  mid[ 5] = ROTL64( message[3] ^ D[3], 28 );
  mid[ 6] = ROTL64( message[9] ^ D[4], 20 );
  mid[ 7] = ROTL64( message[10] ^ D[0] ^ 0x100000000ull, 3 );
  mid[ 8] = ROTL64( 0x8000000000000000ull ^ D[1], 45 );
  mid[ 9] = ROTL64(D[2], 61);
  mid[10] = ROTL64( message[1] ^ D[1],  1 );
  mid[11] = ROTL64( message[7] ^ D[2],  6 );
  mid[12] = ROTL64(D[3], 25);
  mid[13] = ROTL64(D[4],  8);
  mid[14] = ROTL64(D[0], 18);
  mid[15] = ROTL64( message[4] ^ D[4], 27 );
  mid[16] = ROTL64( message[5] ^ D[0], 36 );
  mid[17] = ROTL64(D[1], 10);
  mid[18] = ROTL64(D[2], 15);
  mid[19] = ROTL64(D[3], 56);
  mid[20] = ROTL64( message[2] ^ D[2], 62 );
  mid[21] = ROTL64(D[3], 55);
  mid[22] = ROTL64(D[4], 39);
  mid[23] = ROTL64(D[0], 41);
  mid[24] = ROTL64(D[1],  2);

  cudaMemcpyToSymbol( d_mid, mid, sizeof( mid ), 0, cudaMemcpyHostToDevice);

  cudaMemcpyToSymbol( d_target, &target, sizeof( target ), 0, cudaMemcpyHostToDevice);
}
  588.  
  589. /**
  590.  * Initializes the global variables by calling the cudaGetDeviceProperties().
  591.  */
  592. __host__
  593. void gpu_init()
  594. {
  595.   cudaDeviceProp device_prop;
  596.   int32_t device_count;
  597.  
  598.   char config[10];
  599.   FILE * inf;
  600.   inf = fopen( "0xbtc.conf", "r" );
  601.   if( inf )
  602.   {
  603.     fgets( config, 10, inf );
  604.     fclose( inf );
  605.     intensity = atol( strtok( config, " " ) );
  606.     cuda_device = atol( strtok( NULL, " " ) );
  607.     printf ("\n\nRead ./0xbtc.conf - using custom intensity %d and CUDA device %d.", intensity, cuda_device);
  608.   }
  609.   else
  610.   {
  611.     intensity = INTENSITY;
  612.     cuda_device = CUDA_DEVICE;
  613.     printf ("\n\nNo ./0xbtc.conf - using hardcoded intensity %d and CUDA device %d.", intensity, cuda_device);
  614.   }
  615.  
  616.   cudaGetDeviceCount( &device_count );
  617.  
  618.   if( cudaGetDeviceProperties( &device_prop, cuda_device ) != cudaSuccess )
  619.   {
  620.     printf( "Problem getting properties for device, exiting...\n" );
  621.     exit( EXIT_FAILURE );
  622.   }
  623.  
  624.   cudaSetDevice( cuda_device );
  625.  
  626.   if( !gpu_initialized )
  627.   {
  628.     // CPU usage goes _insane_ without this.
  629.     cudaDeviceReset();
  630.     cudaSetDeviceFlags( cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax );
  631.     cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
  632.  
  633.     printf ("\nInitialized CUDA device %d : ", cuda_device);
  634.     printf (device_prop.name);
  635.     printf ("\nAvailable compute capability: %d.%d \n\n", device_prop.major, device_prop.minor);
  636.     printf ("Now mining. ");   // not really but we're about to be, and don't want this msg repeating
  637.  
  638.     cudaMalloc( (void**)&d_solution, sizeof( uint64_t ) ); // solution
  639.     cudaMallocHost( (void**)&h_message, sizeof( uint64_t ) );
  640.  
  641.     (uint32_t&)(init_message[52]) = 014533075101u;
  642.     (uint32_t&)(init_message[56]) = 014132271150u;
  643.  
  644.     srand((time(NULL) & 0xFFFF) | (getpid() << 16));
  645.     for(int8_t i_rand{ 60 }; i_rand < 84; ++i_rand){
  646.       init_message[i_rand] = (uint8_t)rand() % 256;
  647.     }
  648.     memcpy( solution, &init_message[52], 32 );
  649.  
  650.     uint32_t h_threads{ 1u << intensity };
  651.     cudaMemcpyToSymbol( threads, &h_threads, sizeof( h_threads ), 0, cudaMemcpyHostToDevice );
  652.  
  653.     gpu_initialized = true;
  654.   }
  655.  
  656.   compute_version = device_prop.major * 100 + device_prop.minor * 10;
  657.  
  658.   // convert from GHz to hertz
  659.   clock_speed = (int32_t)( device_prop.memoryClockRate * 1000 * 1000 );
  660.  
  661.   //cnt = 0;
  662.   printable_hashrate_cnt = 0;
  663.   print_counter = 0;
  664.  
  665.   ftime( &start );
  666.   if( new_input ) new_input = false;
  667. }
  668.  
// Installs a new 52-byte hash prefix and difficulty target: copies the prefix
// into init_message and regenerates/uploads the device midstate.
__host__
void update_mining_inputs( uint64_t target, uint8_t* hash_prefix )
{
  memcpy( init_message, hash_prefix, 52 );
  send_to_device( target, (uint64_t*)init_message );
}
  675.  
  676. __host__
  677. void gpu_cleanup()
  678. {
  679.   cudaSetDevice( cuda_device );
  680.  
  681.   cudaThreadSynchronize();
  682.  
  683.   cudaFree( d_solution );
  684.   cudaFreeHost( h_message );
  685.  
  686.   cudaDeviceReset();
  687. }
  688.  
// Launches one batch of gpu_mine() (1u << intensity candidate nounces),
// advances the nounce/hashrate counters, copies the result back, and updates
// the ANSI-escape status line roughly 10x per second.
// Returns true once the device has written a nounce (i.e. *h_message is no
// longer the UINT64_MAX sentinel set by resetHashCount()).
__host__
bool find_message()
{
  cudaSetDevice( cuda_device );

  uint32_t threads{ 1u << intensity };   // shadows the __constant__ of the same name (host-side copy)

  // Threads-per-block by compute capability; grid sized by ceiling division.
  uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
  dim3 grid{ (threads + tpb - 1) / tpb };
  dim3 block{ tpb };

  gpu_mine <<< grid, block >>> ( d_solution, cnt );
  // cudaError_t cudaerr = cudaDeviceSynchronize();
  // if( cudaerr != cudaSuccess )
  // {
  //  printf( "kernel launch failed with error %d: \x1b[38;5;196m%s.\x1b[0m\n", cudaerr, cudaGetErrorString( cudaerr ) );
  //  exit( EXIT_FAILURE );
  // }

  cnt += threads;
  printable_hashrate_cnt += threads;

  // Blocking copy doubles as synchronization with the launch above.
  cudaMemcpy( h_message, d_solution, sizeof( uint64_t ), cudaMemcpyDeviceToHost );
  if( *h_message != UINT64_MAX )
    memcpy( &solution[12], h_message, sizeof( uint64_t ) );   // splice nounce into solution bytes 12..19

  ftime( &end );
  double t{ (double)((end.time * 1000 + end.millitm) - (start.time * 1000 + start.millitm)) / 1000 };

  // Refresh the display at most every 0.1 s of elapsed time.
  if( t*10 > print_counter )
  {
    ++print_counter;

    // maybe breaking the control codes into macros is a good idea . . .
    printf( "\x1b[s\x1b[?25l\x1b[2;22f\x1b[38;5;221m%*.2f\x1b[0m\x1b[u\x1b[?25h"
            "\x1b[s\x1b[?25l\x1b[3;36f\x1b[38;5;208m%*" PRIu64 "\x1b[0m\x1b[u\x1b[?25h"
            "\x1b[s\x1b[?25l\x1b[2;75f\x1b[38;5;33m%02u:%02u\x1b[0m\x1b[u\x1b[?25h",
            8, ( (double)printable_hashrate_cnt / t / 1000000 ),
            25, printable_hashrate_cnt,
            ((uint32_t)t/60), ((uint32_t)t%60) );
  }

  return ( *h_message != UINT64_MAX );
  // return ( h_done[0] >= 0 );
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement