Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // TODO: these are not used yet !!
- #define HB_PREFER_PRMT_OVER_SHIFT 0 // prefer permutation functions over funnel shift-based functions
- #define HB_RESET_COUNTER 1 // reset `cnt` each solution/new challenge (req'd for HB_CNT0_SHORTCUT)
- #define HB_CNT0_SHORTCUT 1 // apply shortcut when `cnt` is 0 at start of keccak()
- #define HB_SELECTIVE_32BIT_XOR 1 // where only half a 64bit var is nonzero, only XOR the relevant half
- //
- // HashBurner Optimizations by LtTofu/Mag517
- // Date: 22 APR 2018
- //
- /*
- Author: Mikers
- date march 4, 2018 for 0xbitcoin dev
- based off of https://github.com/Dunhili/SHA3-gpu-brute-force-cracker/blob/master/sha3.cu
- * Author: Brian Bowden
- * Date: 5/12/14
- *
- * This is the parallel version of SHA-3.
- */
- #include "cuda_sha3.h"
- int32_t intensity;
- int32_t cuda_device;
- int32_t clock_speed;
- int32_t compute_version;
- struct timeb start, end;
- uint64_t cnt;
- uint64_t printable_hashrate_cnt;
- uint64_t print_counter;
- bool gpu_initialized;
- bool new_input;
- uint8_t solution[32] = { 0 };
- uint64_t* h_message;
- uint8_t init_message[84];
- uint64_t* d_solution;
- uint8_t* d_challenge;
- uint8_t* d_hash_prefix;
- __constant__ uint64_t d_mid[25];
- __constant__ uint64_t d_target;
- __constant__ uint32_t threads;
- /* Round Constants
- - - - - - -
- __device__ __constant__ const uint64_t RC[24] = {
- 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
- 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
- 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
- 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
- 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
- 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
- 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
- 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
- };*/
- __device__ __forceinline__
- uint64_t bswap_64( uint64_t input )
- {
- asm( "{"
- " .reg .u32 oh, ol;"
- " mov.b64 {oh,ol}, %0;"
- " prmt.b32 oh, oh, 0, 0x0123;"
- " prmt.b32 ol, ol, 0, 0x0123;"
- " mov.b64 %0, {ol,oh};"
- "}" : "+l"(input) );
- return input;
- }
- // try doing this with two offsettings of output operand instead
- __device__ __forceinline__
- uint64_t ROTL64asm (uint64_t input, uint32_t magnitude)
- {
- asm ("{"
- ".reg .b32 hi, lo, mag, scr;"
- "mov.b32 mag, %1;"
- "mov.b64 {hi,lo}, %0;"
- "shf.l.wrap.b32 scr, lo, hi, mag;"
- "shf.l.wrap.b32 lo, hi, lo, mag;"
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) : "r"(magnitude) );
- return input;
- }
- // try doing this with two offsettings of output operand instead
- __device__ __forceinline__
- uint64_t ROTR64asm (uint64_t input, uint32_t magnitude)
- {
- asm ("{"
- ".reg .b32 hi, lo, mag, scr;"
- "mov.b32 mag, %1;"
- "mov.b64 {hi,lo}, %0;"
- "shf.r.wrap.b32 scr, hi, lo, mag;"
- "shf.r.wrap.b32 lo, lo, hi, mag;"
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) : "r"(magnitude) );
- return input;
- }
- __device__ __forceinline__
- uint64_t xor5( uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e )
- {
- // try writing into a scratch register
- asm("{"
- " .reg .b64 pad; "
- " xor.b64 pad, %0, %1; "
- " xor.b64 pad, pad, %2; "
- " xor.b64 pad, pad, %3; "
- " xor.b64 %0, pad, %4; "
- "}" : "+l"(a) : "l"(b), "l"(c), "l"(d), "l"(e) );
- return a;
- }
- // Rotation Test for 32- but which direction? Use this later.
- __device__ __forceinline__
- uint64_t ROTby32_Flip (uint64_t input)
- {
- asm ("{"
- ".reg .b32 hi, lo;"
- "mov.b64 {lo,hi}, %0;"
- "mov.b64 %0, {hi,lo};"
- "}" : "+l"(input) );
- return input;
- }
- // Rotation Test for 32- but which direction? Use this later.
- __device__ __forceinline__
- uint64_t ROTby32_Prmt (uint64_t input)
- {
- // FIXME: out of order
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, hi, lo, 0x7654;"
- "prmt.b32 lo, hi, lo, 0x3210;"
- "mov.b64 %0, {lo,scr};"
- "}"
- : "+l"(input) );
- /* Alternate Version: should provide same result: ROTR64==ROTL64 by 32 */
- /* asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, hi, lo, 0x1076;"
- "prmt.b32 lo, hi, lo, 0x5432;"
- "mov.b64 %0, {scr,lo};"
- "}"
- : "+l"(input) );
- printf ("PermB: %" PRIx64 " \n\n", input);*/
- return input;
- }
- __device__ __forceinline__
- uint64_t ROTLby24 (uint64_t input)
- {
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, hi, lo, 0x0765;"
- "prmt.b32 lo, hi, lo, 0x4321;"
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) );
- return input;
- }
- // TODO: ROTRby24!
- __device__ __forceinline__
- uint64_t ROTLby16 (uint64_t input)
- {
- asm ( "{"
- " .reg .b32 hi, lo, scr;"
- " mov.b64 {hi,lo}, %0;"
- " prmt.b32 scr, hi, lo, 0x5432;"
- " prmt.b32 lo, hi, lo, 0x1076;"
- " mov.b64 %0, {lo,scr};"
- " }" : "+l"(input) );
- return input;
- }
- // see if this is faster using x and y vectors, no extra regs
- __device__ __forceinline__
- uint64_t ROTLby8 (uint64_t input)
- {
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, hi, lo, 0x2107;"
- "prmt.b32 lo, hi, lo, 0x6543;"
- "mov.b64 %0, {scr,lo};"
- "}"
- : "+l"(input) );
- return input;
- }
- __device__ __forceinline__
- uint64_t ROTRby8 (uint64_t input)
- {
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, lo, hi, 0x0765;"
- "prmt.b32 lo, lo, hi, 0x4321;"
- "mov.b64 %0, {scr,lo};"
- "}"
- : "+l"(input) );
- return input;
- }
- // TODO: Look for a snappier way to do this. Should still be slightly
- // faster than the variable-magnitude version above.
- __device__ __forceinline__
- uint64_t ROTLby1 (uint64_t input)
- {
- asm ("{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "shf.l.wrap.b32 scr, lo, hi, 1;" // magnitude replaced w/ immediate
- "shf.l.wrap.b32 lo, hi, lo, 1;" // magnitude replaced w/ immediate
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) );
- return input;
- }
- // try doing this with two offsettings of output operand instead
- __device__ __forceinline__
- uint64_t ROTRby1 (uint64_t input)
- {
- asm ("{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "shf.r.wrap.b32 scr, hi, lo, 1;"
- "shf.r.wrap.b32 lo, lo, hi, 1;"
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) );
- return input;
- }
- __device__ __forceinline__
- uint64_t xor3( uint64_t a, uint64_t b, uint64_t c )
- {
- asm( "{"
- ".reg .b64 pad;"
- " xor.b64 pad, %0, %1;"
- " xor.b64 %0, pad, %2;"
- "}" : "+l"(a) : "l"(b), "l"(c) );
- return a;
- }
- // FIXME: Assuming SM 5.x+
- __device__ __forceinline__
- uint64_t chi( uint64_t a, uint64_t b, uint64_t c )
- {
- #if __CUDA_ARCH__ >= 500
- asm( "{"
- " .reg .b32 ah, al, bh, bl, ch, cl;"
- " mov.b64 {ah,al}, %0;"
- " mov.b64 {bh,bl}, %1;"
- " mov.b64 {ch,cl}, %2;"
- " lop3.b32 ah, ah, bh, ch, 0xD2;"
- " lop3.b32 al, al, bl, cl, 0xD2;"
- " mov.b64 %0, {ah,al};"
- "}" : "+l"(a) : "l"(b), "l"(c) );
- return a;
- #else
- return a ^ ((~b) & c);
- #endif
- }
- // Permutations Testing Function
- __device__ __forceinline__
- uint64_t PermTest ( uint64_t input )
- {
- input = 0x11223344AABBCCDD; // 16L and 16R rotations equal ??
- printf ("Input : %" PRIx64 " \n", input);
- //printf ("Goal_08L: %" PRIx64 " \n", ROTL64(input, 8));
- //printf ("Goal_16L: %" PRIx64 " \n", ROTL64(input, 16));
- //printf ("Goal_24L: %" PRIx64 " \n", ROTL64(input, 24));
- //printf ("Goal_32L: %" PRIx64 " \n\n", ROTL64(input, 32));
- //printf ("Goal_08R: %" PRIx64 " \n", ROTR64(input, 8));
- //printf ("Goal_16R: %" PRIx64 " \n", ROTR64(input, 16));
- printf ("Goal_24R: %" PRIx64 " \n", ROTR64(input, 24));
- //printf ("Goal_32R: %" PRIx64 " \n\n", ROTR64(input, 32));
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, lo, lo, 0x1320;"
- "prmt.b32 lo, hi, lo, 0x0321;"
- "mov.b64 %0, {scr,lo};"
- "}" : "+l"(input) );
- printf ("PermA: %" PRIx64 " \n", input);
- asm ( "{"
- ".reg .b32 hi, lo, scr;"
- "mov.b64 {hi,lo}, %0;"
- "prmt.b32 scr, hi, 0, 0x3120;"
- "prmt.b32 lo, lo, 0, 0x3120;"
- "mov.b64 %0, {scr, lo};"
- "}"
- : "+l"(input) );
- printf ("PermB: %" PRIx64 " \n\n", input);
- return input;
- }
- __device__
- bool keccak( uint64_t nounce, uint32_t thread, uint64_t i_mid[] )
- {
- uint64_t state[25], C[5], D[5];
- // Round Constants
- uint64_t RClocal[24] =
- {
- /* Element (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22) */
- /* ------- ------------------ ------------------ ------------------ */
- /* 00..02 */ 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
- /* 03..05 */ 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
- /* 06..08 */ 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
- /* 09..11 */ 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
- /* 12..14 */ 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
- /* 15..17 */ 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
- /* 18..20 */ 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
- /* 21..23 */ 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
- /* ------- ------------------ ------------------ ------------------ */
- }; uint2* RCvec = (uint2*)&RClocal; // vectorized access to RClocal[]
- uint2* stateVec = (uint2*)&state; // vectorized access to state[]
- //if (thread == 543210)
- // PermTest (0);
- C[0] = i_mid[ 2] ^ ROTR64asm(nounce, 20);
- C[1] = i_mid[ 4] ^ ROTL64asm(nounce, 14);
- state[ 0] = chi( i_mid[ 0], i_mid[ 1], C[ 0] );
- #if HB_SELECTIVE_32BIT_XOR
- stateVec[0].x = stateVec[0].x ^ 1; // only the relevant half (see: RClocal[0])
- #else state[0] = state[0] ^ 0x0000000000000001; // formerly RC[0], now an immediate
- #endif
- state[ 1] = chi( i_mid[ 1], C[ 0], i_mid[ 3] );
- state[ 2] = chi( C[ 0], i_mid[ 3], C[ 1] );
- state[ 3] = chi( i_mid[ 3], C[ 1], i_mid[ 0] );
- state[ 4] = chi( C[ 1], i_mid[ 0], i_mid[ 1] );
- C[0] = i_mid[ 6] ^ ROTL64asm(nounce, 20);
- C[1] = i_mid[ 9] ^ ROTR64(nounce, 2);
- state[ 5] = chi( i_mid[ 5], C[ 0], i_mid[7] );
- state[ 6] = chi( C[0], i_mid[ 7], i_mid[8] );
- state[ 7] = chi( i_mid[ 7], i_mid[ 8], C[1] );
- state[ 8] = chi( i_mid[ 8], C[1], i_mid[5] );
- state[ 9] = chi( C[1], i_mid[ 5], C[0] );
- C[0] = i_mid[11] ^ nounce*128; //ROTL64(nounce, 7);
- C[1] = i_mid[13] ^ ROTLby8(nounce);
- state[10] = chi( i_mid[10], C[0], i_mid[12] );
- state[11] = chi( C[0], i_mid[12], C[1] );
- state[12] = chi( i_mid[12], C[1], i_mid[14] );
- state[13] = chi( C[1], i_mid[14], i_mid[10] );
- state[14] = chi( i_mid[14], i_mid[10], C[0] );
- C[0] = i_mid[15] ^ ROTL64asm(nounce, 27);
- C[1] = i_mid[18] ^ ROTLby16 (nounce);
- state[15] = chi( C[0], i_mid[16], i_mid[17] );
- state[16] = chi( i_mid[16], i_mid[17], C[1] );
- state[17] = chi( i_mid[17], C[1], i_mid[19] );
- state[18] = chi( C[1], i_mid[19], C[0] );
- state[19] = chi( i_mid[19], C[0], i_mid[16] );
- C[0] = i_mid[20] ^ ROTRby1(nounce);
- C[1] = i_mid[21] ^ ROTRby1(ROTRby8(nounce)); //ROTRby1(ROTRby8(nounce)); Fixme? EXPERIMENTAL :)
- C[2] = i_mid[22] ^ ROTR64(nounce, 25);
- state[20] = chi( C[0], C[1], C[2] );
- state[21] = chi( C[1], C[2], i_mid[23] );
- state[22] = chi( C[2], i_mid[23], i_mid[24] );
- state[23] = chi( i_mid[23], i_mid[24], C[0] );
- state[24] = chi( i_mid[24], C[0], C[1] );
- #if __CUDA_ARCH__ >= 350
- # pragma unroll
- #endif
- for( int32_t i{ 1 }; i < 23; ++i )
- {
- // Theta
- for( uint32_t x{ 0 }; x < 5; ++x )
- {
- C[(x + 6) % 5] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
- }
- #if __CUDA_ARCH__ >= 350
- for( uint32_t x{ 0 }; x < 5; ++x )
- {
- D[x] = ROTL64(C[(x + 2) % 5], 1);
- state[x] = xor3( state[x] , D[x], C[x] );
- state[x + 5] = xor3( state[x + 5], D[x], C[x] );
- state[x + 10] = xor3( state[x + 10], D[x], C[x] );
- state[x + 15] = xor3( state[x + 15], D[x], C[x] );
- state[x + 20] = xor3( state[x + 20], D[x], C[x] );
- }
- #else
- for( uint32_t x{ 0 }; x < 5; ++x )
- {
- D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
- state[x] = state[x] ^ D[x];
- state[x + 5] = state[x + 5] ^ D[x];
- state[x + 10] = state[x + 10] ^ D[x];
- state[x + 15] = state[x + 15] ^ D[x];
- state[x + 20] = state[x + 20] ^ D[x];
- }
- #endif
- // Rho Pi
- C[0] = state[1];
- state[ 1] = ROTR64asm( state[ 6], 20 ); //ROTR64asm( state[ 6], 20 );
- state[ 6] = ROTL64asm( state[ 9], 20 );
- state[ 9] = ROTR64( state[22], 3 );
- state[22] = ROTR64asm( state[14], 25 );
- state[14] = ROTL64asm( state[20], 18 ); //ROTL64asm( ROTLby16(state[20]), 2 );
- state[20] = ROTR64( state[ 2], 2 );
- state[ 2] = ROTR64asm( state[12], 21 );
- state[12] = ROTL64asm( state[13], 25 );
- state[13] = ROTLby8( state[19] );
- state[19] = ROTRby8( state[23] );
- state[23] = ROTR64asm( state[15], 23 );
- state[15] = ROTL64asm( state[ 4], 27 );
- state[ 4] = ROTL64asm( state[24], 14 );
- state[24] = ROTL64( state[21], 2 );
- state[21] = ROTR64asm( state[ 8], 9 ); // R9
- state[ 8] = ROTR64asm( state[16], 19 );
- state[16] = ROTR64asm (state[5], 28); // R28
- state[ 5] = ROTL64asm(ROTLby24(state[ 3]), 4); // L28
- state[ 3] = ROTL64asm( state[18], 21 );
- state[18] = ROTL64asm( state[17], 15 );
- state[17] = ROTL64asm( state[11], 10 );
- state[11] = ROTL64asm( state[ 7], 6 );
- state[ 7] = ROTL64asm( state[10], 3 );
- state[10] = ROTLby1( C[0] );
- // Chi
- for( uint32_t x{ 0 }; x < 25; x += 5 )
- {
- C[0] = state[x];
- C[1] = state[x + 1];
- C[2] = state[x + 2];
- C[3] = state[x + 3];
- C[4] = state[x + 4];
- state[x] = chi( C[0], C[1], C[2] );
- state[x + 1] = chi( C[1], C[2], C[3] );
- state[x + 2] = chi( C[2], C[3], C[4] );
- state[x + 3] = chi( C[3], C[4], C[0] );
- state[x + 4] = chi( C[4], C[0], C[1] );
- }
- // Iota
- // FIXME/TODO: If you get a lot of bad shares, consider commenting this out (leave the else{} part)
- // RClocal[1,4,5,8..12,18,22] are half-zeroes: do a 32-bit XOR of the relevant bits only.
- if (i == 1 || i == 4 || i == 5 || i == 8 || i == 9 || i == 10 || i == 11 || i == 12 || i == 18 || i == 22)
- //asm ("xor.b32 %0, %0, %1;" : "+r"(stateVec[0].x) : "r"(RCvec[i].x) );
- stateVec[0].x = stateVec[0].x ^ RCvec[i].x;
- else
- state[0] = state[0] ^ RClocal[i];
- } // end of big-ass for() loop
- for( uint32_t x{ 0 }; x < 5; ++x )
- {
- C[(x + 6) % 5 ] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
- }
- D[0] = ROTL64(C[2], 1);
- D[1] = ROTL64(C[3], 1);
- D[2] = ROTL64(C[4], 1);
- state[ 0] = xor3( state[ 0], D[0], C[0] );
- state[ 6] = ROTR64asm(xor3( state[ 6], D[1], C[1] ), 20);
- state[12] = ROTR64asm(xor3( state[12], D[2], C[2] ), 21);
- state[ 0] = chi( state[ 0], state[ 6], state[12] ) ^ 0x8000000080008008; // RC[23];
- return bswap_64( state[0] ) <= d_target;
- }
- KERNEL_LAUNCH_PARAMS
- void gpu_mine( uint64_t* solution, uint64_t cnt )
- {
- uint64_t thread = blockDim.x * blockIdx.x + threadIdx.x;
- uint64_t nounce{ cnt + thread };
- uint64_t passingMid[25];
- for (uint8_t i=0; i<25; ++i)
- passingMid[i] = d_mid[i];
- if( keccak(nounce, thread, passingMid) )
- {
- *solution = nounce;
- return;
- }
- }
- __host__
- void stop_solving()
- {
- // h_done[0] = -2;
- }
- __host__
- uint64_t getHashCount()
- {
- return cnt;
- }
- __host__
- void resetHashCount()
- {
- cudaSetDevice( cuda_device );
- *h_message = UINT64_MAX;
- cudaMemcpy( d_solution, h_message, sizeof( uint64_t ), cudaMemcpyHostToDevice );
- printable_hashrate_cnt = 0;
- print_counter = 0;
- ftime( &start );
- }
- __host__
- void send_to_device( uint64_t target, uint64_t* message )
- {
- cudaSetDevice( cuda_device );
- uint64_t C[4], D[5], mid[25];
- C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
- C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
- C[2] = message[2] ^ message[7];
- C[3] = message[4] ^ message[9];
- D[0] = ROTL64(C[1], 1) ^ C[3];
- D[1] = ROTL64(C[2], 1) ^ C[0];
- D[2] = ROTL64(message[3], 1) ^ C[1];
- D[3] = ROTL64(C[3], 1) ^ C[2];
- D[4] = ROTL64(C[0], 1) ^ message[3];
- mid[ 0] = message[ 0] ^ D[0];
- mid[ 1] = ROTL64( message[6] ^ D[1], 44 );
- mid[ 2] = ROTL64(D[2], 43);
- mid[ 3] = ROTL64(D[3], 21);
- mid[ 4] = ROTL64(D[4], 14);
- mid[ 5] = ROTL64( message[3] ^ D[3], 28 );
- mid[ 6] = ROTL64( message[9] ^ D[4], 20 );
- mid[ 7] = ROTL64( message[10] ^ D[0] ^ 0x100000000ull, 3 );
- mid[ 8] = ROTL64( 0x8000000000000000ull ^ D[1], 45 );
- mid[ 9] = ROTL64(D[2], 61);
- mid[10] = ROTL64( message[1] ^ D[1], 1 );
- mid[11] = ROTL64( message[7] ^ D[2], 6 );
- mid[12] = ROTL64(D[3], 25);
- mid[13] = ROTL64(D[4], 8);
- mid[14] = ROTL64(D[0], 18);
- mid[15] = ROTL64( message[4] ^ D[4], 27 );
- mid[16] = ROTL64( message[5] ^ D[0], 36 );
- mid[17] = ROTL64(D[1], 10);
- mid[18] = ROTL64(D[2], 15);
- mid[19] = ROTL64(D[3], 56);
- mid[20] = ROTL64( message[2] ^ D[2], 62 );
- mid[21] = ROTL64(D[3], 55);
- mid[22] = ROTL64(D[4], 39);
- mid[23] = ROTL64(D[0], 41);
- mid[24] = ROTL64(D[1], 2);
- cudaMemcpyToSymbol( d_mid, mid, sizeof( mid ), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( d_target, &target, sizeof( target ), 0, cudaMemcpyHostToDevice);
- }
- /**
- * Initializes the global variables by calling the cudaGetDeviceProperties().
- */
- __host__
- void gpu_init()
- {
- cudaDeviceProp device_prop;
- int32_t device_count;
- char config[10];
- FILE * inf;
- inf = fopen( "0xbtc.conf", "r" );
- if( inf )
- {
- fgets( config, 10, inf );
- fclose( inf );
- intensity = atol( strtok( config, " " ) );
- cuda_device = atol( strtok( NULL, " " ) );
- printf ("\n\nRead ./0xbtc.conf - using custom intensity %d and CUDA device %d.", intensity, cuda_device);
- }
- else
- {
- intensity = INTENSITY;
- cuda_device = CUDA_DEVICE;
- printf ("\n\nNo ./0xbtc.conf - using hardcoded intensity %d and CUDA device %d.", intensity, cuda_device);
- }
- cudaGetDeviceCount( &device_count );
- if( cudaGetDeviceProperties( &device_prop, cuda_device ) != cudaSuccess )
- {
- printf( "Problem getting properties for device, exiting...\n" );
- exit( EXIT_FAILURE );
- }
- cudaSetDevice( cuda_device );
- if( !gpu_initialized )
- {
- // CPU usage goes _insane_ without this.
- cudaDeviceReset();
- cudaSetDeviceFlags( cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax );
- cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
- printf ("\nInitialized CUDA device %d : ", cuda_device);
- printf (device_prop.name);
- printf ("\nAvailable compute capability: %d.%d \n\n", device_prop.major, device_prop.minor);
- printf ("Now mining. "); // not really but we're about to be, and don't want this msg repeating
- cudaMalloc( (void**)&d_solution, sizeof( uint64_t ) ); // solution
- cudaMallocHost( (void**)&h_message, sizeof( uint64_t ) );
- (uint32_t&)(init_message[52]) = 014533075101u;
- (uint32_t&)(init_message[56]) = 014132271150u;
- srand((time(NULL) & 0xFFFF) | (getpid() << 16));
- for(int8_t i_rand{ 60 }; i_rand < 84; ++i_rand){
- init_message[i_rand] = (uint8_t)rand() % 256;
- }
- memcpy( solution, &init_message[52], 32 );
- uint32_t h_threads{ 1u << intensity };
- cudaMemcpyToSymbol( threads, &h_threads, sizeof( h_threads ), 0, cudaMemcpyHostToDevice );
- gpu_initialized = true;
- }
- compute_version = device_prop.major * 100 + device_prop.minor * 10;
- // convert from GHz to hertz
- clock_speed = (int32_t)( device_prop.memoryClockRate * 1000 * 1000 );
- //cnt = 0;
- printable_hashrate_cnt = 0;
- print_counter = 0;
- ftime( &start );
- if( new_input ) new_input = false;
- }
- __host__
- void update_mining_inputs( uint64_t target, uint8_t* hash_prefix )
- {
- memcpy( init_message, hash_prefix, 52 );
- send_to_device( target, (uint64_t*)init_message );
- }
- __host__
- void gpu_cleanup()
- {
- cudaSetDevice( cuda_device );
- cudaThreadSynchronize();
- cudaFree( d_solution );
- cudaFreeHost( h_message );
- cudaDeviceReset();
- }
- __host__
- bool find_message()
- {
- cudaSetDevice( cuda_device );
- uint32_t threads{ 1u << intensity };
- uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
- dim3 grid{ (threads + tpb - 1) / tpb };
- dim3 block{ tpb };
- gpu_mine <<< grid, block >>> ( d_solution, cnt );
- // cudaError_t cudaerr = cudaDeviceSynchronize();
- // if( cudaerr != cudaSuccess )
- // {
- // printf( "kernel launch failed with error %d: \x1b[38;5;196m%s.\x1b[0m\n", cudaerr, cudaGetErrorString( cudaerr ) );
- // exit( EXIT_FAILURE );
- // }
- cnt += threads;
- printable_hashrate_cnt += threads;
- cudaMemcpy( h_message, d_solution, sizeof( uint64_t ), cudaMemcpyDeviceToHost );
- if( *h_message != UINT64_MAX )
- memcpy( &solution[12], h_message, sizeof( uint64_t ) );
- ftime( &end );
- double t{ (double)((end.time * 1000 + end.millitm) - (start.time * 1000 + start.millitm)) / 1000 };
- if( t*10 > print_counter )
- {
- ++print_counter;
- // maybe breaking the control codes into macros is a good idea . . .
- printf( "\x1b[s\x1b[?25l\x1b[2;22f\x1b[38;5;221m%*.2f\x1b[0m\x1b[u\x1b[?25h"
- "\x1b[s\x1b[?25l\x1b[3;36f\x1b[38;5;208m%*" PRIu64 "\x1b[0m\x1b[u\x1b[?25h"
- "\x1b[s\x1b[?25l\x1b[2;75f\x1b[38;5;33m%02u:%02u\x1b[0m\x1b[u\x1b[?25h",
- 8, ( (double)printable_hashrate_cnt / t / 1000000 ),
- 25, printable_hashrate_cnt,
- ((uint32_t)t/60), ((uint32_t)t%60) );
- }
- return ( *h_message != UINT64_MAX );
- // return ( h_done[0] >= 0 );
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement