Advertisement
Guest User

Untitled

a guest
Apr 24th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 22.59 KB | None | 0 0
// Compile-time feature toggles for the HashBurner kernel optimizations.
// TODO: these are not used yet !!
// (NOTE(review): HB_SELECTIVE_32BIT_XOR *is* referenced inside keccak() below;
//  the others are currently unreferenced in this file.)
#define HB_PREFER_PRMT_OVER_SHIFT     0     // prefer permutation functions over funnel shift-based functions
#define HB_RESET_COUNTER              1     // reset `cnt` each solution/new challenge (req'd for HB_CNT0_SHORTCUT)
#define HB_CNT0_SHORTCUT              1     // apply shortcut when `cnt` is 0 at start of keccak()
#define HB_SELECTIVE_32BIT_XOR        1     // where only half a 64bit var is nonzero, only XOR the relevant half
  6.  
  7. //
  8. // HashBurner Optimizations by LtTofu/Mag517
  9. // Date: 22 APR 2018
  10. //
  11.  
  12. /*
  13. Author: Mikers
  14. date march 4, 2018 for 0xbitcoin dev
  15.  
  16. based off of https://github.com/Dunhili/SHA3-gpu-brute-force-cracker/blob/master/sha3.cu
  17.  
  18.  * Author: Brian Bowden
  19.  * Date: 5/12/14
  20.  *
  21.  * This is the parallel version of SHA-3.
  22.  */
  23.  
  24. #include "cuda_sha3.h"
  25.  
// ---- host-side miner configuration/state -----------------------------------
int32_t intensity;          // log2 of threads per kernel launch (threads = 1u << intensity)
int32_t cuda_device;        // CUDA device ordinal to mine on
int32_t clock_speed;        // derived from device_prop.memoryClockRate in gpu_init()
int32_t compute_version;    // major*100 + minor*10 (e.g. 610 for SM 6.1); see gpu_init()
struct timeb start, end;    // wall-clock timestamps for the hashrate display

uint64_t cnt;                     // base nounce for the next launch (total nounces dispatched)
uint64_t printable_hashrate_cnt;  // hashes since the last counter reset (rate display)
uint64_t print_counter;           // throttles status printing in find_message()

bool gpu_initialized;       // one-time setup in gpu_init() completed
bool new_input;             // cleared at the end of gpu_init()

uint8_t solution[32] = { 0 };   // solution message; winning nounce is spliced in at offset 12

uint64_t* h_message;        // pinned host buffer receiving the candidate nounce from the device
uint8_t init_message[84];   // 52-byte hash prefix + 32-byte initial solution tail

uint64_t* d_solution;       // device buffer gpu_mine() writes a winning nounce into

uint8_t* d_challenge;       // presumably device copy of the challenge — not used in this file
uint8_t* d_hash_prefix;     // presumably device copy of the hash prefix — not used in this file
__constant__ uint64_t d_mid[25];  // precomputed Keccak midstate, uploaded by send_to_device()
__constant__ uint64_t d_target;   // difficulty target; byte-swapped digest must be <= this
__constant__ uint32_t threads;    // threads per launch (1u << intensity), set in gpu_init()
  51.  
  52. /* Round Constants
  53. - - - - - -
  54. __device__ __constant__ const uint64_t RC[24] = {
  55.   0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
  56.   0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
  57.   0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
  58.   0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
  59.   0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
  60.   0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
  61.   0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
  62.   0x8000000000008080, 0x0000000080000001, 0x8000000080008008
  63. };*/
  64.  
// Reverse the byte order of a 64-bit word (full endianness swap).
// Used to compare the Keccak digest (big-endian) against d_target.
// NOTE: in `mov.b64 {a,b}, x` the FIRST unpacked register receives the LOW
// 32 bits, so `oh` holds the low word here; each prmt 0x0123 byte-reverses
// one half, and the final repack {ol,oh} swaps the halves.
__device__ __forceinline__
uint64_t bswap_64( uint64_t input )
{
  asm( "{"
       "  .reg .u32 oh, ol;"
       "  mov.b64 {oh,ol}, %0;"
       "  prmt.b32 oh, oh, 0, 0x0123;"
       "  prmt.b32 ol, ol, 0, 0x0123;"
       "  mov.b64 %0, {ol,oh};"
       "}" : "+l"(input) );
  return input;
}
  77.  
// 64-bit rotate-left by a runtime magnitude, built from a pair of 32-bit
// funnel shifts (shf.l.wrap) on the two halves of the word.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTL64asm (uint64_t input, uint32_t magnitude)
{
    asm ("{"
         ".reg .b32 hi, lo, mag, scr;"
         "mov.b32 mag, %1;"
         "mov.b64 {hi,lo}, %0;"
         "shf.l.wrap.b32 scr, lo, hi, mag;"
         "shf.l.wrap.b32 lo, hi, lo, mag;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) : "r"(magnitude) );
    return input;
}
  92.  
// 64-bit rotate-right by a runtime magnitude — mirror image of ROTL64asm,
// using right funnel shifts (shf.r.wrap) with the half order swapped.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTR64asm (uint64_t input, uint32_t magnitude)
{
    asm ("{"
         ".reg .b32 hi, lo, mag, scr;"
         "mov.b32 mag, %1;"
         "mov.b64 {hi,lo}, %0;"
         "shf.r.wrap.b32 scr, hi, lo, mag;"
         "shf.r.wrap.b32 lo, lo, hi, mag;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) : "r"(magnitude) );
    return input;
}
  107.  
// XOR-reduce five 64-bit words: returns a ^ b ^ c ^ d ^ e.
// Used for the Keccak theta step (column parity).
__device__ __forceinline__
uint64_t xor5( uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e )
{
  // try writing into a scratch register

  asm("{"
      "   .reg .b64 pad;        "
      "   xor.b64 pad, %0, %1;  "
      "   xor.b64 pad, pad, %2; "
      "   xor.b64 pad, pad, %3; "
      "   xor.b64 %0, pad, %4;  "
      "}" : "+l"(a) : "l"(b), "l"(c), "l"(d), "l"(e) );

  return a;
}
  123.  
  124. // Rotation Test for 32- but which direction? Use this later.
  125. __device__ __forceinline__
  126. uint64_t ROTby32_Flip (uint64_t input)
  127. {
  128.     asm ("{"
  129.          ".reg .b32 hi, lo;"
  130.          "mov.b64 {lo,hi}, %0;"
  131.          "mov.b64 %0, {hi,lo};"
  132.          "}" : "+l"(input) );
  133.     return input;
  134. }
  135.  
  136. // Rotation Test for 32- but which direction? Use this later.
  137. __device__ __forceinline__
  138. uint64_t ROTby32_Prmt (uint64_t input)
  139. {
  140.     // FIXME: out of order
  141.     asm ( "{"
  142.           ".reg .b32 hi, lo, scr;"
  143.           "mov.b64 {hi,lo}, %0;"
  144.           "prmt.b32 scr, hi, lo, 0x7654;"
  145.           "prmt.b32 lo, hi, lo, 0x3210;"
  146.           "mov.b64 %0, {lo,scr};"
  147.           "}"
  148.           : "+l"(input) );
  149.  
  150.     /* Alternate Version: should provide same result: ROTR64==ROTL64 by 32 */
  151.     /*     asm ( "{"
  152.           ".reg .b32 hi, lo, scr;"
  153.           "mov.b64 {hi,lo}, %0;"
  154.           "prmt.b32 scr, hi, lo, 0x1076;"
  155.           "prmt.b32 lo, hi, lo, 0x5432;"
  156.           "mov.b64 %0, {scr,lo};"
  157.           "}"
  158.           : "+l"(input) );
  159.     printf ("PermB: %" PRIx64 " \n\n", input);*/
  160.  
  161.     return input;
  162. }
  163.  
// 64-bit rotate-left by 24, done entirely with byte permutations (prmt)
// instead of shifts — a rotation by a multiple of 8 is a pure byte shuffle.
__device__ __forceinline__
uint64_t ROTLby24 (uint64_t input)
{
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, lo, 0x0765;"
          "prmt.b32 lo, hi, lo, 0x4321;"
          "mov.b64 %0, {scr,lo};"
          "}" : "+l"(input) );
    return input;
}
  176. // TODO: ROTRby24!
  177.  
// 64-bit rotate-left by 16 via byte permutation (prmt) — note the repack
// here is {lo,scr}, i.e. reversed relative to ROTLby24's {scr,lo}.
__device__ __forceinline__
uint64_t ROTLby16 (uint64_t input)
{
    asm ( "{"
          "   .reg .b32 hi, lo, scr;"
          "   mov.b64 {hi,lo}, %0;"
          "   prmt.b32 scr, hi, lo, 0x5432;"
          "   prmt.b32 lo, hi, lo, 0x1076;"
          "   mov.b64 %0, {lo,scr};"
          " }" : "+l"(input) );
    return input;
}
  190.  
// 64-bit rotate-left by 8 via byte permutation (prmt).
// see if this is faster using x and y vectors, no extra regs
__device__ __forceinline__
uint64_t ROTLby8 (uint64_t input)
{
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, lo, 0x2107;"
          "prmt.b32 lo, hi, lo, 0x6543;"
          "mov.b64 %0, {scr,lo};"
          "}"
          : "+l"(input) );
    return input;
}
  205.  
// 64-bit rotate-right by 8 via byte permutation (prmt) — the prmt source
// operands are swapped (lo,hi) relative to ROTLby8 to reverse the direction.
__device__ __forceinline__
uint64_t ROTRby8 (uint64_t input)
{
      asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, lo, hi, 0x0765;"
          "prmt.b32 lo, lo, hi, 0x4321;"
          "mov.b64 %0, {scr,lo};"
          "}"
          : "+l"(input) );
    return input;
}
  219.  
// 64-bit rotate-left by exactly 1, with the funnel-shift magnitude baked in
// as an immediate (saves the mov of a runtime magnitude vs ROTL64asm).
// TODO: Look for a snappier way to do this. Should still be slightly
//       faster than the variable-magnitude version above.
__device__ __forceinline__
uint64_t ROTLby1 (uint64_t input)
{
    asm ("{"
         ".reg .b32 hi, lo, scr;"
         "mov.b64 {hi,lo}, %0;"
         "shf.l.wrap.b32 scr, lo, hi, 1;"   // magnitude replaced w/ immediate
         "shf.l.wrap.b32 lo, hi, lo, 1;"    // magnitude replaced w/ immediate
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) );
    return input;
}
  234.  
// 64-bit rotate-right by exactly 1 — immediate-magnitude mirror of ROTLby1.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTRby1 (uint64_t input)
{
    asm ("{"
         ".reg .b32 hi, lo, scr;"
         "mov.b64 {hi,lo}, %0;"
         "shf.r.wrap.b32 scr, hi, lo, 1;"
         "shf.r.wrap.b32 lo, lo, hi, 1;"
         "mov.b64 %0, {scr,lo};"
         "}" : "+l"(input) );
    return input;
}
  248.  
// XOR-reduce three 64-bit words: returns a ^ b ^ c (Keccak theta application).
__device__ __forceinline__
uint64_t xor3( uint64_t a, uint64_t b, uint64_t c )
{
  asm( "{"
       ".reg .b64 pad;"
       "  xor.b64 pad, %0, %1;"
       "  xor.b64 %0, pad, %2;"
       "}" : "+l"(a) : "l"(b), "l"(c) );
  return a;
}
  259.  
  260.  
// Keccak chi step: a ^ (~b & c).
// On SM 5.x+ this maps to a single lop3 per 32-bit half — the immediate 0xD2
// is the 3-input truth table of a ^ (~b & c). The unpack order of the halves
// doesn't matter since the same lop3 is applied to both.
// FIXME: Assuming SM 5.x+
__device__ __forceinline__
uint64_t chi( uint64_t a, uint64_t b, uint64_t c )
{
#if __CUDA_ARCH__ >= 500
  asm( "{"
       "  .reg .b32 ah, al, bh, bl, ch, cl;"
       "  mov.b64 {ah,al}, %0;"
       "  mov.b64 {bh,bl}, %1;"
       "  mov.b64 {ch,cl}, %2;"

       "  lop3.b32 ah, ah, bh, ch, 0xD2;"
       "  lop3.b32 al, al, bl, cl, 0xD2;"
       "  mov.b64 %0, {ah,al};"
       "}" : "+l"(a) : "l"(b), "l"(c) );
  return a;
#else
  // Pre-Maxwell fallback: plain C formulation of the same expression.
  return a ^ ((~b) & c);
#endif
}
  281.  
// Permutations Testing Function
// Debug-only scaffold for experimenting with prmt-based rotations: it
// OVERWRITES its argument with a fixed test pattern, applies two candidate
// permutation sequences, and prints intermediates via device printf.
// Not called from the mining path (the call site in keccak() is commented out).
__device__ __forceinline__
uint64_t PermTest ( uint64_t input )
{
    input = 0x11223344AABBCCDD;             // 16L and 16R rotations equal ??

    printf ("Input   : %" PRIx64 " \n", input);
    //printf ("Goal_08L: %" PRIx64 " \n", ROTL64(input, 8));
    //printf ("Goal_16L: %" PRIx64 " \n", ROTL64(input, 16));
    //printf ("Goal_24L: %" PRIx64 " \n", ROTL64(input, 24));
    //printf ("Goal_32L: %" PRIx64 " \n\n", ROTL64(input, 32));
    //printf ("Goal_08R: %" PRIx64 " \n", ROTR64(input, 8));
    //printf ("Goal_16R: %" PRIx64 " \n", ROTR64(input, 16));
    printf ("Goal_24R: %" PRIx64 " \n", ROTR64(input, 24));
    //printf ("Goal_32R: %" PRIx64 " \n\n", ROTR64(input, 32));

    // Candidate A: two prmt instructions mixing both halves.
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, lo, lo, 0x1320;"
          "prmt.b32 lo, hi, lo, 0x0321;"
          "mov.b64 %0, {scr,lo};"
          "}" : "+l"(input) );

    printf ("PermA: %" PRIx64 " \n", input);

    // Candidate B: per-half byte shuffles against zero.
    asm ( "{"
          ".reg .b32 hi, lo, scr;"
          "mov.b64 {hi,lo}, %0;"
          "prmt.b32 scr, hi, 0, 0x3120;"
          "prmt.b32 lo, lo, 0, 0x3120;"
          "mov.b64 %0, {scr, lo};"
          "}"
          : "+l"(input) );
    printf ("PermB: %" PRIx64 " \n\n", input);

    return input;
}
  320.  
  321. __device__
  322. bool keccak( uint64_t nounce, uint32_t thread, uint64_t i_mid[] )
  323. {
  324.   uint64_t state[25], C[5], D[5];
  325.  
  326.   // Round Constants
  327.   uint64_t RClocal[24] =
  328.   {
  329.     /* Element     (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22)      */
  330.     /* -------     ------------------  ------------------  ------------------ */
  331.     /* 00..02  */  0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
  332.     /* 03..05  */  0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
  333.     /* 06..08  */  0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
  334.     /* 09..11  */  0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
  335.     /* 12..14  */  0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
  336.     /* 15..17  */  0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
  337.     /* 18..20  */  0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
  338.     /* 21..23  */  0x8000000000008080, 0x0000000080000001, 0x8000000080008008
  339.     /* -------     ------------------  ------------------  ------------------ */
  340.   }; uint2* RCvec = (uint2*)&RClocal;  // vectorized access to RClocal[]
  341.  
  342.   uint2* stateVec = (uint2*)&state;    // vectorized access to state[]
  343.  
  344.   //if (thread == 543210)
  345.   //    PermTest (0);
  346.  
  347.   C[0] = i_mid[ 2] ^ ROTR64asm(nounce, 20);
  348.   C[1] = i_mid[ 4] ^ ROTL64asm(nounce, 14);
  349.  
  350.   state[ 0] = chi( i_mid[ 0], i_mid[ 1], C[ 0] );
  351. #if HB_SELECTIVE_32BIT_XOR
  352.   stateVec[0].x = stateVec[0].x ^ 1;                // only the relevant half (see: RClocal[0])
  353. #else state[0] = state[0] ^ 0x0000000000000001;     // formerly RC[0], now an immediate
  354. #endif
  355.  
  356.   state[ 1] = chi( i_mid[ 1], C[ 0], i_mid[ 3] );
  357.   state[ 2] = chi( C[ 0], i_mid[ 3], C[ 1] );
  358.   state[ 3] = chi( i_mid[ 3], C[ 1], i_mid[ 0] );
  359.   state[ 4] = chi( C[ 1], i_mid[ 0], i_mid[ 1] );
  360.  
  361.   C[0] = i_mid[ 6] ^ ROTL64asm(nounce, 20);
  362.   C[1] = i_mid[ 9] ^ ROTR64(nounce,  2);
  363.   state[ 5] = chi( i_mid[ 5], C[ 0], i_mid[7] );
  364.   state[ 6] = chi( C[0], i_mid[ 7], i_mid[8] );
  365.   state[ 7] = chi( i_mid[ 7], i_mid[ 8], C[1] );
  366.   state[ 8] = chi( i_mid[ 8], C[1], i_mid[5] );
  367.   state[ 9] = chi( C[1], i_mid[ 5], C[0] );
  368.  
  369.   C[0] = i_mid[11] ^ nounce*128;                    //ROTL64(nounce, 7);
  370.   C[1] = i_mid[13] ^ ROTLby8(nounce);
  371.   state[10] = chi( i_mid[10], C[0], i_mid[12] );
  372.   state[11] = chi( C[0], i_mid[12], C[1] );
  373.   state[12] = chi( i_mid[12], C[1], i_mid[14] );
  374.   state[13] = chi( C[1], i_mid[14], i_mid[10] );
  375.   state[14] = chi( i_mid[14], i_mid[10], C[0] );
  376.  
  377.   C[0] = i_mid[15] ^ ROTL64asm(nounce, 27);
  378.   C[1] = i_mid[18] ^ ROTLby16 (nounce);
  379.   state[15] = chi( C[0], i_mid[16], i_mid[17] );
  380.   state[16] = chi( i_mid[16], i_mid[17], C[1] );
  381.   state[17] = chi( i_mid[17], C[1], i_mid[19] );
  382.   state[18] = chi( C[1], i_mid[19], C[0] );
  383.   state[19] = chi( i_mid[19], C[0], i_mid[16] );
  384.  
  385.   C[0] = i_mid[20] ^ ROTRby1(nounce);
  386.   C[1] = i_mid[21] ^ ROTRby1(ROTRby8(nounce));      //ROTRby1(ROTRby8(nounce)); Fixme? EXPERIMENTAL :)
  387.   C[2] = i_mid[22] ^ ROTR64(nounce, 25);
  388.   state[20] = chi( C[0], C[1], C[2] );
  389.   state[21] = chi( C[1], C[2], i_mid[23] );
  390.   state[22] = chi( C[2], i_mid[23], i_mid[24] );
  391.   state[23] = chi( i_mid[23], i_mid[24], C[0] );
  392.   state[24] = chi( i_mid[24], C[0], C[1] );
  393.  
  394. #if __CUDA_ARCH__ >= 350
  395. #  pragma unroll
  396. #endif
  397.   for( int32_t i{ 1 }; i < 23; ++i )
  398.   {
  399.       // Theta
  400.       for( uint32_t x{ 0 }; x < 5; ++x )
  401.       {
  402.         C[(x + 6) % 5] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
  403.       }
  404.  
  405.   #if __CUDA_ARCH__ >= 350
  406.       for( uint32_t x{ 0 }; x < 5; ++x )
  407.       {
  408.         D[x] = ROTL64(C[(x + 2) % 5], 1);
  409.         state[x]      = xor3( state[x]     , D[x], C[x] );
  410.         state[x +  5] = xor3( state[x +  5], D[x], C[x] );
  411.         state[x + 10] = xor3( state[x + 10], D[x], C[x] );
  412.         state[x + 15] = xor3( state[x + 15], D[x], C[x] );
  413.         state[x + 20] = xor3( state[x + 20], D[x], C[x] );
  414.       }
  415.   #else
  416.       for( uint32_t x{ 0 }; x < 5; ++x )
  417.       {
  418.         D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
  419.         state[x]      = state[x]      ^ D[x];
  420.         state[x +  5] = state[x +  5] ^ D[x];
  421.         state[x + 10] = state[x + 10] ^ D[x];
  422.         state[x + 15] = state[x + 15] ^ D[x];
  423.         state[x + 20] = state[x + 20] ^ D[x];
  424.       }
  425.   #endif
  426.  
  427.       // Rho Pi
  428.       C[0] = state[1];
  429.       state[ 1] = ROTR64asm( state[ 6], 20 );           //ROTR64asm( state[ 6], 20 );
  430.       state[ 6] = ROTL64asm( state[ 9], 20 );
  431.       state[ 9] = ROTR64( state[22],  3 );
  432.       state[22] = ROTR64asm( state[14], 25 );
  433.       state[14] = ROTL64asm( state[20], 18 );           //ROTL64asm( ROTLby16(state[20]), 2 );
  434.       state[20] = ROTR64( state[ 2],  2 );
  435.       state[ 2] = ROTR64asm( state[12], 21 );
  436.       state[12] = ROTL64asm( state[13], 25 );
  437.       state[13] = ROTLby8( state[19] );
  438.       state[19] = ROTRby8( state[23] );
  439.       state[23] = ROTR64asm( state[15], 23 );
  440.       state[15] = ROTL64asm( state[ 4], 27 );
  441.       state[ 4] = ROTL64asm( state[24], 14 );
  442.       state[24] = ROTL64( state[21],  2 );
  443.       state[21] = ROTR64asm( state[ 8], 9 );            // R9
  444.       state[ 8] = ROTR64asm( state[16], 19 );
  445.       state[16] = ROTR64asm (state[5], 28);     // R28
  446.       state[ 5] = ROTL64asm(ROTLby24(state[ 3]), 4);     // L28
  447.       state[ 3] = ROTL64asm( state[18], 21 );
  448.       state[18] = ROTL64asm( state[17], 15 );
  449.       state[17] = ROTL64asm( state[11], 10 );
  450.       state[11] = ROTL64asm( state[ 7],  6 );
  451.       state[ 7] = ROTL64asm( state[10],  3 );
  452.       state[10] = ROTLby1( C[0] );
  453.  
  454.       // Chi
  455.       for( uint32_t x{ 0 }; x < 25; x += 5 )
  456.       {
  457.         C[0] = state[x];
  458.         C[1] = state[x + 1];
  459.         C[2] = state[x + 2];
  460.         C[3] = state[x + 3];
  461.         C[4] = state[x + 4];
  462.         state[x]     = chi( C[0], C[1], C[2] );
  463.         state[x + 1] = chi( C[1], C[2], C[3] );
  464.         state[x + 2] = chi( C[2], C[3], C[4] );
  465.         state[x + 3] = chi( C[3], C[4], C[0] );
  466.         state[x + 4] = chi( C[4], C[0], C[1] );
  467.       }
  468.  
  469.       // Iota
  470.       // FIXME/TODO: If you get a lot of bad shares, consider commenting this out (leave the else{} part)
  471.       // RClocal[1,4,5,8..12,18,22] are half-zeroes: do a 32-bit XOR of the relevant bits only.
  472.       if (i == 1 || i == 4 || i == 5 || i == 8 || i == 9 || i == 10 || i == 11 || i == 12 || i == 18 || i == 22)
  473.         //asm ("xor.b32 %0, %0, %1;" : "+r"(stateVec[0].x) : "r"(RCvec[i].x) );
  474.         stateVec[0].x = stateVec[0].x ^ RCvec[i].x;
  475.       else
  476.           state[0] = state[0] ^ RClocal[i];
  477.  
  478.   } // end of big-ass for() loop
  479.  
  480.   for( uint32_t x{ 0 }; x < 5; ++x )
  481.   {
  482.     C[(x + 6) % 5 ] = xor5( state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20] );
  483.   }
  484.  
  485.   D[0] = ROTL64(C[2], 1);
  486.   D[1] = ROTL64(C[3], 1);
  487.   D[2] = ROTL64(C[4], 1);
  488.  
  489.   state[ 0] = xor3( state[ 0], D[0], C[0] );
  490.   state[ 6] = ROTR64asm(xor3( state[ 6], D[1], C[1] ), 20);
  491.   state[12] = ROTR64asm(xor3( state[12], D[2], C[2] ), 21);
  492.  
  493.   state[ 0] = chi( state[ 0], state[ 6], state[12] ) ^ 0x8000000080008008; // RC[23];
  494.  
  495.   return bswap_64( state[0] ) <= d_target;
  496. }
  497.  
  498. KERNEL_LAUNCH_PARAMS
  499. void gpu_mine( uint64_t* solution, uint64_t cnt )
  500. {
  501.   uint64_t thread = blockDim.x * blockIdx.x + threadIdx.x;
  502.   uint64_t nounce{ cnt + thread };
  503.   uint64_t passingMid[25];
  504.  
  505.   for (uint8_t i=0; i<25; ++i)
  506.       passingMid[i] = d_mid[i];
  507.  
  508.   if( keccak(nounce, thread, passingMid) )
  509.   {
  510.     *solution = nounce;
  511.     return;
  512.   }
  513. }
  514.  
// Intended to signal the miner to abandon the current search.
// Currently a no-op: the h_done flag mechanism it would set is commented out.
__host__
void stop_solving()
{
  // h_done[0] = -2;
}
  520.  
// Returns the running total of nounces dispatched to the GPU so far
// (advanced by `threads` per launch in find_message()).
__host__
uint64_t getHashCount()
{
  return cnt;
}
  526.  
// Resets the per-challenge counters: writes the UINT64_MAX "no solution yet"
// sentinel into the device solution buffer, zeroes the hashrate counters, and
// restarts the wall-clock timer used by the rate display.
__host__
void resetHashCount()
{
  cudaSetDevice( cuda_device );

  *h_message = UINT64_MAX;
  cudaMemcpy( d_solution, h_message, sizeof( uint64_t ), cudaMemcpyHostToDevice );

  printable_hashrate_cnt = 0;
  print_counter = 0;

  ftime( &start );
}
  540.  
// Precomputes the Keccak midstate on the host and uploads it (plus the
// difficulty target) to __constant__ memory. `message` is the 52-byte hash
// prefix viewed as 64-bit words; the XORed constants 0x100000000 and
// 0x8000...000 are the Keccak padding bits for this fixed message length.
// The mid[] lanes are the theta/rho results that do NOT depend on the nounce
// — keccak() on the device finishes the nounce-dependent lanes per attempt.
__host__
void send_to_device( uint64_t target, uint64_t* message )
{
  cudaSetDevice( cuda_device );

  uint64_t C[4], D[5], mid[25];
  C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
  C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
  C[2] = message[2] ^ message[7];
  C[3] = message[4] ^ message[9];

  D[0] = ROTL64(C[1], 1) ^ C[3];
  D[1] = ROTL64(C[2], 1) ^ C[0];
  D[2] = ROTL64(message[3], 1) ^ C[1];
  D[3] = ROTL64(C[3], 1) ^ C[2];
  D[4] = ROTL64(C[0], 1) ^ message[3];

  mid[ 0] = message[ 0] ^ D[0];
  mid[ 1] = ROTL64( message[6] ^ D[1], 44 );
  mid[ 2] = ROTL64(D[2], 43);
  mid[ 3] = ROTL64(D[3], 21);
  mid[ 4] = ROTL64(D[4], 14);
  mid[ 5] = ROTL64( message[3] ^ D[3], 28 );
  mid[ 6] = ROTL64( message[9] ^ D[4], 20 );
  mid[ 7] = ROTL64( message[10] ^ D[0] ^ 0x100000000ull, 3 );
  mid[ 8] = ROTL64( 0x8000000000000000ull ^ D[1], 45 );
  mid[ 9] = ROTL64(D[2], 61);
  mid[10] = ROTL64( message[1] ^ D[1],  1 );
  mid[11] = ROTL64( message[7] ^ D[2],  6 );
  mid[12] = ROTL64(D[3], 25);
  mid[13] = ROTL64(D[4],  8);
  mid[14] = ROTL64(D[0], 18);
  mid[15] = ROTL64( message[4] ^ D[4], 27 );
  mid[16] = ROTL64( message[5] ^ D[0], 36 );
  mid[17] = ROTL64(D[1], 10);
  mid[18] = ROTL64(D[2], 15);
  mid[19] = ROTL64(D[3], 56);
  mid[20] = ROTL64( message[2] ^ D[2], 62 );
  mid[21] = ROTL64(D[3], 55);
  mid[22] = ROTL64(D[4], 39);
  mid[23] = ROTL64(D[0], 41);
  mid[24] = ROTL64(D[1],  2);

  cudaMemcpyToSymbol( d_mid, mid, sizeof( mid ), 0, cudaMemcpyHostToDevice);

  cudaMemcpyToSymbol( d_target, &target, sizeof( target ), 0, cudaMemcpyHostToDevice);
}
  588.  
  589. /**
  590.  * Initializes the global variables by calling the cudaGetDeviceProperties().
  591.  */
  592. __host__
  593. void gpu_init()
  594. {
  595.   cudaDeviceProp device_prop;
  596.   int32_t device_count;
  597.  
  598.   char config[10];
  599.   FILE * inf;
  600.   inf = fopen( "0xbtc.conf", "r" );
  601.   if( inf )
  602.   {
  603.     fgets( config, 10, inf );
  604.     fclose( inf );
  605.     intensity = atol( strtok( config, " " ) );
  606.     cuda_device = atol( strtok( NULL, " " ) );
  607.     printf ("\n\nRead ./0xbtc.conf - using custom intensity %d and CUDA device %d.", intensity, cuda_device);
  608.   }
  609.   else
  610.   {
  611.     intensity = INTENSITY;
  612.     cuda_device = CUDA_DEVICE;
  613.     printf ("\n\nNo ./0xbtc.conf - using hardcoded intensity %d and CUDA device %d.", intensity, cuda_device);
  614.   }
  615.  
  616.   cudaGetDeviceCount( &device_count );
  617.  
  618.   if( cudaGetDeviceProperties( &device_prop, cuda_device ) != cudaSuccess )
  619.   {
  620.     printf( "Problem getting properties for device, exiting...\n" );
  621.     exit( EXIT_FAILURE );
  622.   }
  623.  
  624.   cudaSetDevice( cuda_device );
  625.  
  626.   if( !gpu_initialized )
  627.   {
  628.     // CPU usage goes _insane_ without this.
  629.     cudaDeviceReset();
  630.     cudaSetDeviceFlags( cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax );
  631.     cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
  632.  
  633.     printf ("\nInitialized CUDA device %d : ", cuda_device);
  634.     printf (device_prop.name);
  635.     printf ("\nAvailable compute capability: %d.%d \n\n", device_prop.major, device_prop.minor);
  636.     printf ("Now mining. ");   // not really but we're about to be, and don't want this msg repeating
  637.  
  638.     cudaMalloc( (void**)&d_solution, sizeof( uint64_t ) ); // solution
  639.     cudaMallocHost( (void**)&h_message, sizeof( uint64_t ) );
  640.  
  641.     (uint32_t&)(init_message[52]) = 014533075101u;
  642.     (uint32_t&)(init_message[56]) = 014132271150u;
  643.  
  644.     srand((time(NULL) & 0xFFFF) | (getpid() << 16));
  645.     for(int8_t i_rand{ 60 }; i_rand < 84; ++i_rand){
  646.       init_message[i_rand] = (uint8_t)rand() % 256;
  647.     }
  648.     memcpy( solution, &init_message[52], 32 );
  649.  
  650.     uint32_t h_threads{ 1u << intensity };
  651.     cudaMemcpyToSymbol( threads, &h_threads, sizeof( h_threads ), 0, cudaMemcpyHostToDevice );
  652.  
  653.     gpu_initialized = true;
  654.   }
  655.  
  656.   compute_version = device_prop.major * 100 + device_prop.minor * 10;
  657.  
  658.   // convert from GHz to hertz
  659.   clock_speed = (int32_t)( device_prop.memoryClockRate * 1000 * 1000 );
  660.  
  661.   //cnt = 0;
  662.   printable_hashrate_cnt = 0;
  663.   print_counter = 0;
  664.  
  665.   ftime( &start );
  666.   if( new_input ) new_input = false;
  667. }
  668.  
// Installs a new 52-byte hash prefix and difficulty target: copies the prefix
// into init_message and regenerates/uploads the device midstate.
__host__
void update_mining_inputs( uint64_t target, uint8_t* hash_prefix )
{
  memcpy( init_message, hash_prefix, 52 );
  send_to_device( target, (uint64_t*)init_message );
}
  675.  
  676. __host__
  677. void gpu_cleanup()
  678. {
  679.   cudaSetDevice( cuda_device );
  680.  
  681.   cudaThreadSynchronize();
  682.  
  683.   cudaFree( d_solution );
  684.   cudaFreeHost( h_message );
  685.  
  686.   cudaDeviceReset();
  687. }
  688.  
// Launches one batch of gpu_mine() (1u << intensity candidate nounces),
// advances the nounce/hashrate counters, copies the result back, and updates
// the ANSI-escape status line roughly 10x per second.
// Returns true once the device has written a nounce (i.e. *h_message is no
// longer the UINT64_MAX sentinel set by resetHashCount()).
__host__
bool find_message()
{
  cudaSetDevice( cuda_device );

  uint32_t threads{ 1u << intensity };   // shadows the __constant__ of the same name (host-side copy)

  // Threads-per-block by compute capability; grid sized by ceiling division.
  uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
  dim3 grid{ (threads + tpb - 1) / tpb };
  dim3 block{ tpb };

  gpu_mine <<< grid, block >>> ( d_solution, cnt );
  // cudaError_t cudaerr = cudaDeviceSynchronize();
  // if( cudaerr != cudaSuccess )
  // {
  //  printf( "kernel launch failed with error %d: \x1b[38;5;196m%s.\x1b[0m\n", cudaerr, cudaGetErrorString( cudaerr ) );
  //  exit( EXIT_FAILURE );
  // }

  cnt += threads;
  printable_hashrate_cnt += threads;

  // Blocking copy doubles as synchronization with the launch above.
  cudaMemcpy( h_message, d_solution, sizeof( uint64_t ), cudaMemcpyDeviceToHost );
  if( *h_message != UINT64_MAX )
    memcpy( &solution[12], h_message, sizeof( uint64_t ) );   // splice nounce into solution bytes 12..19

  ftime( &end );
  double t{ (double)((end.time * 1000 + end.millitm) - (start.time * 1000 + start.millitm)) / 1000 };

  // Refresh the display at most every 0.1 s of elapsed time.
  if( t*10 > print_counter )
  {
    ++print_counter;

    // maybe breaking the control codes into macros is a good idea . . .
    printf( "\x1b[s\x1b[?25l\x1b[2;22f\x1b[38;5;221m%*.2f\x1b[0m\x1b[u\x1b[?25h"
            "\x1b[s\x1b[?25l\x1b[3;36f\x1b[38;5;208m%*" PRIu64 "\x1b[0m\x1b[u\x1b[?25h"
            "\x1b[s\x1b[?25l\x1b[2;75f\x1b[38;5;33m%02u:%02u\x1b[0m\x1b[u\x1b[?25h",
            8, ( (double)printable_hashrate_cnt / t / 1000000 ),
            25, printable_hashrate_cnt,
            ((uint32_t)t/60), ((uint32_t)t%60) );
  }

  return ( *h_message != UINT64_MAX );
  // return ( h_done[0] >= 0 );
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement