Advertisement
starchyfort

Untitled

Aug 4th, 2018
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 39.55 KB | None | 0 0
  1. #include "cuda.h"                        // check for dups ...
  2. #include "cuda_runtime.h"
  3. #include <iostream>
  4. #include "device_launch_parameters.h"
  5. #include <stdint.h>                        // check for dup in cpp, see if it matters or is necessary
  6. #include <inttypes.h>                    // same check for dups
  7. #include <stdio.h>                        // ditto
  8. #include <time.h>
  9. #include <sys/timeb.h>                    // TODO: double-check linux compatibility for future port/build, should be fine
  10. #include "Windows.h"
  11.  
  12. #include <string>
  13.  
  14. #define        CUDA_HOWMANY_DEVICES    2
  15. #define        CUDA_MAX_DEVICES        16
  16.  
  17. using namespace std;
  18.  
  19. extern bool gSolnFound[CUDA_MAX_DEVICES];
  20. extern string gStr_Hashrate[CUDA_MAX_DEVICES];
  21.  
// Global solver state shared with the host-side (non-CUDA) translation units.
// TODO: initialization, including nullptr ?
bool    gSolving = false;                               // if set to false, solving stops, then CUDA shutdown
uint64_t* d_solution;                                   // device-side solution slot (see handle_kernels)
uint8_t* d_challenge;                                   // NOTE(review): never used in this file -- confirm owner
uint8_t* d_hash_prefix;                                 // NOTE(review): never used in this file -- confirm owner
__constant__ uint64_t    d_mid[CUDA_MAX_DEVICES][25];   // per-device precomputed Keccak midstate (read by keccak())
__constant__ uint64_t    d_target[CUDA_MAX_DEVICES];    // per-device difficulty target (compared in keccak())
__constant__ uint32_t    threads[CUDA_MAX_DEVICES];     // NOTE(review): shadowed by a local `threads` in handle_kernels

extern uint8_t gSolution[CUDA_MAX_DEVICES][32];                                    // stores sol'n after a successful solve. (TODO: sol'ns queue)
                                                                                // see equivalent (actual) declaration in main_includes.h
extern double cuda_solvetime[CUDA_MAX_DEVICES];                                    // stores time taken this solve


uint64_t    printable_hashrate_cnt[CUDA_MAX_DEVICES] = { 0 };   // total hashes attempted, for the HUD hashrate

// function prototypes
extern void print_bytes(uint8_t inArray[], uint8_t len);        // defined in C/C++ code compiled by non-CUDA compiler
extern void EnqueueSolution();                                    // defined in comms_pool.h
int cuda_shutdown();                                            // later in this file


// Portable 64-bit rotates (used as fallbacks when shf.wrap PTX is unavailable).
// NOTE(review): these invoke UB for y == 0 or y == 64 (shift by 64); all call
// sites in this file use 1..63, but confirm before reusing elsewhere.
#define ROTL64(x, y) (((x) << (y)) ^ ((x) >> (64 - (y))))
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))

#define            DEF_CUDA_DEVICE_INDEX_DEFAULT         0
#define            DEF_CUDA_DEVICE_INTENSITY            24

cudaDeviceProp  cdp_chosenDevice[CUDA_MAX_DEVICES];                        // TODO: initialize with appropriate null
uint8_t            gCudaDeviceIntensities[CUDA_MAX_DEVICES] = { 0 };      // log2 of threads launched per device

int                device_count = 0;                                      // set by cuda_device_init()
extern string    gpuName[CUDA_MAX_DEVICES];
extern string    gStr_HashCount[CUDA_MAX_DEVICES];

int                gCuda_Device0Intensity = DEF_CUDA_DEVICE_INTENSITY;        // Should be overwritten by real setting (config file, OptionsForm.)
uint64_t        cudaDeviceClockSpeed[CUDA_MAX_DEVICES] = { 0 };            // TODO: an array of devices
// NOTE(review): uint8_t cannot hold major*100 + minor*10 (e.g. CC 6.1 -> 610);
// the value stored by cuda_device_init() truncates. Widen this type.
uint8_t            cudaDeviceComputeCapability[CUDA_MAX_DEVICES] = { 0 };    // TODO: likewise, and default fallback just in case
bool            cudaDeviceIsInitialized[CUDA_MAX_DEVICES] = { 0 };        // TODO: an array of devices

uint32_t        numThreads[CUDA_MAX_DEVICES] = { 0 };
uint64_t        theCounter = 0;                            // global nonce counter. TODO: multiple devices, independent or unified counter?
uint64_t        printableHashrateCounter[CUDA_MAX_DEVICES] = { 0 };        // ^^
uint64_t        printableCounter[CUDA_MAX_DEVICES] = { 0 };                // ^^
uint64_t        print_timer[CUDA_MAX_DEVICES] = { 0 };                    // TODO: rename these- they're too confusingly similar ;)

extern uint8_t  solution[CUDA_MAX_DEVICES][32];

// TODO: initialize? (nullptr)
// TODO: ** we need to reset to UINT64_MAX each time we get a sol'n! IMPT Reminder !! **
extern uint64_t* h_message[CUDA_MAX_DEVICES];            // host-side solution buffer; init to nullptr? (FIXME)

struct timeb    tStart[CUDA_MAX_DEVICES], tEnd[CUDA_MAX_DEVICES];                            // per-device solve timers
                                                        // TODO/FIXME: This shouldn't be necessary- work out make options which don't require it
//#include "cuda_funcs.h"
  77.  
  78.  
  79. //
  80. // CUDACALLCHECKERROR: Processes result (type cudaError) of a CUDA call. Provides error-checking
  81. uint8_t cudaCallCheckError(cudaError theResult)
  82. {
  83.     if (theResult == cudaSuccess)
  84.     {
  85.         printf("Successful!\n");
  86.         return 0; // no error
  87.     }
  88.     else
  89.     {
  90.         printf("Error.\nCUDA Result: %s \n", cudaGetErrorString(theResult));
  91.         return 1; // for error counters
  92.     }
  93. }
  94.  
  95.  
  96.  
__device__ __forceinline__
uint64_t bswap_64(uint64_t input)
{
    // Full 64-bit byte swap (endianness flip) via two PRMT byte-permutes.
    // NOTE(review): PTX `mov.b64 {a,b}, x` unpacks the LOW half into the first
    // register, so `oh` actually receives the low word and `ol` the high one.
    // The re-pack `{ol,oh}` exchanges them again, so the net result is still a
    // correct byte swap -- only the register names are misleading.
    asm("{"
        "  .reg .u32 oh, ol;"
        "  mov.b64 {oh,ol}, %0;"
        "  prmt.b32 oh, oh, 0, 0x0123;"     // reverse the four bytes of this word
        "  prmt.b32 ol, ol, 0, 0x0123;"     // reverse the four bytes of this word
        "  mov.b64 %0, {ol,oh};"            // re-pack with halves exchanged
        "}" : "+l"(input));
    return input;
}
  109.  
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTL64asm(uint64_t input, uint32_t magnitude)
{
    // 64-bit rotate-left built from two 32-bit funnel shifts (shf.l.wrap),
    // available on compute capability 3.2+; older parts use the macro fallback.
    // NOTE(review): shf wraps the shift count at 32, so this is a true 64-bit
    // rotate only for 0 < magnitude < 32. All call sites in this file comply;
    // confirm before reusing with larger magnitudes (see ROTLfrom32 for those).
#if __CUDA_ARCH__ >= 320
    asm("{"
        ".reg .b32 hi, lo, mag, scr;"
        "mov.b32 mag, %1;"
        "mov.b64 {hi,lo}, %0;"
        "shf.l.wrap.b32 scr, lo, hi, mag;"
        "shf.l.wrap.b32 lo, hi, lo, mag;"
        "mov.b64 %0, {scr,lo};"
        "}" : "+l"(input) : "r"(magnitude));
    return input;
#else
    // Pre-SM32 fallback: plain shift/XOR macro.
    return ROTL64(input, magnitude);
#endif
}
  128.  
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTR64asm(uint64_t input, uint32_t magnitude)
{
    // 64-bit rotate-right from two 32-bit funnel shifts (shf.r.wrap, SM 3.2+).
    // NOTE(review): like ROTL64asm, valid only for 0 < magnitude < 32 because
    // shf wraps the count at 32; all call sites here comply.
    // TODO/FIXME: verify correct version is running on multiple arches
#if __CUDA_ARCH__ >= 320
    asm("{"
        ".reg .b32 hi, lo, mag, scr;"
        "mov.b32 mag, %1;"
        "mov.b64 {hi,lo}, %0;"
        "shf.r.wrap.b32 scr, hi, lo, mag;"
        "shf.r.wrap.b32 lo, lo, hi, mag;"
        "mov.b64 %0, {scr,lo};"
        "}" : "+l"(input) : "r"(magnitude));
    return input;
#else
    // Pre-SM32 fallback: plain shift/XOR macro.
    return ROTR64(input, magnitude);
#endif
}
  148.  
// Five-way XOR (a ^ b ^ c ^ d ^ e) -- the column-parity computation of the
// Keccak theta step. Written as four chained xor.b64 ops accumulating into `a`.
__device__ __forceinline__
uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
{
    asm("  xor.b64 %0, %0, %1;" : "+l"(a) : "l"(b));
    asm("  xor.b64 %0, %0, %1;" : "+l"(a) : "l"(c));
    asm("  xor.b64 %0, %0, %1;" : "+l"(a) : "l"(d));
    asm("  xor.b64 %0, %0, %1;" : "+l"(a) : "l"(e));
    return a;
}
  158.  
// shortcut to rotation by 32 (flip halves), then rotate left by `mag`
// Net effect: ROTL64(input, 32 + magnitude). The free rotate-by-32 comes from
// unpacking the input with the {lo,hi} order reversed; the funnel shifts then
// add the remaining 0 < magnitude < 32. (Matches the ROTR64(x, 28) fallback
// used at its call site, since ROTL by 36 == ROTR by 28.)
__device__ __forceinline__
uint64_t ROTLfrom32(uint64_t rtdby32, uint32_t magnitude)
{
    asm("{"
        "    .reg .b32 hi, lo, scr, mag;       "
        "    mov.b64 {lo,hi}, %0;              "      // halves reversed since rotl'd by 32
        "    mov.b32 mag, %1;                  "
        "    shf.l.wrap.b32 scr, lo, hi, mag;  "
        "    shf.l.wrap.b32 lo, hi, lo, mag;   "
        "    mov.b64 %0, {scr,lo};             "
        "}" : "+l"(rtdby32) : "r"(magnitude));    // see if this is faster w/ uint2 .x and .y
                                                  // for saving shf results out
    return rtdby32;   // return rotation from the rotation by 32
}
  174.  
// shortcut to rotation by 32 (flip halves), then rotate right by `mag`
// Net effect: ROTR64(input, 32 + magnitude), via the same halves-swapped
// unpack trick as ROTLfrom32. (Matches the ROTL64(x, 27) fallback used at its
// call site, since ROTR by 37 == ROTL by 27.)
__device__ __forceinline__
uint64_t ROTRfrom32(uint64_t rtdby32, uint32_t magnitude)
{
    asm("{"
        "    .reg .b32 hi, lo, scr, mag;       "
        "    mov.b64 {lo,hi}, %0;              "      // halves reversed since rotl'd by 32
        "    mov.b32 mag, %1;                  "
        "    shf.r.wrap.b32 scr, hi, lo, mag;  "
        "    shf.r.wrap.b32 lo, lo, hi, mag;   "
        "    mov.b64 %0, {scr,lo};             "
        "}" : "+l"(rtdby32) : "r"(magnitude));    // see if this is faster w/ uint2 .x and .y
                                                  // for saving shf results out
    return rtdby32;   // return rotation from the rotation by 32
}
  190.  
// Rotate left by 16: a rotation by a whole number of bytes, so it is done with
// two PRMT byte-permutes instead of shifts (selector nibbles pick bytes across
// the concatenated 64-bit input).
__device__ __forceinline__
uint64_t ROTLby16(uint64_t input)
{
    asm("{"
        "   .reg .b32 hi, lo, scr;"
        "   mov.b64 {hi,lo}, %0;"
        "   prmt.b32 scr, hi, lo, 0x5432;"   // output bytes 5,4,3,2 -> new high word
        "   prmt.b32 lo, hi, lo, 0x1076;"    // output bytes 1,0,7,6 -> new low word
        "   mov.b64 %0, {lo,scr};"
        " }" : "+l"(input));
    return input;
}
  203.  
// see if this is faster using x and y vectors, no extra regs
// Rotate left by 8 (one byte) via PRMT byte-permutes -- cheaper than a pair of
// funnel shifts for byte-aligned rotations.
__device__ __forceinline__
uint64_t ROTLby8(uint64_t input)
{
    asm("{"
        ".reg .b32 hi, lo, scr;"
        "mov.b64 {hi,lo}, %0;"
        "prmt.b32 scr, hi, lo, 0x2107;"
        "prmt.b32 lo, hi, lo, 0x6543;"
        "mov.b64 %0, {scr,lo};"
        "}"
        : "+l"(input));
    return input;
}
  218.  
// Rotate right by 8 (one byte) via PRMT byte-permutes; mirror of ROTLby8 with
// the source operands and selectors reversed.
__device__ __forceinline__
uint64_t ROTRby8(uint64_t input)
{
    asm("{"
        ".reg .b32 hi, lo, scr;"
        "mov.b64 {hi,lo}, %0;"
        "prmt.b32 scr, lo, hi, 0x0765;"
        "prmt.b32 lo, lo, hi, 0x4321;"
        "mov.b64 %0, {scr,lo};"
        "}"
        : "+l"(input));
    return input;
}
  232.  
// TODO: Look for a snappier way to do this. Should still be slightly
//       faster than the variable-magnitude version above.
// Rotate left by 1: same funnel-shift pattern as ROTL64asm but with the
// shift count baked in as an immediate, saving the mag register.
__device__ __forceinline__
uint64_t ROTLby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
    asm("{"
        ".reg .b32 hi, lo, scr;"
        "mov.b64 {hi,lo}, %0;"
        "shf.l.wrap.b32 scr, lo, hi, 1;"   // magnitude replaced w/ immediate
        "shf.l.wrap.b32 lo, hi, lo, 1;"    // magnitude replaced w/ immediate
        "mov.b64 %0, {scr,lo};"
        "}" : "+l"(input));
    return input;
#else
    // Pre-SM32 fallback: plain shift/XOR macro.
    return ROTL64(input, 1);
#endif
}
  251.  
// try doing this with two offsettings of output operand instead
// Rotate right by 1: funnel shifts with an immediate count; mirror of ROTLby1.
__device__ __forceinline__
uint64_t ROTRby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
    asm("{"
        ".reg .b32 hi, lo, scr;"
        "mov.b64 {hi,lo}, %0;"
        "shf.r.wrap.b32 scr, hi, lo, 1;"
        "shf.r.wrap.b32 lo, lo, hi, 1;"
        "mov.b64 %0, {scr,lo};"
        "}" : "+l"(input));
    return input;
#else
    // Pre-SM32 fallback: plain shift/XOR macro.
    return ROTR64(input, 1);
#endif
}
  269.  
// Three-way XOR (a ^ b ^ c), used by the theta step. Written as two chained
// xor.b64 so the backend compiler can fuse them into a single LOP3 on SM 5.0+.
__device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
    uint64_t output{ 0 };
    asm("{"
        "  xor.b64 %0, %1, %2;"
        "  xor.b64 %0, %0, %3;"
        "}" : "+l"(output) : "l"(a), "l"(b), "l"(c));
    return output;
}
  280.  
// FIXME: Assuming SM 5.x+
// Keccak chi primitive: returns a ^ ((~b) & c). The LOP3 immediate 0xD2 is the
// truth table of exactly that expression (0xF0 ^ (~0xCC & 0xAA) == 0xD2),
// applied per 32-bit half on SM 5.0+. (The {hi,lo} names are reversed relative
// to PTX unpack order, but both halves get identical treatment, so it's harmless.)
__device__ __forceinline__
uint64_t lop3_0xD2(uint64_t a, uint64_t b, uint64_t c)
{   // FIXME/TODO: make SURE that the correct version is running on Maxwell Gen2, Pascal!
#if __CUDA_ARCH__ >= 500
    asm("{"
        "  .reg .b32 ah, al, bh, bl, ch, cl;"
        "  mov.b64 {ah,al}, %0;"
        "  mov.b64 {bh,bl}, %1;"
        "  mov.b64 {ch,cl}, %2;"

        "  lop3.b32 ah, ah, bh, ch, 0xD2;"
        "  lop3.b32 al, al, bl, cl, 0xD2;"
        "  mov.b64 %0, {ah,al};"
        "}" : "+l"(a) : "l"(b), "l"(c));
    return a;
#else
    // Pre-Maxwell fallback: the same chi function in plain C.
    return a ^ ((~b) & c);
#endif
}
  301.  
__device__
bool keccak(uint8_t deviceNum, uint64_t nonce, uint32_t thread) // had 'passing optimizations' and 'passing mid' in 3.4t
{
    // Keccak permutation specialized for mining, 24 rounds total:
    //   - round 0 is largely precomputed: d_mid[deviceNum][] appears to be a
    //     host-precomputed midstate, with the nonce folded in via the rotation
    //     XORs below (NOTE(review): confirm against the host code filling d_mid);
    //   - rounds 1..22 run in the generic loop further down;
    //   - round 23 is truncated to only the lanes that feed state[0].
    // Returns true when the byte-swapped lane 0 is <= this device's target.
    uint64_t state[25], C[5], D[5], scratch;

    //if (thread == 543210)
    //    PermTest (0);

    // Keccak round constants. Only indices 1..22 are read by the loop:
    // RC[0] (== 1) is applied as a literal in the precomputed round 0 below,
    // and RC[23] is applied as a literal in the truncated final round.
    uint64_t RClocal[24] =
    {
        /* Element     (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22)      */
        /* -------     ------------------  ------------------  ------------------ */
        /* 00..02  */  0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
        /* 03..05  */  0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
        /* 06..08  */  0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
        /* 09..11  */  0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
        /* 12..14  */  0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
        /* 15..17  */  0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
        /* 18..20  */  0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
        /* 21..23  */  0x8000000000008080, 0x0000000080000001, 0x8000000080008008
        /* -------     ------------------  ------------------  ------------------ */
    };

    // TODO: consider defining these conditionally, preprocessor
    //if (HB_SELECTIVE_32BIT_XOR)
    //uint2* RCvec = (uint2*)&RClocal;     // vectorized access to RClocal[]
    //uint2* stateVec = (uint2*)&state;    // vectorized access to state[]

    //if (thread == 543210)
    //    PermTest (0);

    // --- Precomputed round 0: fold the nonce into the midstate lanes, then
    //     apply chi (lop3_0xD2) row by row. The rotation amounts per lane are
    //     baked in from the rho/pi schedule.
    C[0] = d_mid[deviceNum][2] ^ ROTR64asm(nonce, 20);
    C[1] = d_mid[deviceNum][4] ^ ROTL64asm(nonce, 14);

    state[0] = lop3_0xD2(d_mid[deviceNum][0], d_mid[deviceNum][1], C[0]);   //^ 0x0000000000000001;
#if HB_SELECTIVE_32BIT_XOR                          // shouldn't it be .y? only XOR low end of RC 0
                                                      //stateVec[0].x = stateVec[0].x ^ 0x00000001;   // because the rest is leading zeroes :)
    // NOTE(review): this branch references stateVec, whose declaration is
    // commented out above -- enabling HB_SELECTIVE_32BIT_XOR will not compile.
    asm("xor.b32 %0, %0, 0x00000001;" : "+r"(stateVec[0].x));
#else
    state[0] = state[0] ^ 0x0000000000000001;     // was RC[0]
#endif

    state[1] = lop3_0xD2(d_mid[deviceNum][1], C[0], d_mid[deviceNum][3]);
    state[2] = lop3_0xD2(C[0], d_mid[deviceNum][3], C[1]);
    state[3] = lop3_0xD2(d_mid[deviceNum][3], C[1], d_mid[deviceNum][0]);
    state[4] = lop3_0xD2(C[1], d_mid[deviceNum][0], d_mid[deviceNum][1]);

    C[0] = d_mid[deviceNum][6] ^ ROTL64asm(nonce, 20);          // nonce*1048576;
    C[1] = d_mid[deviceNum][9] ^ ROTR64(nonce, 2);
    state[5] = lop3_0xD2(d_mid[deviceNum][5], C[0], d_mid[deviceNum][7]);
    state[6] = lop3_0xD2(C[0], d_mid[deviceNum][7], d_mid[deviceNum][8]);
    state[7] = lop3_0xD2(d_mid[deviceNum][7], d_mid[deviceNum][8], C[1]);
    state[8] = lop3_0xD2(d_mid[deviceNum][8], C[1], d_mid[deviceNum][5]);
    state[9] = lop3_0xD2(C[1], d_mid[deviceNum][5], C[0]);

    // experimental rotation replacement
    // NOTE(review): nonce*128 is a SHIFT, not a rotate -- it equals ROTL(nonce,7)
    // only while nonce's top 7 (resp. 8 for scratch*2) bits are zero. Presumably
    // the nonce range guarantees that; confirm before raising the counter range.
    scratch = nonce * 128;
    C[0] = d_mid[deviceNum][11] ^ scratch;                    // ROTL by 7
    C[1] = d_mid[deviceNum][13] ^ scratch * 2;                  // ROTL by 8
    state[10] = lop3_0xD2(d_mid[deviceNum][10], C[0], d_mid[deviceNum][12]);
    state[11] = lop3_0xD2(C[0], d_mid[deviceNum][12], C[1]);
    state[12] = lop3_0xD2(d_mid[deviceNum][12], C[1], d_mid[deviceNum][14]);
    state[13] = lop3_0xD2(C[1], d_mid[deviceNum][14], d_mid[deviceNum][10]);
    state[14] = lop3_0xD2(d_mid[deviceNum][14], d_mid[deviceNum][10], C[0]);

    C[0] = d_mid[deviceNum][15] ^ ROTL64asm(nonce, 27);   //nonce*134217728;
    C[1] = d_mid[deviceNum][18] ^ ROTLby16(nonce);
    state[15] = lop3_0xD2(C[0], d_mid[deviceNum][16], d_mid[deviceNum][17]);
    state[16] = lop3_0xD2(d_mid[deviceNum][16], d_mid[deviceNum][17], C[1]);
    state[17] = lop3_0xD2(d_mid[deviceNum][17], C[1], d_mid[deviceNum][19]);
    state[18] = lop3_0xD2(C[1], d_mid[deviceNum][19], C[0]);
    state[19] = lop3_0xD2(d_mid[deviceNum][19], C[0], d_mid[deviceNum][16]);

    C[0] = d_mid[deviceNum][20] ^ ROTRby1(nonce);
    C[1] = d_mid[deviceNum][21] ^ ROTR64(nonce, 9);      //idea: ROTRby1(ROTRby8(nonce));
    C[2] = d_mid[deviceNum][22] ^ ROTR64(nonce, 25);
    state[20] = lop3_0xD2(C[0], C[1], C[2]);
    state[21] = lop3_0xD2(C[1], C[2], d_mid[deviceNum][23]);
    state[22] = lop3_0xD2(C[2], d_mid[deviceNum][23], d_mid[deviceNum][24]);
    state[23] = lop3_0xD2(d_mid[deviceNum][23], d_mid[deviceNum][24], C[0]);
    state[24] = lop3_0xD2(d_mid[deviceNum][24], C[0], C[1]);


    // --- Rounds 1..22: full theta / rho-pi / chi / iota per iteration.
#if __CUDA_ARCH__ >= 350
#  pragma unroll
#endif
    for (int32_t i{ 1 }; i < 23; ++i)
    {
        // Theta: column parities. (x + 6) % 5 == (x + 1) % 5.
        for (uint32_t x{ 0 }; x < 5; ++x)
        {
            C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
        }

#if __CUDA_ARCH__ >= 350
        // SM 3.5+: keep D and C separate so each update is one xor3/LOP3.
        for (uint32_t x{ 0 }; x < 5; ++x)
        {
            D[x] = ROTL64(C[(x + 2) % 5], 1);
            state[x] = xor3(state[x], D[x], C[x]);
            state[x + 5] = xor3(state[x + 5], D[x], C[x]);
            state[x + 10] = xor3(state[x + 10], D[x], C[x]);
            state[x + 15] = xor3(state[x + 15], D[x], C[x]);
            state[x + 20] = xor3(state[x + 20], D[x], C[x]);
        }
#else
        // Older arches: fold C[x] into D[x] once, then plain XORs.
        for (uint32_t x{ 0 }; x < 5; ++x)
        {
            D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
            state[x] = state[x] ^ D[x];
            state[x + 5] = state[x + 5] ^ D[x];
            state[x + 10] = state[x + 10] ^ D[x];
            state[x + 15] = state[x + 15] ^ D[x];
            state[x + 20] = state[x + 20] ^ D[x];
        }
#endif

        // Rho Pi: lane permutation with per-lane rotations, fully unrolled.
        // Rotation helpers are picked per amount: byte-permutes for multiples
        // of 8, funnel shifts for <32, half-swap variants for >32.
        C[0] = state[1];
        state[1] = ROTR64asm(state[6], 20);
        state[6] = ROTL64asm(state[9], 20);
        state[9] = ROTR64(state[22], 3);
        state[22] = ROTR64asm(state[14], 25);
        state[14] = ROTL64asm(state[20], 18);
        state[20] = ROTR64(state[2], 2);
        state[2] = ROTR64asm(state[12], 21);
        state[12] = ROTL64asm(state[13], 25);
        state[13] = ROTLby8(state[19]);
        state[19] = ROTRby8(state[23]);
        state[23] = ROTR64asm(state[15], 23);

        // FIXME: watch out for thread divergence within warp. But this should be fine.
        //if (optimizations[NUM_HBOPT_ROTLBACK32])
#if __CUDA_ARCH__ >= 320
        state[15] = ROTRfrom32(state[4], 5);        // == ROTL64(state[4], 27); (make toggleable- forced on for now)
#else
        state[15] = ROTL64(state[4], 27);
#endif

        /*if (thread==1236667)
        {
        printf ("st15: %" PRIx64 " \n", state[15]);
        printf ("tstR: %" PRIx64 " \n", ROTRfrom32(state[4], 5) );
        printf ("tstL: %" PRIx64 " \n\n", ROTLfrom32(state[4], 5) );
        }*/

        state[4] = ROTL64asm(state[24], 14);
        state[24] = ROTL64(state[21], 2);
        state[21] = ROTR64(state[8], 9);            // R9
        state[8] = ROTR64asm(state[16], 19);

        // FIXME: watch out for thread divergence. Slowdown here. Figure out why
        //if (optimizations[NUM_HBOPT_ROTLBACK32])    // forced on
        //{
#if __CUDA_ARCH__ >= 320
        state[16] = ROTLfrom32(state[5], 4);        // == ROTR64(state[5], 28)
        state[5] = ROTRfrom32(state[3], 4);         // == ROTL64(state[3], 28)
#else
        state[16] = ROTR64(state[5], 28);
        state[5] = ROTL64(state[3], 28);
#endif
        state[3] = ROTL64asm(state[18], 21);
        state[18] = ROTL64asm(state[17], 15);
        state[17] = ROTL64asm(state[11], 10);
        state[11] = ROTL64(state[7], 6);
        state[7] = ROTL64(state[10], 3);
        state[10] = ROTLby1(C[0]);

        // Chi, one row (5 lanes) at a time: state[x+j] ^= ~state[x+j+1] & state[x+j+2]
        // lop3_0xD2
        for (uint32_t x{ 0 }; x < 25; x += 5)
        {
            C[0] = state[x];
            C[1] = state[x + 1];
            C[2] = state[x + 2];
            C[3] = state[x + 3];
            C[4] = state[x + 4];
            state[x] = lop3_0xD2(C[0], C[1], C[2]);
            state[x + 1] = lop3_0xD2(C[1], C[2], C[3]);
            state[x + 2] = lop3_0xD2(C[2], C[3], C[4]);
            state[x + 3] = lop3_0xD2(C[3], C[4], C[0]);
            state[x + 4] = lop3_0xD2(C[4], C[0], C[1]);
        }

        // Iota
        state[0] = state[0] ^ RClocal[i];
    }

    // --- Truncated final round (23): only the lanes feeding state[0] matter
    //     for the target comparison, so compute just those.
    for (uint32_t x{ 0 }; x < 5; ++x)
    {
        C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
    }

    D[0] = ROTLby1(C[2]);
    D[1] = ROTLby1(C[3]);
    D[2] = ROTLby1(C[4]);

    state[0] = xor3(state[0], D[0], C[0]);
    state[6] = ROTR64asm(xor3(state[6], D[1], C[1]), 20);
    state[12] = ROTR64(xor3(state[12], D[2], C[2]), 21);

    state[0] = lop3_0xD2(state[0], state[6], state[12]) ^ 0x8000000080008008;    // was RC[23];

    // Hash is little-endian in state[0]; byte-swap for the big-endian compare.
    return bswap_64(state[0]) <= d_target[deviceNum];
}
  504.  
  505.  
  506. //
  507. // TODO: Currently only initializes one GPU with a semi-hardcoded index #.
  508. //         Initialize multiple GPUs and put those to work, too! :)
  509. //         `theCudaDeviceIndex` is selected device's index#, with CUDA's enumeration style
  510. // TODO: This function is too big and multifunction: let's break it up into convenient pieces
  511. //         and only run the parts we need to, when we need to.
  512. int cuda_device_init()
  513. {
  514.     string scratchStr = "";
  515.     uint8_t i = 0;
  516.     // Used by experimental Cosmic 3.5a code that matches CUDA device indices to NVML and NVapi ones
  517.     //                                                            (they enumerate devices differently.)
  518.     //char buf[16] = "?";
  519.  
  520.     // TODO: Decide if I want to return error types as integer or cudaSuccess (be consistent.)
  521.     //cudaError_t   cudaResult = cudaSuccess;
  522.  
  523.     device_count = cudaGetDeviceCount(&device_count);
  524.     printf( "cuda_init(): found %d CUDA devices.\n", device_count );
  525.     for (i = 0; i < CUDA_HOWMANY_DEVICES; ++i)
  526.     {
  527.         gCudaDeviceIntensities[i] = 24;    // HAX! (TODO)
  528.  
  529.         printf("Initializing CUDA device with index # %d ...\n", i);
  530.             gCudaDeviceIntensities[i] = 24;
  531.         if (cudaGetDeviceProperties(&cdp_chosenDevice[i], i) == cudaSuccess)
  532.             printf("Successfully got CUDA device # %d! \n", i);
  533.         else
  534.         {
  535.             printf("cuda_init(): Couldn't get CUDA device (# %d ). (Busy? Too high intensity setting?\n", i);
  536.             //TODO: display chosen CUDA device index # in Win32 dialog, too
  537.             //MessageBox(NULL, "Couldn't get CUDA device. (Busy? Too high intensity setting?", "From Function: cuda_init()", MB_ICONEXCLAMATION | MB_OK);
  538.             continue;
  539.         }
  540.  
  541.         // FIXME: Is this better just before cudaGetDeviceProperties() above?
  542.         printf( "Setting device #%d ...\n", i );
  543.         cudaCallCheckError( cudaSetDevice(i) );
  544.  
  545.         if (!cudaDeviceIsInitialized[i])
  546.         {
  547.             printf(" Resetting Device #%d ...\n" );
  548.             if (cudaCallCheckError( cudaDeviceReset() ) != 0)                                        // In 0xBitcoin-Miner, without this reset, CPU usage was too high.
  549.                 continue;    // run for() loop for next device(s)
  550.  
  551.             printf("Setting device %d flags.\n", i);
  552.             cudaCallCheckError( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax) );        // FIXME: ideal selection (see arches)
  553.             printf("Setting device %d cache configuration.\n", i);
  554.             cudaCallCheckError( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) );                                        // FIXME: ideal configuration (see arches)
  555.  
  556.             printf("\nInitialized CUDA device %d : %s", i, cdp_chosenDevice[i].name);
  557.             printf("\nAvailable compute capability: %d.%d \n\n", cdp_chosenDevice[i].major, cdp_chosenDevice[i].minor);
  558.             gpuName[i] = cdp_chosenDevice[i].name;
  559.  
  560.             //TODO: Here, I take the cdp(cuda device prop, chosen device)'s Compute Capability and condense for encoding into Share margin
  561.             //computeCapabilityForMargin = (cdp_chosenDevice.major * 10) + cdp_chosenDevice.minor;  // e.g. major:6,minor:1 becomes 61
  562.             //numberMultiprocessorsForMargin = cdp_chosenDevice.multiProcessorCount;
  563.  
  564.             cudaDeviceIsInitialized[i] = true;            // TODO: this will be an array of devices or similar approach
  565.  
  566.             cudaDeviceComputeCapability[i] = cdp_chosenDevice[i].major * 100 + cdp_chosenDevice[i].minor * 10;
  567.             // REF: from 0xBitcoin-Miner. GHz to Hz.
  568.             // TODO/FIXME: int cudaDeviceProp::clockRate [inherited]
  569.             // Clock frequency in kilohertz (GPU, correct?)
  570.             cudaDeviceClockSpeed[i] = (int64_t)(cdp_chosenDevice[i].memoryClockRate * 1000 * 1000);
  571.             printableHashrateCounter[i] = 0;        // ^^
  572.             printableCounter[i] = 0;                // ^^
  573.             ftime(&tStart[i]);
  574.         }
  575.  
  576.         theCounter = 0;                        // TODO: multiple devices, independent or unified counter?
  577.    
  578.         return 0;                            // no error. 1 is fatal error (see above.)
  579.  
  580.         // ^^ relation ?
  581.         // TODO: we'll get to this
  582.         /*if (new_input)
  583.             new_input = false;*/
  584.  
  585.         // this is for NVML, which can get handles from PCI ID as a string.
  586.         // we need to get the same device that CUDA is using, for NVML/NVapi use.
  587.         // TODO: implement this. Source: Cosmic v3.4(t)
  588.  
  589.         //cudaResult = cudaDeviceGetPCIBusId(buf, 16, cuda_device);
  590.         //if (cudaResult == cudaSuccess)
  591.         //    printf("cudaDeviceGetPCIBusID : %s \n", buf);
  592.         //else
  593.         //    printf("error getting BusID: %s \n", cudaGetErrorString(cudaResult));
  594.  
  595.         //printf ("-- CUDA pciDeviceID: %d \n", device_prop.pciDeviceID);
  596.         //printf ("-- CUDA pciBusID: %d \n\n", device_prop.pciBusID);
  597.  
  598.         //std::string cppStr;
  599.         //cppStr = buf;
  600.  
  601.         //return cppStr;     // return the PCI info to C++ code where it'll get NVML/NVapi handles
  602.                            //return device_prop.pciBusID;
  603.  
  604.         // REF: "cudaDeviceReset must be called before exiting in order for profiling and
  605.         // tracing tools such as Nsight and Visual Profiler to show complete traces."
  606.         // TODO: We want to reset when the GPU stops working
  607.  
  608.         // FIXME: redundant?
  609. }
  610.  
  611. // TODO/FIXME: consider alternate compute capability check(m.arch check). Forced TPB50
  612. /*#if //__CUDA_ARCH__ > 500
  613. #  define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB50, 1)
  614. #else
  615. #  define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB35, 2)
  616. #endif*/
  617.  
  618. // REF: uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
  619. #define TPB50 1024u            // compute_50 and up
  620. #define TPB35 384u            // compute_35 and down
  621.  
  622. __global__ __launch_bounds__(TPB50, 1)            // Enforce maximum threads! (TODO/FIXME)
  623. void mining_kernel( uint8_t deviceNum, uint64_t* possSolution, uint64_t theCounter )
  624. {
  625.     uint64_t thread = blockDim.x * blockIdx.x + threadIdx.x;
  626.     uint64_t nonce = theCounter + thread;
  627.     // Was here: passing optimizations (COSMiC 3.4t)
  628.     //uint64_t passingMid[25];
  629.     //uint8_t i;                        // will be set to 0 next line
  630.  
  631.     // TODO: while(1 && pauseVar|abortVar) faster?
  632.     if ( keccak(deviceNum, nonce, thread) == true )    // write solving device # to *devicenum
  633.     {
  634.         *possSolution = nonce; // write to function parameter
  635.         printf("\n\n* keccak returning true on device #%d with   cnt: %" PRIu64 ", nonce: %" PRIx64 " ! * \n\n", deviceNum, theCounter, nonce);        // no newline needed (debuggery)
  636.         return;
  637.     }
  638. }
  639.  
  640. //
  641. // HANDLE_KERNELS: Host function that launches kernels on the CUDA device(s).
  642. __host__
  643. int handle_kernels ()
  644. {
  645.     //#define NPT 2                // eh?
  646.     //cudaError anyError;            // TODO: init?
  647.     cudaError cudaResult = cudaSuccess;
  648.     uint64_t scratch = 0;
  649.     double     scr_meter = 0;
  650.     double     t = 0;
  651.  
  652.     for (uint8_t i = 0; i < CUDA_HOWMANY_DEVICES - 1; ++i)
  653.     {
  654.         // TODO: do this once per device, not every call
  655.         // TODO: fine-tune this for detected device's capabilities, and intensity per-device
  656.         uint32_t threads = (1u << gCuda_Device0Intensity);    // TODO: multi-GPU, separate intensities, defaults
  657.         int threads_per_block = TPB50;            // FIXME: for specific arches!
  658.         dim3 grid = (threads + threads_per_block - 1) / threads_per_block;
  659.         dim3 block = threads_per_block;
  660.  
  661.         cudaResult = cudaSetDevice(i);
  662.         if (cudaResult != cudaSuccess)
  663.             printf( "** cudaSetDevice(%d) failed: %s \n\n", i, cudaGetErrorString( cudaResult ) );
  664.  
  665.         cudaMemcpy( d_solution[i], h_message[i], 8, cudaMemcpyHostToDevice );   // 8 bytes
  666.  
  667.         mining_kernel <<< grid, block >>> (i, d_solution, theCounter);
  668.         cudaResult = cudaDeviceSynchronize();
  669.         if (cudaResult != cudaSuccess )
  670.             printf( "Kernel launch error (device #%d): %s \n\n", i, cudaGetErrorString(cudaResult) );
  671.         //Sleep(0.35);
  672.  
  673.         cudaMemcpy( h_message[i], d_solution[i], 8, cudaMemcpyDeviceToHost );       // 8 bytes      
  674.  
  675.         // if a solution was found by device `i`
  676.         if (*h_message[i] != UINT64_MAX)
  677.         {
  678.             printf( "\n\n** manage_kernels() for device #%d: Possible Solution Found ** \n\n", i );
  679.  
  680.             printf( "*h_message[%d] after solving is: %" PRIx64 " (expressed as uint64_t, hex): \n", i, *h_message[i] );
  681.             printf( "Copying into solution[%d][len:32] starting at element 12...\n", i );
  682.             memcpy( &solution[i][12], h_message[i], 8 );                           // 8 bytes
  683.  
  684.             printf( "Setting gSolnFound[i] to `true`. (TODO: sol'n queue) ... \n", i );
  685.             //memcpy(gSolution, solution, 32);
  686.             gSolnFound[i] = true;
  687.         }
  688.  
  689.         // this stuff happens whether a solution was found this call or not.
  690.         ftime( &tEnd[i] );
  691.         //cnt = 0;
  692.         theCounter += threads;
  693.         printable_hashrate_cnt[i] += threads;
  694.  
  695.         t = (double)((tEnd[i].time * 1000 + tEnd[i].millitm) - (tStart[i].time * 1000 + tStart[i].millitm)) / 1000;
  696.         cuda_solvetime[i] = t;        // # of ms taken for UI display and processing by statistics functions
  697.         scr_meter = (double)(printable_hashrate_cnt[i] / t / 1000000);
  698.         scratch = (uint64_t)scr_meter;
  699.         //++print_counter[i];
  700.  
  701.         // copy hashrate to global string for HUD display
  702.         gStr_Hashrate[i] = to_string(scratch);
  703.         gStr_HashCount[i] = to_string(printable_hashrate_cnt);
  704.         //}
  705.        
  706.     }
  707.  
  708.     printf( "GPU0 HR: %s, HCount: %s | GPU1 HR: %s, HCount: %s", gStr_Hashrate[0], gStr_HashCount[0], gStr_Hashrate[1], gStr_HashCount[1] );
  709.    
  710.     return -1;        // no solution found by any GPU
  711.  
  712.     // faster? return (*h_message != UINT64_MAX);
  713.     // return ( h_done[0] >= 0 );
  714. }
  715.  
  716. //
  717. // //
  718. void CUDA_Solve()
  719. {
  720.     printf("CUDA_Solve CPU thread starting.\n");
  721.     uint8_t errorCount = 0;
  722.  
  723.     // REF:
  724.     //   updateGPULoop(true);
  725.     //   CUDASolver::bytes_t byte_solution(32);
  726.     //   resetHashCount();
  727.  
  728.     *h_message[0] = UINT64_MAX;
  729.     *h_message[1] = UINT64_MAX;
  730.     gSolving = true;
  731.  
  732.     // TODO: rewrite this loop. Push sol'ns to queue where net thread can handle them.
  733.     //         figure out why ftime(&tStart) to reset the round time causes a delay when a share is
  734.     //         submitted. Write solutions sent out the log for analysis/look for dup shares.
  735.     do
  736.     {
  737.         if (h_message == nullptr || gSolving == false)
  738.         {
  739.             cout << "in do-while loop: gSolving==false or h_message==nullptr, breaking out" << endl;
  740.             break;
  741.         }
  742.  
  743.         if (handle_kernels() == true)
  744.         {
  745.             //if (*h_message != UINT64_MAX)
  746.             //{
  747.             if ( handle_kernels() != 99 )        // 99 = init value
  748.  
  749.  
  750.                 printf("In CUDA_Solve(): *h_message is != UINT64_MAX :) \n");
  751.                 printf("Candidate Solution: %" PRIx64 " \n", *h_message);
  752.  
  753.                 //CUDASolver::pushSolution(bytesToString(byte_solution));
  754.                 //resetHashCount();
  755.                 EnqueueSolution();
  756.  
  757.                 // consider updating the challenge, target now if needed
  758.                 //theCounter = 0;
  759.                 printable_hashrate_cnt = 0;
  760.             }
  761.  
  762.             *h_message[0] = UINT64_MAX;
  763.             *h_message[1] = UINT64_MAX;
  764.  
  765.             ftime(&tStart[i]);                        // store time when we did this
  766.         }
  767.     } while (*h_message == UINT64_MAX) ;
  768.  
  769.     // run cuda_shutdown(), get errorCount
  770.     if (cuda_shutdown() > 0)
  771.         printf("%d Error(s) while shutting down CUDA \n", errorCount);
  772.  
  773.     printf("CUDA_Solve CPU thread (ending).\n");
  774. }
  775.  
  776. //
  777. // //
  778. void StopCudaSolver()
  779. {
  780.     gSolving = false;
  781. }
  782.  
  783. //
  784. // SEND_TO_DEVICE: Builds a midstate from the Init Message, sends mid and target to devices
  785. __host__
  786. void send_to_device(uint64_t target, uint64_t* message) //uint64_t* message)
  787. {
  788.     uint64_t C[4], D[5], mid[25];
  789.     //cout << init_message << endl;
  790.  
  791.     printf("send_to_device: got 'target' (expressed as uint64_t, hex): %" PRIx64 " \n", target);
  792.  
  793.     //cudaSetDevice(numChosenDevice);
  794.  
  795.     printf("send_to_device(): got 'message' (expressed as byte array): \n");
  796.     //for (uint8_t y = 0; y < 10; ++y)
  797.     //    printf(" %" PRIx64 " ", message[y]);
  798.     print_bytes((uint8_t*)message, 84);
  799.     printf("\n\n");
  800.  
  801.     printf("Building the Midstate ...\n");
  802.     C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
  803.     C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
  804.     C[2] = message[2] ^ message[7];
  805.     C[3] = message[4] ^ message[9];
  806.  
  807.     D[0] = ROTL64(C[1], 1) ^ C[3];
  808.     D[1] = ROTL64(C[2], 1) ^ C[0];
  809.     D[2] = ROTL64(message[3], 1) ^ C[1];
  810.     D[3] = ROTL64(C[3], 1) ^ C[2];
  811.     D[4] = ROTL64(C[0], 1) ^ message[3];
  812.  
  813.     mid[0] = message[0] ^ D[0];
  814.     mid[1] = ROTL64(message[6] ^ D[1], 44);
  815.     mid[2] = ROTL64(D[2], 43);
  816.     mid[3] = ROTL64(D[3], 21);
  817.     mid[4] = ROTL64(D[4], 14);
  818.     mid[5] = ROTL64(message[3] ^ D[3], 28);
  819.     mid[6] = ROTL64(message[9] ^ D[4], 20);
  820.     mid[7] = ROTL64(message[10] ^ D[0] ^ 0x100000000ull, 3);
  821.     mid[8] = ROTL64(0x8000000000000000ull ^ D[1], 45);
  822.     mid[9] = ROTL64(D[2], 61);
  823.     mid[10] = ROTL64(message[1] ^ D[1], 1);
  824.     mid[11] = ROTL64(message[7] ^ D[2], 6);
  825.     mid[12] = ROTL64(D[3], 25);
  826.     mid[13] = ROTL64(D[4], 8);
  827.     mid[14] = ROTL64(D[0], 18);
  828.     mid[15] = ROTL64(message[4] ^ D[4], 27);
  829.     mid[16] = ROTL64(message[5] ^ D[0], 36);
  830.     mid[17] = ROTL64(D[1], 10);
  831.     mid[18] = ROTL64(D[2], 15);
  832.     mid[19] = ROTL64(D[3], 56);
  833.     mid[20] = ROTL64(message[2] ^ D[2], 62);
  834.     mid[21] = ROTL64(D[3], 55);
  835.     mid[22] = ROTL64(D[4], 39);
  836.     mid[23] = ROTL64(D[0], 41);
  837.     mid[24] = ROTL64(D[1], 2);
  838.  
  839.     printf( "Done. The generated Midstate: \n" );
  840.     // TODO: remove this debuggery
  841.     uint8_t *myPointer = (uint8_t*)&mid;
  842.     uint8_t mid_bytes[200] = { 0 };
  843.     for (uint8_t n = 0; n < 200; ++n)
  844.     {
  845.         mid_bytes[n] = myPointer[n];
  846.         printf("%02x ", myPointer[n]);
  847.     }
  848.  
  849.     // TODO/FIXME: cudaMAlloc these first? Doesn't seem done in 0xbitcoin-miner.
  850.     //                Check CUDA best practices.
  851.    
  852.     // probably unnecessary
  853.     printf("send_to_device(): setting device # 0 ... ");
  854.     cudaCallCheckError( cudaSetDevice(0) );
  855.  
  856.     printf("\n* Note: We are sending the SAME midstate and using the SAME solution byte array \n");
  857.     printf("          to give GPU0/GPU1 identical work for testing purposes.\n\n");
  858.  
  859.     // 0 is offset: not device #
  860.     printf("send_to_device(): copying midstate to d_mid[0] ... ");
  861.     cudaCallCheckError ( cudaMemcpyToSymbol(d_mid[0], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) );    // gpu 0
  862.     printf("send_to_device(): copying target to d_target[0] ...");
  863.     cudaCallCheckError ( cudaMemcpyToSymbol(d_target[0], &target, sizeof(target), 0, cudaMemcpyHostToDevice) );    // gpu 0
  864.  
  865.     // probably unnecessary
  866.     printf("send_to_device(): setting device # 1 ... ");
  867.     cudaCallCheckError( cudaSetDevice(1) );
  868.  
  869.     printf("send_to_device(): copying midstate to d_mid[1] ... ");
  870.     cudaCallCheckError( cudaMemcpyToSymbol(d_mid[1], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) );    // gpu 1
  871.     printf("send_to_device(): copying target to d_target[1] ... ");
  872.     cudaCallCheckError( cudaMemcpyToSymbol(d_target[1], &target, sizeof(target), 0, cudaMemcpyHostToDevice) );    // gpu 1
  873.     // 0 is offset: not device #
  874.  
  875.     // at this time, the set device is GPU # 1.
  876. }
  877.  
  878. uint8_t cuda_allocate_memory()
  879. {
  880.     // FIXME: use definite byte amounts.
  881.     uint64_t delme_mid[25];
  882.     uint8_t errorCount = 0;
  883.  
  884.     // GPU 0 (TODO: for loop, detect devices).
  885.     printf("cuda_allocate_memory: setting device # 0 ...");
  886.     cudaCallCheckError( cudaSetDevice(0) );
  887.  
  888.     printf("Allocating device memory for d_mid[0] (size %d bytes) ...", (int)(sizeof(delme_mid)));
  889.     cudaCallCheckError ( cudaMalloc( (void**)&d_mid[0], sizeof(delme_mid) ));
  890.  
  891.     printf("Allocating device memory for d_target[0] (size 8 bytes ... ");
  892.     cudaCallCheckError( cudaMalloc( (void**)&d_target[0], 8) );        // 8 bytes, size of uint64_t, gpu0
  893.  
  894.     // GPU 1 (TODO: for loop, detect devices).
  895.     printf("cuda_allocate_memory: setting device # 1 ...");
  896.     cudaCallCheckError( cudaSetDevice(1) );
  897.  
  898.     printf("Allocating device memory for d_mid[1] (size %d bytes) ...", (int)(sizeof(delme_mid)));
  899.     cudaCallCheckError( cudaMalloc( (void**)&d_mid[1], sizeof(delme_mid) ));
  900.  
  901.     printf("Allocating device memory for d_target[1] (size 8 bytes) ... ");
  902.     cudaCallCheckError( cudaMalloc( (void**)&d_target[1], 8) );        // 8 bytes, size of uint64_t, gpu1
  903.  
  904.     printf("Allocating device memory for d_solution[0] (size 8 bytes) ... ");
  905.     cudaCallCheckError( cudaMalloc( (void**)&d_solution[0], 8) );    // 8 bytes
  906.  
  907.     printf("Allocating device memory for d_solution[1] (size 8 bytes) ... ");
  908.     cudaCallCheckError( cudaMalloc( (void**)&d_solution[1], 8) );    // 8 bytes
  909.  
  910.     printf("Allocating host memory for h_message[0] (size 8 bytes) ... ");
  911.     cudaCallCheckError( cudaMallocHost( (void**)&h_message[0], sizeof(uint64_t) ) );                   // was sizeof(uint64_t). 8 bytes*/
  912.  
  913.     printf("Allocating host memory for h_message[1] (size 8 bytes) ... ");
  914.     cudaCallCheckError( cudaMallocHost( (void**)&h_message[1], sizeof(uint64_t) ) );                   // was sizeof(uint64_t). 8 bytes*/
  915.    
  916.     if (errorCount == 0)
  917.     {
  918.         *h_message[0] = UINT64_MAX;
  919.         *h_message[1] = UINT64_MAX;
  920.         return 0;    // errorCount=0
  921.     }
  922.     else
  923.         return errorCount;
  924.  
  925.     /*REF: move this to where we send the device its vars !!
  926.        *h_message = UINT64_MAX;
  927.        cudaMemcpy( d_solution, h_message, 8, cudaMemcpyHostToDevice );   // sizeof(uint64_t), 8 bytes
  928.     */
  929.  
  930.     //uint8_t* d_hash_prefix;        // leftover from 0xbitcoin-miner. purpose?
  931.  
  932.     /*m_address( ADDRESS_LENGTH ),
  933.     m_challenge( UINT256_LENGTH ),
  934.     m_target( 0 ),*/
  935.     //numThreads = (1u << theIntensity);
  936.     //cudaMemcpyToSymbol(threads, &h_threads, sizeof(h_threads), 0, cudaMemcpyHostToDevice);
  937. }
  938.  
  939. __host__
  940. uint8_t FreeDeviceVariables()
  941. {
  942.     cudaError theResult;
  943.     uint8_t errorCount = 0;
  944.  
  945.     theResult = cudaFree(d_solution); //(void*)
  946.     if (theResult != cudaSuccess)
  947.     {
  948.         ++errorCount;
  949.         printf("Error in FreeDeviceVariables (freeing d_solution): %s \n", cudaGetErrorString(theResult));
  950.     }
  951.    
  952.     cout << "Note: Not freeing d_target as it is a __constant__ \n" << endl;
  953.     /*
  954.     theResult = cudaFree(d_target);
  955.     {
  956.         ++errorCount;
  957.         printf("Error in FreeDeviceVariables (freeing d_target): %s \n", cudaGetErrorString(theResult));
  958.     }
  959.     */
  960.  
  961.     cout << "Note: Not freeing d_mid as it is a __constant__ \n" << endl;
  962.     /*
  963.     theResult = cudaFree(d_mid);
  964.     {
  965.         ++errorCount;
  966.         printf("Error in FreeDeviceVariables (freeing d_mid): %s \n", cudaGetErrorString(theResult));
  967.     }
  968.     */
  969.  
  970.     return errorCount;            // 0 if no errors
  971. }
  972.  
  973. int cuda_shutdown()
  974. {
  975.     // cuda shutdown
  976.     cudaError theResult;
  977.     uint8_t errorCount = 0;
  978.  
  979.     printf("\n\n** Shutting down CUDA ...\n\n");
  980.  
  981.     cudaDeviceIsInitialized = false;
  982.  
  983.     // select the device (TODO: multi-GPU)
  984.     theResult = cudaSetDevice(numChosenDevice);
  985.     if (theResult != cudaSuccess)
  986.     {
  987.         printf("Error in cuda_shutdown (selecting device): %s \n", cudaGetErrorString(theResult));
  988.         ++errorCount;
  989.     }
  990.  
  991.     // free memory allocated by CUDA (including host vars)
  992.     theResult = cudaFreeHost(h_message);
  993.     if (theResult != cudaSuccess)
  994.     {
  995.         printf("Error in cuda_shutdown (free h_message): %s \n", cudaGetErrorString(theResult));
  996.         ++errorCount;
  997.     }
  998.    
  999.     errorCount += FreeDeviceVariables();    // Free Device Variables, add # of errors, if any, to errorCount
  1000.  
  1001.     if (theResult != cudaSuccess)
  1002.     {
  1003.         printf("Error in cuda_shutdown (reset device): %s \n", cudaGetErrorString(theResult));
  1004.         ++errorCount;
  1005.     }
  1006.  
  1007.     printf("Finished freeing CUDA variables w/ %d errors.\n\n", errorCount);
  1008.  
  1009.     printable_hashrate_cnt = 0;                // reset counter
  1010.  
  1011.     // reset the device
  1012.     theResult = cudaDeviceReset();
  1013.     if (theResult != cudaSuccess)
  1014.     {
  1015.         printf("Error in cuda_shutdown (reset device): %s \n", cudaGetErrorString(theResult));
  1016.         ++errorCount;
  1017.     }
  1018.  
  1019.     return errorCount;            // 0 if no errors
  1020. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement