Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda.h" // check for dups ...
- #include "cuda_runtime.h"
- #include <iostream>
- #include "device_launch_parameters.h"
- #include <stdint.h> // check for dup in cpp, see if it matters or is necessary
- #include <inttypes.h> // same check for dups
- #include <stdio.h> // ditto
- #include <time.h>
- #include <sys/timeb.h> // TODO: double-check linux compatibility for future port/build, should be fine
- #include "Windows.h"
- #include <string>
#define CUDA_HOWMANY_DEVICES 2   // number of devices actually driven (hard-coded for now)
#define CUDA_MAX_DEVICES 16      // upper bound used to size every per-device array below
using namespace std;
extern bool gSolnFound[CUDA_MAX_DEVICES];       // set when device i finds a solution
extern string gStr_Hashrate[CUDA_MAX_DEVICES];  // per-device hashrate string for HUD display
// TODO: initialization, including nullptr ?
bool gSolving = false; // if set to false, solving stops, then CUDA shutdown
// Per-device device-side solution slot (one uint64_t each, cudaMalloc'd in
// cuda_allocate_memory()). BUG FIX: this was declared as a single pointer
// (`uint64_t* d_solution;`), but every use site indexes it per device
// (d_solution[0], d_solution[1] in allocation, kernel launch and free), so it
// must be an array of per-device pointers.
uint64_t* d_solution[CUDA_MAX_DEVICES] = { nullptr };
uint8_t* d_challenge;    // NOTE(review): unused in this file -- presumably set up elsewhere; confirm
uint8_t* d_hash_prefix;  // NOTE(review): unused in this file -- leftover from 0xbitcoin-miner?
__constant__ uint64_t d_mid[CUDA_MAX_DEVICES][25]; // per-device Keccak midstate (constant memory, set via cudaMemcpyToSymbol)
__constant__ uint64_t d_target[CUDA_MAX_DEVICES];  // per-device difficulty target
__constant__ uint32_t threads[CUDA_MAX_DEVICES];   // NOTE(review): shadowed by a local `threads` in handle_kernels(); never read here
extern uint8_t gSolution[CUDA_MAX_DEVICES][32]; // stores sol'n after a successful solve. (TODO: sol'ns queue)
// see equivalent (actual) declaration in main_includes.h
extern double cuda_solvetime[CUDA_MAX_DEVICES]; // stores time taken this solve
uint64_t printable_hashrate_cnt[CUDA_MAX_DEVICES] = { 0 }; // hashes attempted since last reset (per device)
// function prototypes
extern void print_bytes(uint8_t inArray[], uint8_t len); // defined in C/C++ code compiled by non-CUDA compiler
extern void EnqueueSolution(); // defined in comms_pool.h
int cuda_shutdown(); // later in this file
// 64-bit rotations (XOR of the two shifted halves is equivalent to OR here since the bit sets are disjoint).
// NOTE(review): undefined behavior if y == 0 (shift by 64) -- all current call sites use y in [1,63].
#define ROTL64(x, y) (((x) << (y)) ^ ((x) >> (64 - (y))))
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#define DEF_CUDA_DEVICE_INDEX_DEFAULT 0
#define DEF_CUDA_DEVICE_INTENSITY 24
cudaDeviceProp cdp_chosenDevice[CUDA_MAX_DEVICES]; // TODO: initialize with appropriate null
uint8_t gCudaDeviceIntensities[CUDA_MAX_DEVICES] = { 0 };
int device_count = 0; // number of CUDA devices reported by the runtime (see cuda_device_init)
extern string gpuName[CUDA_MAX_DEVICES];
extern string gStr_HashCount[CUDA_MAX_DEVICES];
int gCuda_Device0Intensity = DEF_CUDA_DEVICE_INTENSITY; // Should be overwritten by real setting (config file, OptionsForm.)
uint64_t cudaDeviceClockSpeed[CUDA_MAX_DEVICES] = { 0 }; // TODO: an array of devices
uint8_t cudaDeviceComputeCapability[CUDA_MAX_DEVICES] = { 0 }; // NOTE(review): major*100+minor*10 (e.g. 610) overflows uint8_t -- see cuda_device_init
bool cudaDeviceIsInitialized[CUDA_MAX_DEVICES] = { 0 }; // true once cuda_device_init has set up device i
uint32_t numThreads[CUDA_MAX_DEVICES] = { 0 };
uint64_t theCounter = 0; // global nonce counter (TODO: multiple devices, independent or unified counter?)
uint64_t printableHashrateCounter[CUDA_MAX_DEVICES] = { 0 }; // ^^
uint64_t printableCounter[CUDA_MAX_DEVICES] = { 0 }; // ^^
uint64_t print_timer[CUDA_MAX_DEVICES] = { 0 }; // TODO: rename these- they're too confusingly similar ;)
extern uint8_t solution[CUDA_MAX_DEVICES][32];
// TODO: ** we need to reset to UINT64_MAX each time we get a sol'n! IMPT Reminder !! **
extern uint64_t* h_message[CUDA_MAX_DEVICES]; // pinned host mirror of d_solution (cudaMallocHost'd in cuda_allocate_memory)
struct timeb tStart[CUDA_MAX_DEVICES], tEnd[CUDA_MAX_DEVICES]; // per-device round timing
// TODO/FIXME: This shouldn't be necessary- work out make options which don't require it
//#include "cuda_funcs.h"
//
// CUDACALLCHECKERROR: Inspects the cudaError returned by a CUDA runtime call.
// Prints the outcome and returns 0 on success or 1 on failure, so callers can
// accumulate the return values into an error counter.
uint8_t cudaCallCheckError(cudaError theResult)
{
	if (theResult != cudaSuccess)
	{
		printf("Error.\nCUDA Result: %s \n", cudaGetErrorString(theResult));
		return 1; // for error counters
	}
	printf("Successful!\n");
	return 0; // no error
}
// BSWAP_64: Reverses the byte order of a 64-bit value (endianness flip) in PTX.
// Each prmt.b32 with selector 0x0123 reverses the 4 bytes of one 32-bit half;
// repacking the halves in swapped order ({ol,oh}) completes the 8-byte swap.
// NOTE(review): the register names are misleading -- `mov.b64 {oh,ol}, %0`
// unpacks the LOW word into `oh` and the HIGH word into `ol` (the first brace
// element is the low half). The net result is still a correct full byte swap.
__device__ __forceinline__
uint64_t bswap_64(uint64_t input)
{
	asm("{"
		"  .reg .u32 oh, ol;"
		"  mov.b64 {oh,ol}, %0;"
		"  prmt.b32 oh, oh, 0, 0x0123;"
		"  prmt.b32 ol, ol, 0, 0x0123;"
		"  mov.b64 %0, {ol,oh};"
		"}" : "+l"(input));
	return input;
}
// ROTL64ASM: 64-bit rotate-left using the sm_32+ funnel-shift instruction
// (shf.l.wrap), falling back to the shift-based ROTL64 macro on older arches.
// The two shf.l.wrap instructions each produce one 32-bit half of the rotated
// result from the concatenated halves of the input.
// NOTE(review): register names `hi`/`lo` are swapped relative to what they hold
// (mov.b64 {hi,lo} puts the LOW word in `hi`); the instruction pairing
// compensates, so the result is a correct rotate. Appears to assume
// 0 < magnitude < 32 -- all call sites in this file comply; TODO confirm.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTL64asm(uint64_t input, uint32_t magnitude)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, mag, scr;"
		"mov.b32 mag, %1;"
		"mov.b64 {hi,lo}, %0;"
		"shf.l.wrap.b32 scr, lo, hi, mag;"
		"shf.l.wrap.b32 lo, hi, lo, mag;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input) : "r"(magnitude));
	return input;
#else
	return ROTL64(input, magnitude);
#endif
}
// ROTR64ASM: 64-bit rotate-right via sm_32+ funnel shift (shf.r.wrap), with a
// shift-based ROTR64 fallback for older architectures. Mirror image of
// ROTL64asm; the same register-naming caveat applies, and it likewise appears
// to assume 0 < magnitude < 32 (all call sites comply; TODO confirm).
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTR64asm(uint64_t input, uint32_t magnitude)
{
	// TODO/FIXME: verify correct version is running on multiple arches
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, mag, scr;"
		"mov.b32 mag, %1;"
		"mov.b64 {hi,lo}, %0;"
		"shf.r.wrap.b32 scr, hi, lo, mag;"
		"shf.r.wrap.b32 lo, lo, hi, mag;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input) : "r"(magnitude));
	return input;
#else
	return ROTR64(input, magnitude);
#endif
}
// XOR5: Five-way 64-bit XOR (the Keccak theta column parity), expressed as four
// chained xor.b64 instructions accumulating b..e into a.
__device__ __forceinline__
uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
{
	// a ^= b; a ^= c; a ^= d; a ^= e;
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(b));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(c));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(d));
	asm(" xor.b64 %0, %0, %1;" : "+l"(a) : "l"(e));
	return a;
}
// ROTLFROM32: Computes ROTL64(input, 32 + magnitude) in one step: unpacking the
// halves in swapped order ({lo,hi}) performs the free rotate-by-32, then the
// funnel-shift pair rotates left by `magnitude`. Used in the keccak() rho step
// where the total rotation exceeds 32 (e.g. rot 36 = 32 + 4).
// shortcut to rotation by 32 (flip halves), then rotate left by `mag`
__device__ __forceinline__
uint64_t ROTLfrom32(uint64_t rtdby32, uint32_t magnitude)
{
	asm("{"
		"  .reg .b32 hi, lo, scr, mag; "
		"  mov.b64 {lo,hi}, %0; "    // halves reversed since rotl'd by 32
		"  mov.b32 mag, %1; "
		"  shf.l.wrap.b32 scr, lo, hi, mag; "
		"  shf.l.wrap.b32 lo, hi, lo, mag; "
		"  mov.b64 %0, {scr,lo}; "
		"}" : "+l"(rtdby32) : "r"(magnitude)); // see if this is faster w/ uint2 .x and .y
	// for saving shf results out
	return rtdby32; // return rotation from the rotation by 32
}
// ROTRFROM32: Computes ROTR64(input, 32 + magnitude): the swapped-half unpack
// is the free rotate-by-32, then the shf.r.wrap pair rotates right by
// `magnitude`. Used in keccak() where a rho rotation exceeds 32
// (e.g. rot right 37 = 32 + 5).
// shortcut to rotation by 32 (flip halves), then rotate right by `mag`
__device__ __forceinline__
uint64_t ROTRfrom32(uint64_t rtdby32, uint32_t magnitude)
{
	asm("{"
		"  .reg .b32 hi, lo, scr, mag; "
		"  mov.b64 {lo,hi}, %0; "    // halves reversed since rotl'd by 32
		"  mov.b32 mag, %1; "
		"  shf.r.wrap.b32 scr, hi, lo, mag; "
		"  shf.r.wrap.b32 lo, lo, hi, mag; "
		"  mov.b64 %0, {scr,lo}; "
		"}" : "+l"(rtdby32) : "r"(magnitude)); // see if this is faster w/ uint2 .x and .y
	// for saving shf results out
	return rtdby32; // return rotation from the rotation by 32
}
// ROTLBY16: 64-bit rotate-left by exactly 16 bits, done with two byte-permute
// (prmt.b32) instructions instead of shifts -- each prmt builds one half of the
// result by picking bytes across the two input words.
__device__ __forceinline__
uint64_t ROTLby16(uint64_t input)
{
	asm("{"
		"  .reg .b32 hi, lo, scr;"
		"  mov.b64 {hi,lo}, %0;"
		"  prmt.b32 scr, hi, lo, 0x5432;"
		"  prmt.b32 lo, hi, lo, 0x1076;"
		"  mov.b64 %0, {lo,scr};"
		" }" : "+l"(input));
	return input;
}
// ROTLBY8: 64-bit rotate-left by exactly 8 bits using two prmt.b32 byte
// permutes (a 1-byte rotation is just a byte shuffle, so no shifts are needed).
// see if this is faster using x and y vectors, no extra regs
__device__ __forceinline__
uint64_t ROTLby8(uint64_t input)
{
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"prmt.b32 scr, hi, lo, 0x2107;"
		"prmt.b32 lo, hi, lo, 0x6543;"
		"mov.b64 %0, {scr,lo};"
		"}"
		: "+l"(input));
	return input;
}
// ROTRBY8: 64-bit rotate-right by exactly 8 bits using two prmt.b32 byte
// permutes (mirror of ROTLby8).
__device__ __forceinline__
uint64_t ROTRby8(uint64_t input)
{
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"prmt.b32 scr, lo, hi, 0x0765;"
		"prmt.b32 lo, lo, hi, 0x4321;"
		"mov.b64 %0, {scr,lo};"
		"}"
		: "+l"(input));
	return input;
}
// ROTLBY1: 64-bit rotate-left by 1, using the funnel-shift pair with the
// magnitude folded in as an immediate (saves the mov of a magnitude register
// versus ROTL64asm). Falls back to the ROTL64 macro below sm_32.
// TODO: Look for a snappier way to do this. Should still be slightly
// faster than the variable-magnitude version above.
__device__ __forceinline__
uint64_t ROTLby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"shf.l.wrap.b32 scr, lo, hi, 1;" // magnitude replaced w/ immediate
		"shf.l.wrap.b32 lo, hi, lo, 1;"  // magnitude replaced w/ immediate
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input));
	return input;
#else
	return ROTL64(input, 1);
#endif
}
// ROTRBY1: 64-bit rotate-right by 1 with an immediate funnel-shift magnitude
// (mirror of ROTLby1). Falls back to the ROTR64 macro below sm_32.
// try doing this with two offsettings of output operand instead
__device__ __forceinline__
uint64_t ROTRby1(uint64_t input)
{
#if __CUDA_ARCH__ >= 320
	asm("{"
		".reg .b32 hi, lo, scr;"
		"mov.b64 {hi,lo}, %0;"
		"shf.r.wrap.b32 scr, hi, lo, 1;"
		"shf.r.wrap.b32 lo, lo, hi, 1;"
		"mov.b64 %0, {scr,lo};"
		"}" : "+l"(input));
	return input;
#else
	return ROTR64(input, 1);
#endif
}
// XOR3: Three-way 64-bit XOR (a ^ b ^ c) via two xor.b64 instructions into a
// scratch output register. Used by the keccak() theta step.
__device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t output{ 0 }; // initialized because the asm uses a "+l" (read-write) constraint
	asm("{"
		"  xor.b64 %0, %1, %2;"
		"  xor.b64 %0, %0, %3;"
		"}" : "+l"(output) : "l"(a), "l"(b), "l"(c));
	return output;
}
// LOP3_0XD2: Computes a ^ ((~b) & c) -- the Keccak chi-step operation -- in a
// single lop3.b32 per 32-bit half on SM 5.0+ (immediate 0xD2 is the truth
// table of that expression), with a plain C fallback for older architectures.
// NOTE(review): here too the {xh,xl} names hold the opposite halves of each
// operand, but since both halves get the identical lop3 the result is correct.
// FIXME: Assuming SM 5.x+
__device__ __forceinline__
uint64_t lop3_0xD2(uint64_t a, uint64_t b, uint64_t c)
{ // FIXME/TODO: make SURE that the correct version is running on Maxwell Gen2, Pascal!
#if __CUDA_ARCH__ >= 500
	asm("{"
		"  .reg .b32 ah, al, bh, bl, ch, cl;"
		"  mov.b64 {ah,al}, %0;"
		"  mov.b64 {bh,bl}, %1;"
		"  mov.b64 {ch,cl}, %2;"
		"  lop3.b32 ah, ah, bh, ch, 0xD2;"
		"  lop3.b32 al, al, bl, cl, 0xD2;"
		"  mov.b64 %0, {ah,al};"
		"}" : "+l"(a) : "l"(b), "l"(c));
	return a;
#else
	return a ^ ((~b) & c);
#endif
}
// KECCAK: Evaluates the midstate-optimized Keccak permutation for one candidate
// nonce and returns true if the (byte-swapped) first state word meets the
// per-device target. Round 0 is fully unrolled against the precomputed
// midstate d_mid[deviceNum] (built host-side in send_to_device()), rounds
// 1..22 run in the generic loop below, and round 23 is truncated to just the
// lanes needed for state[0]. `thread` is only used by the commented-out
// debug probes.
__device__
bool keccak(uint8_t deviceNum, uint64_t nonce, uint32_t thread) // had 'passing optimizations' and 'passing mid' in 3.4t
{
	uint64_t state[25], C[5], D[5], scratch;
	//if (thread == 543210)
	//	PermTest (0);
	// Keccak round constants, kept thread-local. (Elements 1, 4-5, 8, 9-12, 18,
	// 22 fit in 32 bits -- relevant to the HB_SELECTIVE_32BIT_XOR idea below.)
	uint64_t RClocal[24] =
	{
		/* Element (elements which are '32bit': 1, 4-5, 8, 9-12, 18, 22) */
		/* ------- ------------------ ------------------ ------------------ */
		/* 00..02 */ 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
		/* 03..05 */ 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
		/* 06..08 */ 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
		/* 09..11 */ 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
		/* 12..14 */ 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
		/* 15..17 */ 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
		/* 18..20 */ 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
		/* 21..23 */ 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
		/* ------- ------------------ ------------------ ------------------ */
	};
	// TODO: consider defining these conditionally, preprocessor
	//if (HB_SELECTIVE_32BIT_XOR)
	//uint2* RCvec = (uint2*)&RClocal; // vectorized access to RClocal[]
	//uint2* stateVec = (uint2*)&state; // vectorized access to state[]
	//if (thread == 543210)
	//	PermTest (0);
	// ---- Round 0, unrolled: the nonce only touches a few lanes, so each lane
	// ---- is the midstate XOR a rotation of the nonce, then chi via lop3_0xD2.
	C[0] = d_mid[deviceNum][2] ^ ROTR64asm(nonce, 20);
	C[1] = d_mid[deviceNum][4] ^ ROTL64asm(nonce, 14);
	state[0] = lop3_0xD2(d_mid[deviceNum][0], d_mid[deviceNum][1], C[0]); //^ 0x0000000000000001;
#if HB_SELECTIVE_32BIT_XOR // NOTE(review): stateVec is commented out above, so enabling this macro would not compile
	//stateVec[0].x = stateVec[0].x ^ 0x00000001; // because the rest is leading zeroes :)
	asm("xor.b32 %0, %0, 0x00000001;" : "+r"(stateVec[0].x));
#else
	state[0] = state[0] ^ 0x0000000000000001; // was RC[0]
#endif
	state[1] = lop3_0xD2(d_mid[deviceNum][1], C[0], d_mid[deviceNum][3]);
	state[2] = lop3_0xD2(C[0], d_mid[deviceNum][3], C[1]);
	state[3] = lop3_0xD2(d_mid[deviceNum][3], C[1], d_mid[deviceNum][0]);
	state[4] = lop3_0xD2(C[1], d_mid[deviceNum][0], d_mid[deviceNum][1]);
	C[0] = d_mid[deviceNum][6] ^ ROTL64asm(nonce, 20); // nonce*1048576;
	C[1] = d_mid[deviceNum][9] ^ ROTR64(nonce, 2);
	state[5] = lop3_0xD2(d_mid[deviceNum][5], C[0], d_mid[deviceNum][7]);
	state[6] = lop3_0xD2(C[0], d_mid[deviceNum][7], d_mid[deviceNum][8]);
	state[7] = lop3_0xD2(d_mid[deviceNum][7], d_mid[deviceNum][8], C[1]);
	state[8] = lop3_0xD2(d_mid[deviceNum][8], C[1], d_mid[deviceNum][5]);
	state[9] = lop3_0xD2(C[1], d_mid[deviceNum][5], C[0]);
	// experimental rotation replacement
	// NOTE(review): nonce*128 is a left SHIFT, not a true ROTL by 7 -- it drops
	// the top bits of the nonce. Presumably acceptable for the nonce ranges
	// actually searched; TODO confirm against a reference implementation.
	scratch = nonce * 128;
	C[0] = d_mid[deviceNum][11] ^ scratch; // ROTL by 7
	C[1] = d_mid[deviceNum][13] ^ scratch * 2; // ROTL by 8
	state[10] = lop3_0xD2(d_mid[deviceNum][10], C[0], d_mid[deviceNum][12]);
	state[11] = lop3_0xD2(C[0], d_mid[deviceNum][12], C[1]);
	state[12] = lop3_0xD2(d_mid[deviceNum][12], C[1], d_mid[deviceNum][14]);
	state[13] = lop3_0xD2(C[1], d_mid[deviceNum][14], d_mid[deviceNum][10]);
	state[14] = lop3_0xD2(d_mid[deviceNum][14], d_mid[deviceNum][10], C[0]);
	C[0] = d_mid[deviceNum][15] ^ ROTL64asm(nonce, 27); //nonce*134217728;
	C[1] = d_mid[deviceNum][18] ^ ROTLby16(nonce);
	state[15] = lop3_0xD2(C[0], d_mid[deviceNum][16], d_mid[deviceNum][17]);
	state[16] = lop3_0xD2(d_mid[deviceNum][16], d_mid[deviceNum][17], C[1]);
	state[17] = lop3_0xD2(d_mid[deviceNum][17], C[1], d_mid[deviceNum][19]);
	state[18] = lop3_0xD2(C[1], d_mid[deviceNum][19], C[0]);
	state[19] = lop3_0xD2(d_mid[deviceNum][19], C[0], d_mid[deviceNum][16]);
	C[0] = d_mid[deviceNum][20] ^ ROTRby1(nonce);
	C[1] = d_mid[deviceNum][21] ^ ROTR64(nonce, 9); //idea: ROTRby1(ROTRby8(nonce));
	C[2] = d_mid[deviceNum][22] ^ ROTR64(nonce, 25);
	state[20] = lop3_0xD2(C[0], C[1], C[2]);
	state[21] = lop3_0xD2(C[1], C[2], d_mid[deviceNum][23]);
	state[22] = lop3_0xD2(C[2], d_mid[deviceNum][23], d_mid[deviceNum][24]);
	state[23] = lop3_0xD2(d_mid[deviceNum][23], d_mid[deviceNum][24], C[0]);
	state[24] = lop3_0xD2(d_mid[deviceNum][24], C[0], C[1]);
	// ---- Rounds 1..22: full theta / rho-pi / chi / iota per iteration ----
#if __CUDA_ARCH__ >= 350
#	pragma unroll
#endif
	for (int32_t i{ 1 }; i < 23; ++i)
	{
		// Theta: column parities. (x + 6) % 5 is just (x + 1) % 5.
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
		}
#if __CUDA_ARCH__ >= 350
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			D[x] = ROTL64(C[(x + 2) % 5], 1);
			state[x] = xor3(state[x], D[x], C[x]);
			state[x + 5] = xor3(state[x + 5], D[x], C[x]);
			state[x + 10] = xor3(state[x + 10], D[x], C[x]);
			state[x + 15] = xor3(state[x + 15], D[x], C[x]);
			state[x + 20] = xor3(state[x + 20], D[x], C[x]);
		}
#else
		for (uint32_t x{ 0 }; x < 5; ++x)
		{
			D[x] = ROTL64(C[(x + 2) % 5], 1) ^ C[x];
			state[x] = state[x] ^ D[x];
			state[x + 5] = state[x + 5] ^ D[x];
			state[x + 10] = state[x + 10] ^ D[x];
			state[x + 15] = state[x + 15] ^ D[x];
			state[x + 20] = state[x + 20] ^ D[x];
		}
#endif
		// Rho Pi: lane permutation + per-lane rotations, with the rotation
		// primitive chosen per magnitude (byte/halfword permutes, funnel
		// shifts, or the half-swap shortcuts for rotations > 32).
		C[0] = state[1];
		state[1] = ROTR64asm(state[6], 20);
		state[6] = ROTL64asm(state[9], 20);
		state[9] = ROTR64(state[22], 3);
		state[22] = ROTR64asm(state[14], 25);
		state[14] = ROTL64asm(state[20], 18);
		state[20] = ROTR64(state[2], 2);
		state[2] = ROTR64asm(state[12], 21);
		state[12] = ROTL64asm(state[13], 25);
		state[13] = ROTLby8(state[19]);
		state[19] = ROTRby8(state[23]);
		state[23] = ROTR64asm(state[15], 23);
		// FIXME: watch out for thread divergence within warp. But this should be fine.
		//if (optimizations[NUM_HBOPT_ROTLBACK32])
#if __CUDA_ARCH__ >= 320
		state[15] = ROTRfrom32(state[4], 5); // ROTR by 37 == half-swap + ROTR 5 (make toggleable- forced on for now)
#else
		state[15] = ROTL64(state[4], 27);
#endif
		/*if (thread==1236667)
		{
		printf ("st15: %" PRIx64 " \n", state[15]);
		printf ("tstR: %" PRIx64 " \n", ROTRfrom32(state[4], 5) );
		printf ("tstL: %" PRIx64 " \n\n", ROTLfrom32(state[4], 5) );
		}*/
		state[4] = ROTL64asm(state[24], 14);
		state[24] = ROTL64(state[21], 2);
		state[21] = ROTR64(state[8], 9); // R9
		state[8] = ROTR64asm(state[16], 19);
		// FIXME: watch out for thread divergence. Slowdown here. Figure out why
		//if (optimizations[NUM_HBOPT_ROTLBACK32]) // forced on
		//{
#if __CUDA_ARCH__ >= 320
		state[16] = ROTLfrom32(state[5], 4);  // ROTL by 36
		state[5] = ROTRfrom32(state[3], 4);   // ROTR by 36 == ROTL by 28
#else
		state[16] = ROTR64(state[5], 28);
		state[5] = ROTL64(state[3], 28);
#endif
		state[3] = ROTL64asm(state[18], 21);
		state[18] = ROTL64asm(state[17], 15);
		state[17] = ROTL64asm(state[11], 10);
		state[11] = ROTL64(state[7], 6);
		state[7] = ROTL64(state[10], 3);
		state[10] = ROTLby1(C[0]);
		// Chi: state[x] ^= ~state[x+1] & state[x+2] row-wise, via lop3_0xD2
		for (uint32_t x{ 0 }; x < 25; x += 5)
		{
			C[0] = state[x];
			C[1] = state[x + 1];
			C[2] = state[x + 2];
			C[3] = state[x + 3];
			C[4] = state[x + 4];
			state[x] = lop3_0xD2(C[0], C[1], C[2]);
			state[x + 1] = lop3_0xD2(C[1], C[2], C[3]);
			state[x + 2] = lop3_0xD2(C[2], C[3], C[4]);
			state[x + 3] = lop3_0xD2(C[3], C[4], C[0]);
			state[x + 4] = lop3_0xD2(C[4], C[0], C[1]);
		}
		// Iota
		state[0] = state[0] ^ RClocal[i];
	}
	// ---- Round 23, truncated: only the lanes feeding state[0] are computed ----
	for (uint32_t x{ 0 }; x < 5; ++x)
	{
		C[(x + 6) % 5] = xor5(state[x], state[x + 5], state[x + 10], state[x + 15], state[x + 20]);
	}
	D[0] = ROTLby1(C[2]);
	D[1] = ROTLby1(C[3]);
	D[2] = ROTLby1(C[4]);
	state[0] = xor3(state[0], D[0], C[0]);
	state[6] = ROTR64asm(xor3(state[6], D[1], C[1]), 20);
	state[12] = ROTR64(xor3(state[12], D[2], C[2]), 21);
	state[0] = lop3_0xD2(state[0], state[6], state[12]) ^ 0x8000000080008008; // was RC[23];
	// Byte-swap to big-endian before the difficulty comparison.
	return bswap_64(state[0]) <= d_target[deviceNum];
}
//
// CUDA_DEVICE_INIT: Queries the CUDA runtime for devices and initializes the
// first CUDA_HOWMANY_DEVICES of them: device reset, scheduling flags, cache
// config, and the per-device bookkeeping globals. Returns 0 (devices that fail
// are skipped with `continue`; per-call errors are reported by
// cudaCallCheckError).
// TODO: This function is too big and multifunction: let's break it up into
// convenient pieces and only run the parts we need to, when we need to.
int cuda_device_init()
{
	string scratchStr = "";
	uint8_t i = 0;
	// BUG FIX: cudaGetDeviceCount() returns a cudaError_t and writes the count
	// through its pointer argument. The original assigned the returned error
	// code to device_count, clobbering the count it had just retrieved.
	if (cudaGetDeviceCount(&device_count) != cudaSuccess)
		device_count = 0;
	printf( "cuda_init(): found %d CUDA devices.\n", device_count );
	for (i = 0; i < CUDA_HOWMANY_DEVICES; ++i)
	{
		gCudaDeviceIntensities[i] = 24; // HAX! (TODO: read per-device intensity from config)
		printf("Initializing CUDA device with index # %d ...\n", i);
		if (cudaGetDeviceProperties(&cdp_chosenDevice[i], i) == cudaSuccess)
			printf("Successfully got CUDA device # %d! \n", i);
		else
		{
			printf("cuda_init(): Couldn't get CUDA device (# %d ). (Busy? Too high intensity setting?\n", i);
			//TODO: display chosen CUDA device index # in Win32 dialog, too
			continue; // skip this device, try the next one
		}
		// FIXME: Is this better just before cudaGetDeviceProperties() above?
		printf( "Setting device #%d ...\n", i );
		cudaCallCheckError( cudaSetDevice(i) );
		if (!cudaDeviceIsInitialized[i])
		{
			// BUG FIX: the original format string had %d but no argument (UB).
			printf(" Resetting Device #%d ...\n", i );
			if (cudaCallCheckError( cudaDeviceReset() ) != 0) // In 0xBitcoin-Miner, without this reset, CPU usage was too high.
				continue; // run for() loop for next device(s)
			printf("Setting device %d flags.\n", i);
			cudaCallCheckError( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceLmemResizeToMax) ); // FIXME: ideal selection (see arches)
			printf("Setting device %d cache configuration.\n", i);
			cudaCallCheckError( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) ); // FIXME: ideal configuration (see arches)
			printf("\nInitialized CUDA device %d : %s", i, cdp_chosenDevice[i].name);
			printf("\nAvailable compute capability: %d.%d \n\n", cdp_chosenDevice[i].major, cdp_chosenDevice[i].minor);
			gpuName[i] = cdp_chosenDevice[i].name;
			cudaDeviceIsInitialized[i] = true;
			// NOTE(review): stored into a uint8_t, so e.g. major 6 / minor 1
			// yields 610, which truncates -- confirm the intended encoding.
			cudaDeviceComputeCapability[i] = cdp_chosenDevice[i].major * 100 + cdp_chosenDevice[i].minor * 10;
			// cudaDeviceProp::memoryClockRate is reported in kHz.
			cudaDeviceClockSpeed[i] = (int64_t)(cdp_chosenDevice[i].memoryClockRate * 1000 * 1000);
			printableHashrateCounter[i] = 0;
			printableCounter[i] = 0;
			ftime(&tStart[i]); // start of this device's timing window
		}
	}
	// BUG FIX: these two statements were inside the for loop, so the function
	// returned after touching only the first device (and the function's closing
	// brace was lost).
	theCounter = 0; // TODO: multiple devices, independent or unified counter?
	return 0; // no error. 1 is fatal error (see above.)
}
- // TODO/FIXME: consider alternate compute capability check(m.arch check). Forced TPB50
- /*#if //__CUDA_ARCH__ > 500
- # define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB50, 1)
- #else
- # define KERNEL_LAUNCH_PARAMS __global__ __launch_bounds__(TPB35, 2)
- #endif*/
- // REF: uint32_t tpb{ compute_version > 500 ? TPB50 : TPB35 };
- #define TPB50 1024u // compute_50 and up
- #define TPB35 384u // compute_35 and down
// MINING_KERNEL: Each thread tests one candidate nonce (theCounter + its global
// thread id) with keccak(); a winning nonce is published to *possSolution for
// the host-side copy-back in handle_kernels().
__global__ __launch_bounds__(TPB50, 1) // Enforce maximum threads! (TODO/FIXME)
void mining_kernel( uint8_t deviceNum, uint64_t* possSolution, uint64_t theCounter )
{
	const uint64_t globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
	const uint64_t candidate = theCounter + globalIdx;
	// TODO: while(1 && pauseVar|abortVar) faster?
	if ( !keccak(deviceNum, candidate, globalIdx) )
		return;
	// candidate meets the target: publish it for the host
	*possSolution = candidate;
	printf("\n\n* keccak returning true on device #%d with cnt: %" PRIu64 ", nonce: %" PRIx64 " ! * \n\n", deviceNum, theCounter, candidate); // no newline needed (debuggery)
}
//
// HANDLE_KERNELS: Host function that launches one mining pass on every CUDA
// device, copies back any solution, and updates the per-device hashrate
// strings. Returns 1 if at least one device found a solution this pass,
// otherwise -1.
__host__
int handle_kernels ()
{
	cudaError cudaResult = cudaSuccess;
	uint64_t scratch = 0;
	double scr_meter = 0;
	double t = 0;
	int solutionFound = -1; // -1 until some device produces a candidate
	// BUG FIX: the loop ran `i < CUDA_HOWMANY_DEVICES - 1`, skipping the last device.
	for (uint8_t i = 0; i < CUDA_HOWMANY_DEVICES; ++i)
	{
		// TODO: compute once per device, not every call; per-device intensities
		uint32_t threads = (1u << gCuda_Device0Intensity);
		int threads_per_block = TPB50; // FIXME: for specific arches!
		dim3 grid = (threads + threads_per_block - 1) / threads_per_block;
		dim3 block = threads_per_block;
		cudaResult = cudaSetDevice(i);
		if (cudaResult != cudaSuccess)
			printf( "** cudaSetDevice(%d) failed: %s \n\n", i, cudaGetErrorString( cudaResult ) );
		// seed this device's solution slot (UINT64_MAX = "no solution")
		cudaMemcpy( d_solution[i], h_message[i], 8, cudaMemcpyHostToDevice ); // 8 bytes
		// BUG FIX: pass THIS device's solution pointer, not the whole array
		mining_kernel <<< grid, block >>> (i, d_solution[i], theCounter);
		cudaResult = cudaDeviceSynchronize();
		if (cudaResult != cudaSuccess )
			printf( "Kernel launch error (device #%d): %s \n\n", i, cudaGetErrorString(cudaResult) );
		cudaMemcpy( h_message[i], d_solution[i], 8, cudaMemcpyDeviceToHost ); // 8 bytes
		// if a solution was found by device `i`
		if (*h_message[i] != UINT64_MAX)
		{
			printf( "\n\n** manage_kernels() for device #%d: Possible Solution Found ** \n\n", i );
			printf( "*h_message[%d] after solving is: %" PRIx64 " (expressed as uint64_t, hex): \n", i, *h_message[i] );
			printf( "Copying into solution[%d][len:32] starting at element 12...\n", i );
			memcpy( &solution[i][12], h_message[i], 8 ); // 8 bytes
			// BUG FIX: format string previously had no %d for the `i` argument
			printf( "Setting gSolnFound[%d] to `true`. (TODO: sol'n queue) ... \n", i );
			gSolnFound[i] = true;
			solutionFound = 1;
		}
		// bookkeeping runs whether or not a solution was found this call
		ftime( &tEnd[i] );
		theCounter += threads;
		printable_hashrate_cnt[i] += threads;
		t = (double)((tEnd[i].time * 1000 + tEnd[i].millitm) - (tStart[i].time * 1000 + tStart[i].millitm)) / 1000;
		cuda_solvetime[i] = t; // seconds elapsed, for UI display and statistics functions
		scr_meter = (double)(printable_hashrate_cnt[i] / t / 1000000); // MH/s
		scratch = (uint64_t)scr_meter;
		// copy hashrate to global strings for HUD display
		gStr_Hashrate[i] = to_string(scratch);
		// BUG FIX: was to_string(printable_hashrate_cnt) -- the whole array
		gStr_HashCount[i] = to_string(printable_hashrate_cnt[i]);
	}
	// BUG FIX: std::string cannot be passed to printf %s directly; use c_str()
	printf( "GPU0 HR: %s, HCount: %s | GPU1 HR: %s, HCount: %s",
	        gStr_Hashrate[0].c_str(), gStr_HashCount[0].c_str(),
	        gStr_Hashrate[1].c_str(), gStr_HashCount[1].c_str() );
	return solutionFound; // 1 = solution found, -1 = none this pass
}
//
// CUDA_SOLVE: CPU-thread entry point for the solving loop. Repeatedly calls
// handle_kernels() until gSolving is cleared (see StopCudaSolver()), enqueues
// any found solution for the network thread, then shuts CUDA down.
// BUG FIXES vs. original: mismatched braces; `ftime(&tStart[i])` used an
// out-of-scope `i`; `printable_hashrate_cnt = 0` assigned to an array;
// the loop condition compared a pointer (`*h_message`) against UINT64_MAX;
// handle_kernels() was called a second time inside the success branch; and
// errorCount was printed without ever being assigned.
void CUDA_Solve()
{
	printf("CUDA_Solve CPU thread starting.\n");
	uint8_t errorCount = 0;
	// guard: cuda_allocate_memory() must have succeeded first
	if (h_message[0] == nullptr || h_message[1] == nullptr)
	{
		cout << "CUDA_Solve(): h_message not allocated, aborting solve thread" << endl;
		return;
	}
	// reset the "no solution yet" sentinels
	*h_message[0] = UINT64_MAX;
	*h_message[1] = UINT64_MAX;
	gSolving = true;
	// TODO: rewrite this loop. Push sol'ns to queue where net thread can handle them.
	do
	{
		if (gSolving == false)
		{
			cout << "in do-while loop: gSolving==false or h_message==nullptr, breaking out" << endl;
			break;
		}
		if (handle_kernels() == 1) // 1 => at least one device found a solution
		{
			printf("Candidate Solution: %" PRIx64 " \n", *h_message[0]);
			EnqueueSolution();
			// reset per-device state for the next round. The h_message reset to
			// UINT64_MAX is REQUIRED so the next pass doesn't resubmit.
			for (uint8_t d = 0; d < CUDA_HOWMANY_DEVICES; ++d)
			{
				printable_hashrate_cnt[d] = 0;
				*h_message[d] = UINT64_MAX;
				ftime(&tStart[d]); // store time when we did this
			}
		}
	} while (gSolving);
	// run cuda_shutdown(), report its error count
	errorCount = (uint8_t)cuda_shutdown();
	if (errorCount > 0)
		printf("%d Error(s) while shutting down CUDA \n", errorCount);
	printf("CUDA_Solve CPU thread (ending).\n");
}
//
// STOPCUDASOLVER: Signals the CUDA_Solve() loop to stop by clearing the global
// `gSolving` flag; the solve loop exits on its next iteration and then runs
// cuda_shutdown().
// NOTE(review): plain bool write, presumably from another thread -- no
// synchronization; consider std::atomic<bool> if this is called cross-thread.
void StopCudaSolver()
{
	gSolving = false;
}
//
// SEND_TO_DEVICE: Builds a Keccak midstate from the 84-byte init message and
// uploads the midstate and target into the __constant__ arrays of device 0 and
// device 1. (Both devices intentionally receive identical work -- testing
// setup; see the printf notice below.) `message` must point to at least
// 11 uint64_t words (only words 0..10 are read).
__host__
void send_to_device(uint64_t target, uint64_t* message)
{
	uint64_t C[4], D[5], mid[25];
	printf("send_to_device: got 'target' (expressed as uint64_t, hex): %" PRIx64 " \n", target);
	printf("send_to_device(): got 'message' (expressed as byte array): \n");
	print_bytes((uint8_t*)message, 84);
	printf("\n\n");
	printf("Building the Midstate ...\n");
	// Theta-style column parities over the first block; the two constants fold
	// in the padding words (0x…0001 00000000 and the final 0x8000… bit).
	C[0] = message[0] ^ message[5] ^ message[10] ^ 0x100000000ull;
	C[1] = message[1] ^ message[6] ^ 0x8000000000000000ull;
	C[2] = message[2] ^ message[7];
	C[3] = message[4] ^ message[9];
	D[0] = ROTL64(C[1], 1) ^ C[3];
	D[1] = ROTL64(C[2], 1) ^ C[0];
	D[2] = ROTL64(message[3], 1) ^ C[1];
	D[3] = ROTL64(C[3], 1) ^ C[2];
	D[4] = ROTL64(C[0], 1) ^ message[3];
	// Theta + rho applied to the first-round lanes (pre-rotated for keccak()).
	mid[0] = message[0] ^ D[0];
	mid[1] = ROTL64(message[6] ^ D[1], 44);
	mid[2] = ROTL64(D[2], 43);
	mid[3] = ROTL64(D[3], 21);
	mid[4] = ROTL64(D[4], 14);
	mid[5] = ROTL64(message[3] ^ D[3], 28);
	mid[6] = ROTL64(message[9] ^ D[4], 20);
	mid[7] = ROTL64(message[10] ^ D[0] ^ 0x100000000ull, 3);
	mid[8] = ROTL64(0x8000000000000000ull ^ D[1], 45);
	mid[9] = ROTL64(D[2], 61);
	mid[10] = ROTL64(message[1] ^ D[1], 1);
	mid[11] = ROTL64(message[7] ^ D[2], 6);
	mid[12] = ROTL64(D[3], 25);
	mid[13] = ROTL64(D[4], 8);
	mid[14] = ROTL64(D[0], 18);
	mid[15] = ROTL64(message[4] ^ D[4], 27);
	mid[16] = ROTL64(message[5] ^ D[0], 36);
	mid[17] = ROTL64(D[1], 10);
	mid[18] = ROTL64(D[2], 15);
	mid[19] = ROTL64(D[3], 56);
	mid[20] = ROTL64(message[2] ^ D[2], 62);
	mid[21] = ROTL64(D[3], 55);
	mid[22] = ROTL64(D[4], 39);
	mid[23] = ROTL64(D[0], 41);
	mid[24] = ROTL64(D[1], 2);
	printf( "Done. The generated Midstate: \n" );
	// Debug dump of the 200-byte midstate. (TODO: remove this debuggery)
	// BUG FIX: the initializer had been mangled into an HTML entity
	// ("(uint8_t*)∣" -- i.e. "&mid;"); restored to taking the address of mid.
	uint8_t *myPointer = (uint8_t*)&mid;
	uint8_t mid_bytes[200] = { 0 };
	for (uint8_t n = 0; n < 200; ++n)
	{
		mid_bytes[n] = myPointer[n];
		printf("%02x ", myPointer[n]);
	}
	// TODO/FIXME: cudaMAlloc these first? Doesn't seem done in 0xbitcoin-miner.
	// Check CUDA best practices.
	// probably unnecessary
	printf("send_to_device(): setting device # 0 ... ");
	cudaCallCheckError( cudaSetDevice(0) );
	printf("\n* Note: We are sending the SAME midstate and using the SAME solution byte array \n");
	printf("        to give GPU0/GPU1 identical work for testing purposes.\n\n");
	// third-to-last argument (0) is the byte offset into the symbol, not a device #
	printf("send_to_device(): copying midstate to d_mid[0] ... ");
	cudaCallCheckError ( cudaMemcpyToSymbol(d_mid[0], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) ); // gpu 0
	printf("send_to_device(): copying target to d_target[0] ...");
	cudaCallCheckError ( cudaMemcpyToSymbol(d_target[0], &target, sizeof(target), 0, cudaMemcpyHostToDevice) ); // gpu 0
	// probably unnecessary
	printf("send_to_device(): setting device # 1 ... ");
	cudaCallCheckError( cudaSetDevice(1) );
	printf("send_to_device(): copying midstate to d_mid[1] ... ");
	cudaCallCheckError( cudaMemcpyToSymbol(d_mid[1], mid, sizeof(mid), 0, cudaMemcpyHostToDevice) ); // gpu 1
	printf("send_to_device(): copying target to d_target[1] ... ");
	cudaCallCheckError( cudaMemcpyToSymbol(d_target[1], &target, sizeof(target), 0, cudaMemcpyHostToDevice) ); // gpu 1
	// at this time, the set device is GPU # 1.
}
// CUDA_ALLOCATE_MEMORY: Allocates, per device, the 8-byte device-side solution
// slot (d_solution[dev]) and its pinned host mirror (h_message[dev]), and seeds
// the host mirrors with UINT64_MAX ("no solution"). Returns 0 on success or
// the number of failed CUDA calls.
// BUG FIXES vs. original: (1) d_mid / d_target are __constant__ symbols -- they
// live in constant memory already and must NOT be cudaMalloc'd (they are filled
// by cudaMemcpyToSymbol in send_to_device(), and FreeDeviceVariables()
// deliberately does not free them); the invalid cudaMalloc calls on them were
// removed. (2) The per-call error counts returned by cudaCallCheckError() were
// discarded, so errorCount was always 0. (3) d_solution[0] was allocated while
// device 1 was current; allocations are now grouped per device.
uint8_t cuda_allocate_memory()
{
	uint8_t errorCount = 0;
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		printf("cuda_allocate_memory: setting device # %d ...", dev);
		errorCount += cudaCallCheckError( cudaSetDevice(dev) );
		printf("Allocating device memory for d_solution[%d] (size 8 bytes) ... ", dev);
		errorCount += cudaCallCheckError( cudaMalloc( (void**)&d_solution[dev], sizeof(uint64_t) ) ); // 8 bytes
		printf("Allocating host memory for h_message[%d] (size 8 bytes) ... ", dev);
		errorCount += cudaCallCheckError( cudaMallocHost( (void**)&h_message[dev], sizeof(uint64_t) ) ); // pinned, 8 bytes
	}
	if (errorCount == 0)
	{
		// seed the "no solution yet" sentinels
		*h_message[0] = UINT64_MAX;
		*h_message[1] = UINT64_MAX;
		return 0; // errorCount=0
	}
	else
		return errorCount;
	/*REF: move this to where we send the device its vars !!
	*h_message = UINT64_MAX;
	cudaMemcpy( d_solution, h_message, 8, cudaMemcpyHostToDevice ); // sizeof(uint64_t), 8 bytes
	*/
	//uint8_t* d_hash_prefix; // leftover from 0xbitcoin-miner. purpose?
	//numThreads = (1u << theIntensity);
	//cudaMemcpyToSymbol(threads, &h_threads, sizeof(h_threads), 0, cudaMemcpyHostToDevice);
}
// FREEDEVICEVARIABLES: Frees the per-device cudaMalloc'd solution buffers.
// d_mid and d_target are __constant__ symbols and are intentionally not freed.
// Returns the number of errors encountered (0 = clean).
// BUG FIX vs. original: cudaFree was called once on `d_solution` although one
// buffer is allocated per device -- free each d_solution[dev] instead.
__host__
uint8_t FreeDeviceVariables()
{
	cudaError theResult;
	uint8_t errorCount = 0;
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		theResult = cudaFree(d_solution[dev]);
		if (theResult != cudaSuccess)
		{
			++errorCount;
			printf("Error in FreeDeviceVariables (freeing d_solution): %s \n", cudaGetErrorString(theResult));
		}
		d_solution[dev] = nullptr; // avoid dangling pointer / double free
	}
	cout << "Note: Not freeing d_target as it is a __constant__ \n" << endl;
	cout << "Note: Not freeing d_mid as it is a __constant__ \n" << endl;
	return errorCount; // 0 if no errors
}
// CUDA_SHUTDOWN: Frees device and pinned-host buffers, clears the per-device
// bookkeeping, and resets every device (required for profilers such as Nsight
// to emit complete traces). Returns the number of errors (0 = clean).
// BUG FIXES vs. original: `cudaDeviceIsInitialized = false;` and
// `printable_hashrate_cnt = 0;` assigned to whole arrays (ill-formed);
// cudaSetDevice() used an undeclared `numChosenDevice`; cudaFreeHost() was
// passed the h_message array name instead of each pinned buffer; and a stale
// `theResult` was re-checked after FreeDeviceVariables() under a misleading
// "reset device" message.
int cuda_shutdown()
{
	cudaError theResult;
	uint8_t errorCount = 0;
	printf("\n\n** Shutting down CUDA ...\n\n");
	// free the device-side solution buffers; accumulate its error count
	errorCount += FreeDeviceVariables();
	for (int dev = 0; dev < CUDA_HOWMANY_DEVICES; ++dev)
	{
		cudaDeviceIsInitialized[dev] = false;
		printable_hashrate_cnt[dev] = 0; // reset counter
		theResult = cudaSetDevice(dev);
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (selecting device): %s \n", cudaGetErrorString(theResult));
			++errorCount;
			continue; // can't free/reset a device we couldn't select
		}
		// free this device's pinned host buffer
		theResult = cudaFreeHost(h_message[dev]);
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (free h_message): %s \n", cudaGetErrorString(theResult));
			++errorCount;
		}
		h_message[dev] = nullptr; // CUDA_Solve() checks for nullptr
		// reset the device
		theResult = cudaDeviceReset();
		if (theResult != cudaSuccess)
		{
			printf("Error in cuda_shutdown (reset device): %s \n", cudaGetErrorString(theResult));
			++errorCount;
		}
	}
	printf("Finished freeing CUDA variables w/ %d errors.\n\n", errorCount);
	return errorCount; // 0 if no errors
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement