Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /******************************
- doom.cu:
- *******************************/
- extern "C"
- {
- #include "sph/sph_luffa.h"
- #include "miner.h"
- }
- // aus cpu-miner.c
- extern int device_map[8];
- extern void doom_luffa512_cpu_init(int thr_id, int threads);
- extern void doom_luffa512_cpu_setBlock_80(int thr_id, void *pdata, uint32_t *ptarget);
- extern void doom_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *foundNonce, int order);
- inline void doomhash(void *state, const void *input)
- {
- // luffa512
- sph_luffa512_context ctx_luffa;
- uint32_t hash[16];
- sph_luffa512_init(&ctx_luffa);
- sph_luffa512 (&ctx_luffa, input, 80);
- sph_luffa512_close(&ctx_luffa, (void*) hash);
- memcpy(state, hash, 32);
- }
- extern bool opt_benchmark;
- extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
- const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done)
- {
- const uint32_t first_nonce = pdata[19];
- if (opt_benchmark)
- ((uint32_t*)ptarget)[7] = 0x0000ff;
- const uint32_t Htarg = ptarget[7];
- const int throughput = 256*256*8*8;
- static bool init[8] = {0,0,0,0,0,0,0,0};
- if (!init[thr_id])
- {
- cudaSetDevice(device_map[thr_id]);
- // Konstanten kopieren, Speicher belegen
- doom_luffa512_cpu_init(thr_id, throughput);
- init[thr_id] = true;
- }
- uint32_t endiandata[20];
- for (int k=0; k < 20; k++)
- be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
- doom_luffa512_cpu_setBlock_80(thr_id, (void*)endiandata, (uint32_t *)ptarget);
- do {
- int order = 0;
- uint32_t foundNonce;
- // erstes luffa512 Hash mit CUDA
- doom_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], &foundNonce, order++);
- if (foundNonce != 0xffffffff)
- {
- uint32_t vhash64[8];
- be32enc(&endiandata[19], foundNonce);
- doomhash(vhash64, endiandata);
- if( (vhash64[7]<=Htarg) && fulltest(vhash64, ptarget) ) {
- pdata[19] = foundNonce;
- *hashes_done = foundNonce - first_nonce + 1;
- return 1;
- } else {
- applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
- }
- }
- pdata[19] += throughput;
- } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
- *hashes_done = pdata[19] - first_nonce + 1;
- return 0;
- }
- /******************************
- doom_luffa512.cu:
- *******************************/
- // aus heavy.cu
- extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
- typedef unsigned char BitSequence;
- typedef unsigned char uint8_t;
- typedef unsigned int uint32_t;
- typedef unsigned long long uint64_t;
- static uint32_t *d_target[8];
- static uint32_t *d_resultNonce[8];
- __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
- typedef struct {
- uint32_t buffer[8]; /* Buffer to be hashed */
- uint32_t chainv[40]; /* Chaining values */
- } hashState;
- static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
- return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
- }
- // Endian Drehung für 32 Bit Typen
- static __device__ uint32_t cuda_swab32(uint32_t x)
- {
- return __byte_perm(x, 0, 0x0123);
- }
- static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
- {
- return __byte_perm(x, x, 0x0123);
- }
- #define MULT2(a,j)\
- tmp = a[7+(8*j)];\
- a[7+(8*j)] = a[6+(8*j)];\
- a[6+(8*j)] = a[5+(8*j)];\
- a[5+(8*j)] = a[4+(8*j)];\
- a[4+(8*j)] = a[3+(8*j)] ^ tmp;\
- a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
- a[2+(8*j)] = a[1+(8*j)];\
- a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
- a[0+(8*j)] = tmp;
- #define TWEAK(a0,a1,a2,a3,j)\
- a0 = (a0<<(j))|(a0>>(32-j));\
- a1 = (a1<<(j))|(a1>>(32-j));\
- a2 = (a2<<(j))|(a2>>(32-j));\
- a3 = (a3<<(j))|(a3>>(32-j));
- #define STEP(c0,c1)\
- SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
- SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
- MIXWORD(chainv[0],chainv[4]);\
- MIXWORD(chainv[1],chainv[5]);\
- MIXWORD(chainv[2],chainv[6]);\
- MIXWORD(chainv[3],chainv[7]);\
- ADD_CONSTANT(chainv[0],chainv[4],c0,c1);
- #define SUBCRUMB(a0,a1,a2,a3,a4)\
- asm( \
- "mov.b32 %4, %0;\n\t" \
- "or.b32 %0, %0, %1;\n\t" \
- "xor.b32 %2, %2, %3;\n\t" \
- "not.b32 %1, %1;\n\t" \
- "xor.b32 %0, %0, %3;\n\t" \
- "and.b32 %3, %3, %4;\n\t" \
- "xor.b32 %1, %1, %3;\n\t" \
- "xor.b32 %3, %3, %2;\n\t" \
- "and.b32 %2, %2, %0;\n\t" \
- "not.b32 %0, %0;\n\t" \
- "xor.b32 %2, %2, %1;\n\t" \
- "or.b32 %1, %1, %3;\n\t" \
- "xor.b32 %4, %4, %1;\n\t" \
- "xor.b32 %3, %3, %2;\n\t" \
- "and.b32 %2, %2, %1;\n\t" \
- "xor.b32 %1, %1, %0;\n\t" \
- "mov.b32 %0, %4;\n\t" \
- :: "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4))
- #define MIXWORD(a0,a4)\
- a4 ^= a0;\
- a0 = (a0<<2) | (a0>>(30));\
- a0 ^= a4;\
- a4 = (a4<<14) | (a4>>(18));\
- a4 ^= a0;\
- a0 = (a0<<10) | (a0>>(22));\
- a0 ^= a4;\
- a4 = (a4<<1) | (a4>>(31));
- #define ADD_CONSTANT(a0,b0,c0,c1)\
- a0 ^= c0;\
- b0 ^= c1;
- /* initial values of chaining variables */
- __constant__ uint32_t c_IV[40];
- const uint32_t h2_IV[40] = {
- 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
- 0x6e292011,0x90152df4,0xee058139,0xdef610bb,
- 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
- 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581,
- 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05,
- 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7,
- 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67,
- 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce,
- 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
- 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
- __constant__ uint32_t c_doom_CNS[80];
- uint32_t h2_doom_CNS[80] = {
- 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
- 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
- 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
- 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
- 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
- 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
- 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
- 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
- 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
- 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
- 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
- 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
- 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
- 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
- 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
- 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
- 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
- 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
- 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
- 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31};
- /***************************************************/
- __device__ __forceinline__ void rnd512(hashState *state)
- {
- int i,j;
- uint32_t t[40];
- uint32_t chainv[8];
- uint32_t tmp;
- #pragma unroll 8
- for(i=0;i<8;i++) {
- t[i]=0;
- #pragma unroll 5
- for(j=0;j<5;j++) {
- t[i] ^= state->chainv[i+8*j];
- }
- }
- MULT2(t, 0);
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+8*j] ^= t[i];
- }
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- t[i+8*j] = state->chainv[i+8*j];
- }
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- MULT2(state->chainv, j);
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[8*j+i] ^= t[8*((j+1)%5)+i];
- }
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- t[i+8*j] = state->chainv[i+8*j];
- }
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- MULT2(state->chainv, j);
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[8*j+i] ^= t[8*((j+4)%5)+i];
- }
- }
- #pragma unroll 5
- for(j=0;j<5;j++) {
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+8*j] ^= state->buffer[i];
- }
- MULT2(state->buffer, 0);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- chainv[i] = state->chainv[i];
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- STEP(c_doom_CNS[(2*i)],c_doom_CNS[(2*i)+1]);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i] = chainv[i];
- chainv[i] = state->chainv[i+8];
- }
- TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- STEP(c_doom_CNS[(2*i)+16],c_doom_CNS[(2*i)+16+1]);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+8] = chainv[i];
- chainv[i] = state->chainv[i+16];
- }
- TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- STEP(c_doom_CNS[(2*i)+32],c_doom_CNS[(2*i)+32+1]);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+16] = chainv[i];
- chainv[i] = state->chainv[i+24];
- }
- TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- STEP(c_doom_CNS[(2*i)+48],c_doom_CNS[(2*i)+48+1]);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+24] = chainv[i];
- chainv[i] = state->chainv[i+32];
- }
- TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- STEP(c_doom_CNS[(2*i)+64],c_doom_CNS[(2*i)+64+1]);
- }
- #pragma unroll 8
- for(i=0;i<8;i++) {
- state->chainv[i+32] = chainv[i];
- }
- }
- __device__ __forceinline__ void Update512(hashState *state, const BitSequence *data)
- {
- #pragma unroll 8
- for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
- rnd512(state);
- #pragma unroll 8
- for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
- rnd512(state);
- #pragma unroll 4
- for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]);
- }
- /***************************************************/
- __device__ __forceinline__ void finalization512(hashState *state, uint32_t *b)
- {
- int i,j;
- state->buffer[4] = 0x80000000;
- #pragma unroll 3
- for(int i=5;i<8;i++) state->buffer[i] = 0;
- rnd512(state);
- /*---- blank round with m=0 ----*/
- #pragma unroll 8
- for(i=0;i<8;i++) state->buffer[i] =0;
- rnd512(state);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- b[i] = 0;
- #pragma unroll 5
- for(j=0;j<5;j++) {
- b[i] ^= state->chainv[i+8*j];
- }
- b[i] = BYTES_SWAP32((b[i]));
- }
- #pragma unroll 8
- for(i=0;i<8;i++) state->buffer[i]=0;
- rnd512(state);
- #pragma unroll 8
- for(i=0;i<8;i++) {
- b[8+i] = 0;
- #pragma unroll 5
- for(j=0;j<5;j++) {
- b[8+i] ^= state->chainv[i+8*j];
- }
- b[8+i] = BYTES_SWAP32((b[8+i]));
- }
- }
- /***************************************************/
- // Die Hash-Funktion
- __global__ void doom_luffa512_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *d_target, uint32_t *resNonce)
- {
- int thread = (blockDim.x * blockIdx.x + threadIdx.x);
- if (thread < threads)
- {
- uint32_t nounce = startNounce + thread;
- uint32_t hash[16];
- union {
- uint64_t buf64[16];
- uint32_t buf32[32];
- } buff;
- #pragma unroll 16
- for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
- // die Nounce durch die thread-spezifische ersetzen
- buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce));
- hashState state;
- #pragma unroll 40
- for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
- #pragma unroll 8
- for(int i=0;i<8;i++) state.buffer[i] = 0;
- Update512(&state, (BitSequence*)buff.buf32);
- finalization512(&state, (uint32_t*)hash);
- int position = -1;
- bool rc = true;
- #pragma unroll 8
- for (int i = 7; i >= 0; i--) {
- if (hash[i] > d_target[i]) {
- if(position < i) {
- position = i;
- rc = false;
- }
- }
- if (hash[i] < d_target[i]) {
- if(position < i) {
- position = i;
- rc = true;
- }
- }
- }
- if(rc == true)
- if(resNonce[0] > nounce)
- resNonce[0] = nounce;
- }
- }
- // Setup-Funktionen
- __host__ void doom_luffa512_cpu_init(int thr_id, int threads)
- {
- cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
- cudaMalloc(&d_target[thr_id], 8 * sizeof(uint32_t));
- cudaMemcpyToSymbol( c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice );
- cudaMemcpyToSymbol( c_doom_CNS, h2_doom_CNS, sizeof(h2_doom_CNS), 0, cudaMemcpyHostToDevice );
- }
- __host__ void doom_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *resnonce, int order)
- {
- const int threadsperblock = 256;
- // berechne wie viele Thread Blocks wir brauchen
- dim3 grid((threads + threadsperblock-1)/threadsperblock);
- dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
- size_t shared_size = 0;
- cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
- doom_luffa512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_target[thr_id], d_resultNonce[thr_id]);
- MyStreamSynchronize(NULL, order, thr_id);
- cudaMemcpy(resnonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
- }
- __host__ void doom_luffa512_cpu_setBlock_80(int thr_id, void *pdata, uint32_t *ptarget)
- {
- // Message mit Padding bereitstellen
- // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
- unsigned char PaddedMessage[128];
- memcpy(PaddedMessage, pdata, 80);
- memset(PaddedMessage+80, 0, 48);
- PaddedMessage[80] = 0x80;
- PaddedMessage[111] = 1;
- PaddedMessage[126] = 0x02;
- PaddedMessage[127] = 0x80;
- // die Message zur Berechnung auf der GPU
- cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
- cudaMemcpy(d_target[thr_id], ptarget, 8*sizeof(uint32_t), cudaMemcpyHostToDevice);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement