NeoScrypt OpenCL by Wolf0
Posted by ghostlander, Jan 1st, 2016

  1. // NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20
  2. // By Wolf (Wolf0 aka Wolf9466)
  3.  
  4. // Stupid AMD compiler ignores the unroll pragma in the two small round functions (Salsa and ChaCha)
  5.  
  6. // Tahiti 3/2,
  7. // Hawaii 4/4 + notneededswap
  8. // Pitcairn 3/4 + notneededswap
  9. #if defined(__Tahiti__)
  10. #define SALSA_SMALL_UNROLL 4
  11. #define CHACHA_SMALL_UNROLL 2
  12. //#define SWAP 1
  13. //#define SHITMAIN 1
  14. //#define WIDE_STRIPE 1
  15. #elif defined(__Pitcairn__)
  16.  
  17. #define SALSA_SMALL_UNROLL 3
  18. #define CHACHA_SMALL_UNROLL 2
  19. //#define SWAP 1
  20. //#define SHITMAIN 1
  21. //#define WIDE_STRIPE 1
  22.  
  23. #else
  24. #define SALSA_SMALL_UNROLL 4
  25. #define CHACHA_SMALL_UNROLL 4
  26. //#define SWAP 1
  27. //#define SHITMAIN 1
  28. //#define WIDE_STRIPE 1
  29. #endif
  30.  
  31. // If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted
  32. // as the unroll factor; must divide cleanly into ten.
  33. // Usually a bad idea.
  34. //#define SMALL_BLAKE2S
  35. //#define BLAKE2S_UNROLL 5
  36.  
  37. #define BLOCK_SIZE           64U
  38. #define FASTKDF_BUFFER_SIZE 256U
  39. #ifndef PASSWORD_LEN
  40. #define PASSWORD_LEN         80U
  41. #endif
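// Note: PASSWORD_LEN is wrapped in #ifndef, so the host may override it at build time,
// e.g. by adding "-DPASSWORD_LEN=80" to the clBuildProgram options string.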
  42.  
  43. #if !defined(cl_khr_byte_addressable_store)
  44. #error "Device does not support byte-addressable stores"
  45. #endif
  46.  
  47. // Swaps 128 bytes at a time without using temp vars
  48. void SwapBytes128(void *restrict A, void *restrict B, uint len)
  49. {
  50.     #pragma unroll 2
  51.     for(int i = 0; i < (len >> 7); ++i)
  52.     {
  53.         ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
  54.         ((ulong16 *)B)[i] ^= ((ulong16 *)A)[i];
  55.         ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
  56.     }
  57. }
  58.  
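// Copies len 128-byte blocks; len is a block count here, not a byte count
// (the only caller passes 2 to copy the 256-byte X state into Z).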
  59. void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
  60. {
  61.     #pragma unroll 2
  62.     for(int i = 0; i < len; ++i)
  63.         ((ulong16 *)dst)[i] = ((ulong16 *)src)[i];
  64. }
  65.  
  66. void CopyBytes(void *restrict dst, const void *restrict src, uint len)
  67. {
  68.     for(int i = 0; i < len; ++i)
  69.         ((uchar *)dst)[i] = ((uchar *)src)[i];
  70. }
  71.  
  72. void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len)
  73. {
  74.     for(int i = 0; i < len; ++i)
  75.         ((uchar *)dst)[i] ^= ((uchar *)src)[i];
  76. }
  77.  
  78. void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
  79. {
  80.     #pragma unroll 1
  81.     for(int i = 0; i < len; ++i)
  82.         ((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i];
  83. }
  84.  
  85. // Blake2S
  86.  
  87. #define BLAKE2S_BLOCK_SIZE    64U
  88. #define BLAKE2S_OUT_SIZE      32U
  89. #define BLAKE2S_KEY_SIZE      32U
  90.  
  91. static const __constant uint BLAKE2S_IV[8] =
  92. {
  93.     0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
  94.     0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
  95. };
  96.  
  97. static const __constant uchar BLAKE2S_SIGMA[10][16] =
  98. {
  99.     {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  100.     { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
  101.     { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
  102.     {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  103.     {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
  104.     {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
  105.     { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
  106.     { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
  107.     {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
  108.     { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
  109. };
  110.  
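// OpenCL rotate() is a left rotate, so the 16U/20U/24U/25U amounts below implement
// the BLAKE2s right rotations by 16, 12, 8 and 7 bits respectively.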
  111. #define BLAKE_G(idx0, idx1, a, b, c, d, key)    do { \
  112.     a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \
  113.     d = rotate(d ^ a, 16U); \
  114.     c += d; \
  115.     b = rotate(b ^ c, 20U); \
  116.     a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \
  117.     d = rotate(d ^ a, 24U); \
  118.     c += d; \
  119.     b = rotate(b ^ c, 25U); \
  120. } while(0)
  121.  
  122. #define BLAKE_PARALLEL_G1(idx0, a, b, c, d, key)    do { \
  123.     a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][0]], key[BLAKE2S_SIGMA[idx0][2]], key[BLAKE2S_SIGMA[idx0][4]], key[BLAKE2S_SIGMA[idx0][6]]); \
  124.     d = rotate(d ^ a, 16U); \
  125.     c += d; \
  126.     b = rotate(b ^ c, 20U); \
  127.     a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][1]], key[BLAKE2S_SIGMA[idx0][3]], key[BLAKE2S_SIGMA[idx0][5]], key[BLAKE2S_SIGMA[idx0][7]]); \
  128.     d = rotate(d ^ a, 24U); \
  129.     c += d; \
  130.     b = rotate(b ^ c, 25U); \
  131. } while(0)
  132.  
  133. #define BLAKE_PARALLEL_G2(idx0, a, b, c, d, key)    do { \
  134.     a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][8]], key[BLAKE2S_SIGMA[idx0][10]], key[BLAKE2S_SIGMA[idx0][12]], key[BLAKE2S_SIGMA[idx0][14]]); \
  135.     d = rotate(d ^ a, 16U); \
  136.     c += d; \
  137.     b = rotate(b ^ c, 20U); \
  138.     a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][9]], key[BLAKE2S_SIGMA[idx0][11]], key[BLAKE2S_SIGMA[idx0][13]], key[BLAKE2S_SIGMA[idx0][15]]); \
  139.     d = rotate(d ^ a, 24U); \
  140.     c += d; \
  141.     b = rotate(b ^ c, 25U); \
  142. } while(0)
  143.  
  144. void Blake2S(uint *restrict inout, const uint *restrict inkey)
  145. {
  146.     uint16 V;
  147.     uint8 tmpblock;
  148.  
  149.     // Load first block (IV into V.lo) and constants (IV into V.hi)
  150.     V.lo = V.hi = vload8(0U, BLAKE2S_IV);
  151.  
  152.     // XOR in the BLAKE2s parameter block (digest length 32, key length 32, fanout 1, depth 1)
  153.     V.s0 ^= 0x01012020;
  154.  
  155.     // Copy input block for later
  156.     tmpblock = V.lo;
  157.  
  158.     // XOR length of message so far (including this block)
  159.     // There are two uints for this field, but high uint is zero
  160.     V.sc ^= BLAKE2S_BLOCK_SIZE;
  161.  
  162.     // Compress state, using the key as the key
  163.     #ifdef SMALL_BLAKE2S
  164.     #pragma unroll BLAKE2S_UNROLL
  165.     #else
  166.     #pragma unroll
  167.     #endif
  168.     for(int x = 0; x < 10; ++x)
  169.     {
  170.         /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);
  171.         BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);
  172.         BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);
  173.         BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);
  174.         BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);
  175.         BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);
  176.         BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);
  177.         BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);*/
  178.        
  179.         BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey);
  180.         BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey);
  181.     }
  182.  
  183.     // XOR low part of state with the high part,
  184.     // then with the original input block.
  185.     V.lo ^= V.hi ^ tmpblock;
  186.  
  187.     // Load constants (IV into V.hi)
  188.     V.hi = vload8(0U, BLAKE2S_IV);
  189.  
  190.     // Copy input block for later
  191.     tmpblock = V.lo;
  192.  
  193.     // XOR length of message into block again
  194.     V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
  195.  
  196.     // Last block compression - XOR final constant into state
  197.     V.se ^= 0xFFFFFFFFU;
  198.  
  199.     // Compress block, using the input as the key
  200.     #ifdef SMALL_BLAKE2S
  201.     #pragma unroll BLAKE2S_UNROLL
  202.     #else
  203.     #pragma unroll
  204.     #endif
  205.     for(int x = 0; x < 10; ++x)
  206.     {
  207.         /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
  208.         BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
  209.         BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
  210.         BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
  211.         BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
  212.         BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
  213.         BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
  214.         BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);*/
  215.        
  216.         BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout);
  217.         BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout);
  218.     }
  219.  
  220.     // XOR low part of state with high part, then with input block
  221.     V.lo ^= V.hi ^ tmpblock;
  222.  
  223.     // Store result in input/output buffer
  224.     vstore8(V.lo, 0, inout);
  225. }
  226.  
  227. /* FastKDF, a fast buffered key derivation function:
  228.  * FASTKDF_BUFFER_SIZE must be a power of 2;
  229.  * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
  230.  * prf_output_size must be <= prf_key_size; */
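/* A rough sketch of the loop implemented below, as read from this code (not a spec):
 *   A = password repeated to fill FASTKDF_BUFFER_SIZE bytes plus a tail copy;
 *   B = salt filled the same way;
 *   repeat 32 times:
 *     out    = Blake2S(message = A[bufidx .. bufidx+63], key = B[bufidx .. bufidx+31]);
 *     bufidx = sum of the 32 output bytes (a uchar, so it wraps mod 256);
 *     B[bufidx .. bufidx+31] ^= out, keeping the tail copy in sync for wrapped reads;
 *   output = B[bufidx ..] ^ A[0 ..], wrapping around the end of B if necessary. */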
  231. void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
  232. {
  233.  
  234.     /*                    WARNING!
  235.      * This algorithm uses byte-wise addressing for memory blocks.
  236.      * In other words, copying an unaligned memory region with words
  237.      * or bigger entities will significantly slow the algorithm down,
  238.      * and may even corrupt the data if the device does not support
  239.      * such accesses properly.
  240.      * Therefore use byte copying, which is not the fastest approach,
  241.      * but at least gives reliable results. */
  242.  
  243.     // BLOCK_SIZE            64U
  244.     // FASTKDF_BUFFER_SIZE  256U
  245.     // BLAKE2S_BLOCK_SIZE    64U
  246.     // BLAKE2S_KEY_SIZE      32U
  247.     // BLAKE2S_OUT_SIZE      32U
  248.     uchar bufidx = 0;
  249.     uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
  250.     uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
  251.  
  252.     // Initialize the password buffer
  253.     #pragma unroll 1
  254.     for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10];
  255.  
  256.     ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
  257.  
  258.     // Initialize the salt buffer
  259.     if(salt_len == FASTKDF_BUFFER_SIZE)
  260.     {
  261.         ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
  262.         ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
  263.     }
  264.     else
  265.     {
  266.         // salt_len is 80 bytes here
  267.         #pragma unroll 1
  268.         for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10];
  269.  
  270.         // Initialized the rest to zero earlier
  271.         #pragma unroll 1
  272.         for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i];
  273.     }
  274.  
  275.     // The primary iteration
  276.     #pragma unroll 1
  277.     for(int i = 0; i < 32; ++i)
  278.     {
  279.         // Make the key buffer twice the size of the key so it fits a Blake2S block
  280.         // This way, we don't need a temp buffer in the Blake2S function.
  281.         uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 };
  282.  
  283.         // Copy input and key to their buffers
  284.         CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
  285.         CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
  286.  
  287.         // PRF
  288.         //Blake2S((uint *)input, (uint *)key);
  289.        
  290.         uint *inkey = (uint *)key, *inout = (uint *)input;
  291.        
  292.         // PRF
  293.         uint16 V;
  294.         uint8 tmpblock;
  295.  
  296.         // Load first block (IV into V.lo) and constants (IV into V.hi)
  297.         V.lo = V.hi = vload8(0U, BLAKE2S_IV);
  298.  
  299.         // XOR with initial constant
  300.         V.s0 ^= 0x01012020;
  301.  
  302.         // Copy input block for later
  303.         tmpblock = V.lo;
  304.  
  305.         // XOR length of message so far (including this block)
  306.         // There are two uints for this field, but high uint is zero
  307.         V.sc ^= BLAKE2S_BLOCK_SIZE;
  308.  
  309.         // Compress state, using the key as the key
  310.         #pragma unroll
  311.         for(int x = 0; x < 10; ++x)
  312.         {
  313.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey);
  314.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey);
  315.         }
  316.  
  317.         // XOR low part of state with the high part,
  318.         // then with the original input block.
  319.         V.lo ^= V.hi ^ tmpblock;
  320.  
  321.         // Load constants (IV into V.hi)
  322.         V.hi = vload8(0U, BLAKE2S_IV);
  323.  
  324.         // Copy input block for later
  325.         tmpblock = V.lo;
  326.  
  327.         // XOR length of message into block again
  328.         V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
  329.  
  330.         // Last block compression - XOR final constant into state
  331.         V.se ^= 0xFFFFFFFFU;
  332.  
  333.         // Compress block, using the input as the key
  334.         #pragma unroll
  335.         for(int x = 0; x < 10; ++x)
  336.         {  
  337.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout);
  338.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout);
  339.         }
  340.  
  341.         // XOR low part of state with high part, then with input block
  342.         V.lo ^= V.hi ^ tmpblock;
  343.  
  344.         // Store result in input/output buffer
  345.         vstore8(V.lo, 0, inout);
  346.  
  347.        
  348.         // Calculate the next buffer pointer
  349.         bufidx = 0;
  350.  
  351.         for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
  352.             bufidx += input[x];
  353.  
  354.         // bufidx is a uchar, so it wraps mod 256 (== FASTKDF_BUFFER_SIZE); no explicit mask needed
  355.         //bufidx &= (FASTKDF_BUFFER_SIZE - 1);
  356.  
  357.         // Modify the salt buffer
  358.         XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
  359.  
  360.         if(bufidx < BLAKE2S_KEY_SIZE)
  361.         {
  362.             // Head modified, tail updated
  363.             // this was made off the original code... wtf
  364.             //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
  365.             CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
  366.         }
  367.         else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
  368.         {
  369.             // Tail modified, head updated
  370.             CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
  371.         }
  372.     }
  373.  
  374.     // Modify and copy into the output buffer
  375.  
  376.     // Damned compiler crashes
  377.     // Fuck you, AMD
  378.  
  379.     //for(uint i = 0; i < output_len; ++i, ++bufidx)
  380.     //  output[i] = B[bufidx] ^ A[i];
  381.  
  382.     uint left = FASTKDF_BUFFER_SIZE - bufidx;
  383.     //uint left = (~bufidx) + 1
  384.  
  385.     if(left < output_len)
  386.     {
  387.         XORBytes(output, B + bufidx, A, left);
  388.         XORBytes(output + left, B, A + left, output_len - left);
  389.     }
  390.     else
  391.     {
  392.         XORBytes(output, B + bufidx, A, output_len);
  393.     }
  394. }
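// The generic fastkdf() above is kept for reference only; the active kernel calls the
// specialised variants below: fastkdf1() (salt == password, 256-byte output) and
// fastkdf2() (256-byte salt, 32-byte output compared directly against the target).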
  395.  
  396. /* FastKDF, a fast buffered key derivation function:
  397.  * FASTKDF_BUFFER_SIZE must be a power of 2;
  398.  * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
  399.  * prf_output_size must be <= prf_key_size; */
  400. void fastkdf1(const uchar password[80], uchar output[256])
  401. {
  402.  
  403.     /*                    WARNING!
  404.      * This algorithm uses byte-wise addressing for memory blocks.
  405.      * In other words, copying an unaligned memory region with words
  406.      * or bigger entities will significantly slow the algorithm down,
  407.      * and may even corrupt the data if the device does not support
  408.      * such accesses properly.
  409.      * Therefore use byte copying, which is not the fastest approach,
  410.      * but at least gives reliable results. */
  411.  
  412.     // BLOCK_SIZE            64U
  413.     // FASTKDF_BUFFER_SIZE  256U
  414.     // BLAKE2S_BLOCK_SIZE    64U
  415.     // BLAKE2S_KEY_SIZE      32U
  416.     // BLAKE2S_OUT_SIZE      32U
  417.     uchar bufidx = 0;
  418.     uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
  419.     uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
  420.    
  421.     // Initialize the password buffer
  422.     #pragma unroll 1
  423.     for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)A)[i] = ((ulong *)password)[i % 10];
  424.  
  425.     ((uint16 *)(B + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
  426.  
  427.     // The primary iteration
  428.     #pragma unroll 1
  429.     for(int i = 0; i < 32; ++i)
  430.     {
  431.         // Make the key buffer twice the size of the key so it fits a Blake2S block
  432.         // This way, we don't need a temp buffer in the Blake2S function.
  433.         uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 };
  434.        
  435.         // Copy input and key to their buffers
  436.         CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
  437.         CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
  438.        
  439.         uint *inkey = (uint *)key, *inout = (uint *)input;
  440.        
  441.         #ifndef __Hawaii__
  442.        
  443.         // PRF
  444.         uint4 V[4];
  445.         uint8 tmpblock;
  446.        
  447.         tmpblock = vload8(0U, BLAKE2S_IV);
  448.        
  449.         V[0] = V[2] = tmpblock.lo;
  450.         V[1] = V[3] = tmpblock.hi;
  451.        
  452.         V[0].s0 ^= 0x01012020U;
  453.         tmpblock.lo = V[0];
  454.        
  455.         V[3].s0 ^= BLAKE2S_BLOCK_SIZE;
  456.  
  457.         // Compress state, using the key as the key
  458.         #pragma unroll
  459.         for(int x = 0; x < 10; ++x)
  460.         {
  461.             BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey);
  462.             BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey);
  463.         }
  464.        
  465.         V[0] ^= V[2] ^ tmpblock.lo;
  466.         V[1] ^= V[3] ^ tmpblock.hi;
  467.        
  468.         V[2] = vload4(0U, BLAKE2S_IV);
  469.         V[3] = vload4(1U, BLAKE2S_IV);
  470.        
  471.         tmpblock.lo = V[0];
  472.         tmpblock.hi = V[1];
  473.        
  474.         V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1;
  475.         V[3].s2 ^= 0xFFFFFFFFU;
  476.  
  477.         // Compress block, using the input as the key
  478.         #pragma unroll
  479.         for(int x = 0; x < 10; ++x)
  480.         {  
  481.             BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout);
  482.             BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout);
  483.         }
  484.        
  485.         V[0] ^= V[2] ^ tmpblock.lo;
  486.         V[1] ^= V[3] ^ tmpblock.hi;
  487.        
  488.         vstore4(V[0], 0, inout);
  489.         vstore4(V[1], 1, inout);
  490.        
  491.         #else
  492.        
  493.         // PRF
  494.         uint16 V;
  495.         uint8 tmpblock;
  496.  
  497.         // Load first block (IV into V.lo) and constants (IV into V.hi)
  498.         V.lo = V.hi = vload8(0U, BLAKE2S_IV);
  499.  
  500.         // XOR with initial constant
  501.         V.s0 ^= 0x01012020;
  502.  
  503.         // Copy input block for later
  504.         tmpblock = V.lo;
  505.  
  506.         // XOR length of message so far (including this block)
  507.         // There are two uints for this field, but high uint is zero
  508.         V.sc ^= BLAKE2S_BLOCK_SIZE;
  509.  
  510.         // Compress state, using the key as the key
  511.         #pragma unroll
  512.         for(int x = 0; x < 10; ++x)
  513.         {
  514.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey);
  515.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey);
  516.         }
  517.  
  518.         // XOR low part of state with the high part,
  519.         // then with the original input block.
  520.         V.lo ^= V.hi ^ tmpblock;
  521.  
  522.         // Load constants (IV into V.hi)
  523.         V.hi = vload8(0U, BLAKE2S_IV);
  524.  
  525.         // Copy input block for later
  526.         tmpblock = V.lo;
  527.  
  528.         // XOR length of message into block again
  529.         V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
  530.  
  531.         // Last block compression - XOR final constant into state
  532.         V.se ^= 0xFFFFFFFFU;
  533.  
  534.         // Compress block, using the input as the key
  535.         #pragma unroll
  536.         for(int x = 0; x < 10; ++x)
  537.         {  
  538.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout);
  539.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout);
  540.         }
  541.  
  542.         // XOR low part of state with high part, then with input block
  543.         V.lo ^= V.hi ^ tmpblock;
  544.  
  545.         // Store result in input/output buffer
  546.         vstore8(V.lo, 0, inout);
  547.        
  548.         #endif
  549.        
  550.         // Calculate the next buffer pointer
  551.         bufidx = 0;
  552.  
  553.         for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
  554.             bufidx += input[x];
  555.  
  556.         // bufidx is a uchar, so it wraps mod 256 (== FASTKDF_BUFFER_SIZE); no explicit mask needed
  557.         //bufidx &= (FASTKDF_BUFFER_SIZE - 1);
  558.  
  559.         // Modify the salt buffer
  560.         XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
  561.  
  562.         if(bufidx < BLAKE2S_KEY_SIZE)
  563.         {
  564.             // Head modified, tail updated
  565.             // this was made off the original code... wtf
  566.             //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
  567.             CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
  568.         }
  569.         else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
  570.         {
  571.             // Tail modified, head updated
  572.             CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
  573.         }
  574.     }
  575.  
  576.     // Modify and copy into the output buffer
  577.  
  578.     // Damned compiler crashes
  579.     // Fuck you, AMD
  580.  
  581.     //for(uint i = 0; i < output_len; ++i, ++bufidx)
  582.     //  output[i] = B[bufidx] ^ A[i];
  583.  
  584.     uint left = FASTKDF_BUFFER_SIZE - bufidx;
  585.     //uint left = (~bufidx) + 1
  586.  
  587.     if(left < 256)
  588.     {
  589.         XORBytes(output, B + bufidx, A, left);
  590.         XORBytes(output + left, B, A + left, 256 - left);
  591.     }
  592.     else
  593.     {
  594.         XORBytes(output, B + bufidx, A, 256);
  595.     }
  596. }
  597.  
  598. /* FastKDF, a fast buffered key derivation function:
  599.  * FASTKDF_BUFFER_SIZE must be a power of 2;
  600.  * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
  601.  * prf_output_size must be <= prf_key_size; */
  602. void fastkdf2(const uchar password[80], const uchar salt[256],  __global uint* restrict output, const uint target)
  603. {
  604.  
  605.     /*                    WARNING!
  606.      * This algorithm uses byte-wise addressing for memory blocks.
  607.      * In other words, copying an unaligned memory region with words
  608.      * or bigger entities will significantly slow the algorithm down,
  609.      * and may even corrupt the data if the device does not support
  610.      * such accesses properly.
  611.      * Therefore use byte copying, which is not the fastest approach,
  612.      * but at least gives reliable results. */
  613.  
  614.     // BLOCK_SIZE            64U
  615.     // FASTKDF_BUFFER_SIZE  256U
  616.     // BLAKE2S_BLOCK_SIZE    64U
  617.     // BLAKE2S_KEY_SIZE      32U
  618.     // BLAKE2S_OUT_SIZE      32U
  619.     // salt_len == 256, output_len == 32
  620.     uchar bufidx = 0;
  621.     uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
  622.     uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
  623.     //uchar A[256], B[256];
  624.    
  625.     // Initialize the password buffer
  626.     #pragma unroll 1
  627.     for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10];
  628.  
  629.     ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
  630.  
  631.     // Initialize the salt buffer
  632.     ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
  633.     ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
  634.  
  635.     // The primary iteration
  636.     #pragma unroll 1
  637.     for(int i = 0; i < 32; ++i)
  638.     {
  639.         // Make the key buffer twice the size of the key so it fits a Blake2S block
  640.         // This way, we don't need a temp buffer in the Blake2S function.
  641.         uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 };
  642.        
  643.         // Copy input and key to their buffers
  644.         CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
  645.         CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
  646.        
  647.         uint *inkey = (uint *)key, *inout = (uint *)input;
  648.        
  649.         #ifndef __Hawaii__
  650.        
  651.         // PRF
  652.         uint4 V[4];
  653.         uint8 tmpblock;
  654.        
  655.         tmpblock = vload8(0U, BLAKE2S_IV);
  656.        
  657.         V[0] = V[2] = tmpblock.lo;
  658.         V[1] = V[3] = tmpblock.hi;
  659.        
  660.         V[0].s0 ^= 0x01012020U;
  661.         tmpblock.lo = V[0];
  662.        
  663.         V[3].s0 ^= BLAKE2S_BLOCK_SIZE;
  664.  
  665.         // Compress state, using the key as the key
  666.         #pragma unroll
  667.         for(int x = 0; x < 10; ++x)
  668.         {
  669.             BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey);
  670.             BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey);
  671.         }
  672.        
  673.         V[0] ^= V[2] ^ tmpblock.lo;
  674.         V[1] ^= V[3] ^ tmpblock.hi;
  675.        
  676.         V[2] = vload4(0U, BLAKE2S_IV);
  677.         V[3] = vload4(1U, BLAKE2S_IV);
  678.        
  679.         tmpblock.lo = V[0];
  680.         tmpblock.hi = V[1];
  681.        
  682.         V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1;
  683.         V[3].s2 ^= 0xFFFFFFFFU;
  684.  
  685.         // Compress block, using the input as the key
  686.         #pragma unroll
  687.         for(int x = 0; x < 10; ++x)
  688.         {  
  689.             BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout);
  690.             BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout);
  691.         }
  692.        
  693.         V[0] ^= V[2] ^ tmpblock.lo;
  694.         V[1] ^= V[3] ^ tmpblock.hi;
  695.        
  696.         vstore4(V[0], 0, inout);
  697.         vstore4(V[1], 1, inout);
  698.        
  699.         #else
  700.        
  701.         // PRF
  702.         uint16 V;
  703.         uint8 tmpblock;
  704.  
  705.         // Load first block (IV into V.lo) and constants (IV into V.hi)
  706.         V.lo = V.hi = vload8(0U, BLAKE2S_IV);
  707.  
  708.         // XOR with initial constant
  709.         V.s0 ^= 0x01012020;
  710.  
  711.         // Copy input block for later
  712.         tmpblock = V.lo;
  713.  
  714.         // XOR length of message so far (including this block)
  715.         // There are two uints for this field, but high uint is zero
  716.         V.sc ^= BLAKE2S_BLOCK_SIZE;
  717.  
  718.         // Compress state, using the key as the key
  719.         #pragma unroll
  720.         for(int x = 0; x < 10; ++x)
  721.         {
  722.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey);
  723.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey);
  724.         }
  725.  
  726.         // XOR low part of state with the high part,
  727.         // then with the original input block.
  728.         V.lo ^= V.hi ^ tmpblock;
  729.  
  730.         // Load constants (IV into V.hi)
  731.         V.hi = vload8(0U, BLAKE2S_IV);
  732.  
  733.         // Copy input block for later
  734.         tmpblock = V.lo;
  735.  
  736.         // XOR length of message into block again
  737.         V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
  738.  
  739.         // Last block compression - XOR final constant into state
  740.         V.se ^= 0xFFFFFFFFU;
  741.  
  742.         // Compress block, using the input as the key
  743.         #pragma unroll
  744.         for(int x = 0; x < 10; ++x)
  745.         {  
  746.             BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout);
  747.             BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout);
  748.         }
  749.  
  750.         // XOR low part of state with high part, then with input block
  751.         V.lo ^= V.hi ^ tmpblock;
  752.  
  753.         // Store result in input/output buffer
  754.         vstore8(V.lo, 0, inout);
  755.         #endif
  756.        
  757.         // Calculate the next buffer pointer
  758.         bufidx = 0;
  759.  
  760.         for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
  761.             bufidx += input[x];
  762.  
  763.         // bufidx is a uchar, so it wraps mod 256 (== FASTKDF_BUFFER_SIZE); no explicit mask needed
  764.         //bufidx &= (FASTKDF_BUFFER_SIZE - 1);
  765.  
  766.         // Modify the salt buffer
  767.         XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
  768.  
  769.         if(bufidx < BLAKE2S_KEY_SIZE)
  770.         {
  771.             // Head modified, tail updated
  772.             // this was made off the original code... wtf
  773.             //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
  774.             CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
  775.         }
  776.         else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
  777.         {
  778.             // Tail modified, head updated
  779.             CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
  780.         }      
  781.     }
  782.  
  783.     // Modify and copy into the output buffer
  784.  
  785.     // Damned compiler crashes
  786.     // Fuck you, AMD
  787.    
  788.     uchar outbuf[32];
  789.    
  790.     for(uint i = 0; i < 32; ++i, ++bufidx)
  791.         outbuf[i] = B[bufidx] ^ A[i];
  792.  
  793.     /*uint left = FASTKDF_BUFFER_SIZE - bufidx;
  794.     //uint left = (~bufidx) + 1
  795.     uchar outbuf[32];
  796.  
  797.     if(left < 32)
  798.     {
  799.         XORBytes(outbuf, B + bufidx, A, left);
  800.         XORBytes(outbuf + left, B, A + left, 32 - left);
  801.     }
  802.     else
  803.     {
  804.         XORBytes(outbuf, B + bufidx, A, 32);
  805.     }*/
  806.    
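    // Accept the result if the last uint of the 32-byte output meets the target;
    // output[0xFF] serves as an atomic result counter, and the winning nonce
    // (this work-item's global id) is appended to the next free slot.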
  807.     if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
  808.  
  809. }
  810.  
  811. /*
  812.  s0 s1 s2 s3
  813.  s4 s5 s6 s7
  814.  s8 s9 sa sb
  815.  sc sd se sf
  816. shittify:
  817. s0=s4
  818. s1=s9
  819. s2=se
  820. s3=s3
  821. s4=s8
  822. s5=sd
  823. s6=s2
  824. s7=s7
  825. s8=sc
  826. s9=s1
  827. sa=s6
  828. sb=sb
  829. sc=s0
  830. sd=s5
  831. se=sa
  832. sf=sf
  833. unshittify:
  834. s0=sc
  835. s1=s9
  836. s2=s6
  837. s3=s3
  838. s4=s0
  839. s5=sd
  840. s6=sa
  841. s7=s7
  842. s8=s4
  843. s9=s1
  844. sa=se
  845. sb=sb
  846. sc=s8
  847. sd=s5
  848. se=s2
  849. sf=sf
  850.  
  851. */
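// The two index maps above correspond to the SALSA_PERM and SALSA_INV_PERM shuffle
// masks defined further down; the SHITMAIN path applies them with shuffle() in the kernel.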
  852.  
  853. #define SALSA_CORE(state)       do { \
  854.     state[0] ^= rotate(state[3] + state[2], 7U); \
  855.     state[1] ^= rotate(state[0] + state[3], 9U); \
  856.     state[2] ^= rotate(state[1] + state[0], 13U); \
  857.     state[3] ^= rotate(state[2] + state[1], 18U); \
  858.     state[2] ^= rotate(state[3].wxyz + state[0].zwxy, 7U); \
  859.     state[1] ^= rotate(state[2].wxyz + state[3].zwxy, 9U); \
  860.     state[0] ^= rotate(state[1].wxyz + state[2].zwxy, 13U); \
  861.     state[3] ^= rotate(state[0].wxyz + state[1].zwxy, 18U); \
  862. } while(0)
  863.  
  864. #define SALSA_CORE_SCALAR(state)    do { \
  865.     state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \
  866.     state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \
  867.     state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \
  868.     state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \
  869.     state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \
  870.     state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \
  871.     state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \
  872.     state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \
  873. } while(0)
  874.  
  875. uint16 salsa_small_parallel_rnd(uint16 X)
  876. {
  877. #ifndef SHITMAIN
  878.     uint4 st[4] = { (uint4)(X.s4, X.s9, X.se, X.s3),
  879.                     (uint4)(X.s8, X.sd, X.s2, X.s7),
  880.                     (uint4)(X.sc, X.s1, X.s6, X.sb),
  881.                     (uint4)(X.s0, X.s5, X.sa, X.sf)  };  
  882. #else
  883.     uint4 st[4];
  884.     ((uint16 *)st)[0] = X;
  885. #endif
  886.    
  887.     #if SALSA_SMALL_UNROLL == 1
  888.  
  889.     for(int i = 0; i < 10; ++i)
  890.     {
  891.         SALSA_CORE(st);
  892.     }
  893.  
  894.     #elif SALSA_SMALL_UNROLL == 2
  895.  
  896.     for(int i = 0; i < 5; ++i)
  897.     {
  898.         SALSA_CORE(st);
  899.         SALSA_CORE(st);
  900.     }
  901.  
  902.     #elif SALSA_SMALL_UNROLL == 3
  903.  
  904.     for(int i = 0; i < 4; ++i)
  905.     {
  906.         SALSA_CORE(st);
  907.         if(i == 3) break;
  908.         SALSA_CORE(st);
  909.         SALSA_CORE(st);
  910.     }
  911.  
  912.     #elif SALSA_SMALL_UNROLL == 4
  913.  
  914.     for(int i = 0; i < 3; ++i)
  915.     {
  916.         SALSA_CORE(st);
  917.         SALSA_CORE(st);
  918.         if(i == 2) break;
  919.         SALSA_CORE(st);
  920.         SALSA_CORE(st);
  921.     }
  922.  
  923.     #elif SALSA_SMALL_UNROLL == 5
  924.  
  925.     for(int i = 0; i < 2; ++i)
  926.     {
  927.         SALSA_CORE(st);
  928.         SALSA_CORE(st);
  929.         SALSA_CORE(st);
  930.         SALSA_CORE(st);
  931.         SALSA_CORE(st);
  932.     }
  933.  
  934.     #else
  935.         SALSA_CORE(st);
  936.         SALSA_CORE(st);
  937.         SALSA_CORE(st);
  938.         SALSA_CORE(st);
  939.         SALSA_CORE(st);
  940.         SALSA_CORE(st);
  941.         SALSA_CORE(st);
  942.         SALSA_CORE(st);
  943.         SALSA_CORE(st);
  944.         SALSA_CORE(st);
  945.  
  946.     #endif
  947.  
  948. #ifndef SHITMAIN
  949.     return(X + (uint16)(
  950.         st[3].x, st[2].y, st[1].z, st[0].w,
  951.         st[0].x, st[3].y, st[2].z, st[1].w,
  952.         st[1].x, st[0].y, st[3].z, st[2].w,
  953.         st[2].x, st[1].y, st[0].z, st[3].w));
  954. #else
  955.     return(X + ((uint16 *)st)[0]);
  956. #endif
  957. }
  958.  
  959. uint16 salsa_small_scalar_rnd(uint16 X)
  960. {
  961.     uint16 st = X;
  962.    
  963.     #if SALSA_SMALL_UNROLL == 1
  964.    
  965.     for(int i = 0; i < 10; ++i)
  966.     {
  967.         SALSA_CORE_SCALAR(st);
  968.     }
  969.    
  970.     #elif SALSA_SMALL_UNROLL == 2
  971.    
  972.     for(int i = 0; i < 5; ++i)
  973.     {
  974.         SALSA_CORE_SCALAR(st);
  975.         SALSA_CORE_SCALAR(st);
  976.     }
  977.    
  978.     #elif SALSA_SMALL_UNROLL == 3
  979.    
  980.     for(int i = 0; i < 4; ++i)
  981.     {
  982.         SALSA_CORE_SCALAR(st);
  983.         if(i == 3) break;
  984.         SALSA_CORE_SCALAR(st);
  985.         SALSA_CORE_SCALAR(st);
  986.     }
  987.    
  988.     #elif SALSA_SMALL_UNROLL == 4
  989.    
  990.     for(int i = 0; i < 3; ++i)
  991.     {
  992.         SALSA_CORE_SCALAR(st);
  993.         SALSA_CORE_SCALAR(st);
  994.         if(i == 2) break;
  995.         SALSA_CORE_SCALAR(st);
  996.         SALSA_CORE_SCALAR(st);
  997.     }
  998.    
  999.     #else
  1000.    
  1001.     for(int i = 0; i < 2; ++i)
  1002.     {
  1003.         SALSA_CORE_SCALAR(st);
  1004.         SALSA_CORE_SCALAR(st);
  1005.         SALSA_CORE_SCALAR(st);
  1006.         SALSA_CORE_SCALAR(st);
  1007.         SALSA_CORE_SCALAR(st);
  1008.     }
  1009.    
  1010.     #endif
  1011.    
  1012.     return(X + st);
  1013. }
  1014.  
  1015.  
  1016. #define CHACHA_CORE_PARALLEL(state) do { \
  1017.     state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 16U); \
  1018.     state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 12U); \
  1019.     state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 8U); \
  1020.     state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 7U); \
  1021.     \
  1022.     state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 16U); \
  1023.     state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 12U); \
  1024.     state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 8U); \
  1025.     state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 7U); \
  1026. } while(0)
  1027.  
  1028. #define CHACHA_CORE(state)  do { \
  1029.     state.s0 += state.s4; state.sc = as_uint(as_ushort2(state.sc ^ state.s0).s10); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 12U); state.s0 += state.s4; state.sc = rotate(state.sc ^ state.s0, 8U); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 7U); \
  1030.     state.s1 += state.s5; state.sd = as_uint(as_ushort2(state.sd ^ state.s1).s10); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 12U); state.s1 += state.s5; state.sd = rotate(state.sd ^ state.s1, 8U); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 7U); \
  1031.     state.s2 += state.s6; state.se = as_uint(as_ushort2(state.se ^ state.s2).s10); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 12U); state.s2 += state.s6; state.se = rotate(state.se ^ state.s2, 8U); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 7U); \
  1032.     state.s3 += state.s7; state.sf = as_uint(as_ushort2(state.sf ^ state.s3).s10); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 12U); state.s3 += state.s7; state.sf = rotate(state.sf ^ state.s3, 8U); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 7U); \
  1033.     state.s0 += state.s5; state.sf = as_uint(as_ushort2(state.sf ^ state.s0).s10); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 12U); state.s0 += state.s5; state.sf = rotate(state.sf ^ state.s0, 8U); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 7U); \
  1034.     state.s1 += state.s6; state.sc = as_uint(as_ushort2(state.sc ^ state.s1).s10); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 12U); state.s1 += state.s6; state.sc = rotate(state.sc ^ state.s1, 8U); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 7U); \
  1035.     state.s2 += state.s7; state.sd = as_uint(as_ushort2(state.sd ^ state.s2).s10); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 12U); state.s2 += state.s7; state.sd = rotate(state.sd ^ state.s2, 8U); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 7U); \
  1036.     state.s3 += state.s4; state.se = as_uint(as_ushort2(state.se ^ state.s3).s10); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 12U); state.s3 += state.s4; state.se = rotate(state.se ^ state.s3, 8U); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 7U); \
  1037. } while(0)
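// In CHACHA_CORE, as_uint(as_ushort2(x).s10) swaps the 16-bit halves of x, which is
// equivalent to rotate(x, 16U); presumably written this way to coax a cheaper
// instruction out of the AMD compiler. The remaining rotations use rotate() directly.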
  1038.  
  1039. uint16 chacha_small_parallel_rnd(uint16 X)
  1040. {
  1041.     uint4 st[4];
  1042.  
  1043.     ((uint16 *)st)[0] = X;
  1044.  
  1045.     #if CHACHA_SMALL_UNROLL == 1
  1046.  
  1047.     for(int i = 0; i < 10; ++i)
  1048.     {
  1049.         CHACHA_CORE_PARALLEL(st);
  1050.     }
  1051.  
  1052.     #elif CHACHA_SMALL_UNROLL == 2
  1053.  
  1054.     for(int i = 0; i < 5; ++i)
  1055.     {
  1056.         CHACHA_CORE_PARALLEL(st);
  1057.         CHACHA_CORE_PARALLEL(st);
  1058.     }
  1059.  
  1060.     #elif CHACHA_SMALL_UNROLL == 3
  1061.  
  1062.     for(int i = 0; i < 4; ++i)
  1063.     {
  1064.         CHACHA_CORE_PARALLEL(st);
  1065.         if(i == 3) break;
  1066.         CHACHA_CORE_PARALLEL(st);
  1067.         CHACHA_CORE_PARALLEL(st);
  1068.     }
  1069.  
  1070.     #elif CHACHA_SMALL_UNROLL == 4
  1071.  
  1072.     for(int i = 0; i < 3; ++i)
  1073.     {
  1074.         CHACHA_CORE_PARALLEL(st);
  1075.         CHACHA_CORE_PARALLEL(st);
  1076.         if(i == 2) break;
  1077.         CHACHA_CORE_PARALLEL(st);
  1078.         CHACHA_CORE_PARALLEL(st);
  1079.     }
  1080.  
  1081.     #elif CHACHA_SMALL_UNROLL == 5
  1082.  
  1083.     for(int i = 0; i < 2; ++i)
  1084.     {
  1085.         CHACHA_CORE_PARALLEL(st);
  1086.         CHACHA_CORE_PARALLEL(st);
  1087.         CHACHA_CORE_PARALLEL(st);
  1088.         CHACHA_CORE_PARALLEL(st);
  1089.         CHACHA_CORE_PARALLEL(st);
  1090.     }
  1091.     #else
  1092.    
  1093.     CHACHA_CORE_PARALLEL(st);
  1094.     CHACHA_CORE_PARALLEL(st);
  1095.     CHACHA_CORE_PARALLEL(st);
  1096.     CHACHA_CORE_PARALLEL(st);
  1097.     CHACHA_CORE_PARALLEL(st);
  1098.     CHACHA_CORE_PARALLEL(st);
  1099.     CHACHA_CORE_PARALLEL(st);
  1100.     CHACHA_CORE_PARALLEL(st);
  1101.     CHACHA_CORE_PARALLEL(st);
  1102.     CHACHA_CORE_PARALLEL(st);
  1103.  
  1104.     #endif
  1105.  
  1106.     return(X + ((uint16 *)st)[0]);
  1107. }
  1108.  
  1109. uint16 chacha_small_scalar_rnd(uint16 X)
  1110. {  
  1111.     uint16 st = X;
  1112.    
  1113.     #if CHACHA_SMALL_UNROLL == 1
  1114.    
  1115.     for(int i = 0; i < 10; ++i)
  1116.     {
  1117.         CHACHA_CORE(st);
  1118.     }
  1119.    
  1120.     #elif CHACHA_SMALL_UNROLL == 2
  1121.    
  1122.     for(int i = 0; i < 5; ++i)
  1123.     {
  1124.         CHACHA_CORE(st);
  1125.         CHACHA_CORE(st);
  1126.     }
  1127.    
  1128.     #elif CHACHA_SMALL_UNROLL == 3
  1129.    
  1130.     for(int i = 0; i < 4; ++i)
  1131.     {
  1132.         CHACHA_CORE(st);
  1133.         if(i == 3) break;
  1134.         CHACHA_CORE(st);
  1135.         CHACHA_CORE(st);
  1136.     }
  1137.    
  1138.     #elif CHACHA_SMALL_UNROLL == 4
  1139.    
  1140.     for(int i = 0; i < 3; ++i)
  1141.     {
  1142.         CHACHA_CORE(st);
  1143.         CHACHA_CORE(st);
  1144.         if(i == 2) break;
  1145.         CHACHA_CORE(st);
  1146.         CHACHA_CORE(st);
  1147.     }
  1148.    
  1149.     #elif CHACHA_SMALL_UNROLL == 5
  1150.    
  1151.     for(int i = 0; i < 2; ++i)
  1152.     {
  1153.         CHACHA_CORE(st);
  1154.         CHACHA_CORE(st);
  1155.         CHACHA_CORE(st);
  1156.         CHACHA_CORE(st);
  1157.         CHACHA_CORE(st);
  1158.     }
  1159.    
  1160.     #else
  1161.    
  1162.     CHACHA_CORE(st);
  1163.     CHACHA_CORE(st);
  1164.     CHACHA_CORE(st);
  1165.     CHACHA_CORE(st);
  1166.     CHACHA_CORE(st);
  1167.     CHACHA_CORE(st);
  1168.     CHACHA_CORE(st);
  1169.     CHACHA_CORE(st);
  1170.     CHACHA_CORE(st);
  1171.     CHACHA_CORE(st);
  1172.    
  1173.     #endif
  1174.        
  1175.     return(X + st);
  1176. }
  1177.  
  1178. void neoscrypt_blkmix_salsa(uint16 XV[4])
  1179. {
  1180.     /* NeoScrypt flow:                   Scrypt flow:
  1181.          Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
  1182.          Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
  1183.          Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
  1184.          Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
  1185.          Xa" = Ya; Xb" = Yc;
  1186.          Xc" = Yb; Xd" = Yd; */
  1187. #if 0
  1188.     for(int i = 0; i < 4; ++i) XV[i] = (uint16)(
  1189.         XV[i].s4, XV[i].s9, XV[i].se, XV[i].s3, XV[i].s8, XV[i].sd, XV[i].s2, XV[i].s7,
  1190.         XV[i].sc, XV[i].s1, XV[i].s6, XV[i].sb, XV[i].s0, XV[i].s5, XV[i].sa, XV[i].sf);  
  1191. #endif
  1192.     XV[0] ^= XV[3];
  1193.  
  1194.     XV[0] = salsa_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
  1195.     XV[1] = salsa_small_parallel_rnd(XV[1]); XV[2] ^= XV[1];
  1196.     XV[2] = salsa_small_parallel_rnd(XV[2]); XV[3] ^= XV[2];
  1197.     XV[3] = salsa_small_parallel_rnd(XV[3]);
  1198.    
  1199.     //XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
  1200.     //XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
  1201.     //XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
  1202.     //XV[3] = salsa_small_scalar_rnd(XV[3]);
  1203.    
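    // XOR-swap XV[1] and XV[2] without a temporary (Xb" = Yc, Xc" = Yb in the flow above)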
  1204.     XV[1] ^= XV[2];
  1205.     XV[2] ^= XV[1];
  1206.     XV[1] ^= XV[2];
  1207. #if 0
  1208.     XV[0] = (uint16)(XV[0].sc, XV[0].s9, XV[0].s6, XV[0].s3, XV[0].s0, XV[0].sd, XV[0].sa, XV[0].s7, XV[0].s4, XV[0].s1, XV[0].se, XV[0].sb, XV[0].s8, XV[0].s5, XV[0].s2, XV[0].sf);
  1209.     XV[1] = (uint16)(XV[1].sc, XV[1].s9, XV[1].s6, XV[1].s3, XV[1].s0, XV[1].sd, XV[1].sa, XV[1].s7, XV[1].s4, XV[1].s1, XV[1].se, XV[1].sb, XV[1].s8, XV[1].s5, XV[1].s2, XV[1].sf);
  1210.     XV[2] = (uint16)(XV[2].sc, XV[2].s9, XV[2].s6, XV[2].s3, XV[2].s0, XV[2].sd, XV[2].sa, XV[2].s7, XV[2].s4, XV[2].s1, XV[2].se, XV[2].sb, XV[2].s8, XV[2].s5, XV[2].s2, XV[2].sf);
  1211.     XV[3] = (uint16)(XV[3].sc, XV[3].s9, XV[3].s6, XV[3].s3, XV[3].s0, XV[3].sd, XV[3].sa, XV[3].s7, XV[3].s4, XV[3].s1, XV[3].se, XV[3].sb, XV[3].s8, XV[3].s5, XV[3].s2, XV[3].sf);
  1212. #endif
  1213. }
  1214.  
  1215. void neoscrypt_blkmix_chacha(uint16 XV[4])
  1216. {
  1217.  
  1218.     /* NeoScrypt flow:                   Scrypt flow:
  1219.          Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
  1220.          Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
  1221.          Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
  1222.          Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
  1223.          Xa" = Ya; Xb" = Yc;
  1224.          Xc" = Yb; Xd" = Yd; */
  1225.  
  1226.     XV[0] ^= XV[3];
  1227.    
  1228.     #if 1
  1229.    
  1230.     XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
  1231.     XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1];
  1232.     XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2];
  1233.     XV[3] = chacha_small_parallel_rnd(XV[3]);
  1234.    
  1235.     #else
  1236.    
  1237.     XV[0] = chacha_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
  1238.     XV[1] = chacha_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
  1239.     XV[2] = chacha_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
  1240.     XV[3] = chacha_small_scalar_rnd(XV[3]);
  1241.    
  1242.     #endif
  1243.    
  1244.     XV[1] ^= XV[2];
  1245.     XV[2] ^= XV[1];
  1246.     XV[1] ^= XV[2];
  1247. }
  1248.  
  1249. #ifdef WIDE_STRIPE
  1250.  
  1251. void ScratchpadStore(__global void *V, void *X, uchar idx)
  1252. {
  1253.     ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0];
  1254.     ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1] = ((ulong16 *)X)[1];
  1255.     //const uint idx2 = mul24(idx << 2, (int)get_global_size(0));
  1256.     //#pragma unroll
  1257.     //for(int i = 0; i < 4; ++i) ((__global uint16 *)V)[idx2 + i] = ((uint16 *)X)[i];
  1258. }
  1259.  
  1260. void ScratchpadMix(void *X, const __global void *V, uchar idx)
  1261. {
  1262.     ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))];
  1263.     ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1];
  1264. }
  1265.  
  1266. #else
  1267.  
  1268. void ScratchpadStore(__global void *V, void *X, uchar idx)
  1269. {
  1270.     ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0];
  1271.     ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))] = ((ulong16 *)X)[1];
  1272. }
  1273.  
  1274. void ScratchpadMix(void *X, const __global void *V, uchar idx)
  1275. {
  1276.     ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))];
  1277.     ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))];
  1278. }
  1279.  
  1280. #endif
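/* Scratchpad layout, as read from the code above: each of the 128 saved states is
 * 256 bytes, stored as two ulong16 rows. Without WIDE_STRIPE the rows sit at indices
 * (2*idx) * global_size and (2*idx + 1) * global_size past the per-work-item base, so
 * the same row of neighbouring work-items is contiguous and accesses coalesce; with
 * WIDE_STRIPE a work-item instead keeps its two rows adjacent to each other. */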
  1281.  
  1282.  
  1283.  
  1284. #define SALSA_PERM      (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15)
  1285. #define SALSA_INV_PERM  (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15)
  1286.  
  1287. void SMix_Salsa(uint16 X[4], __global uint16 *V)
  1288. {
  1289.     #pragma unroll 1
  1290.     for(int i = 0; i < 128; ++i)
  1291.     {
  1292.         ScratchpadStore(V, X, i);
  1293.         neoscrypt_blkmix_salsa(X);
  1294.     }
  1295.  
  1296.     #pragma unroll 1
  1297.     for(int i = 0; i < 128; ++i)
  1298.     {
  1299.         #ifdef SHITMAIN
  1300.         const uint idx = convert_uchar(((uint *)X)[60] & 0x7F);
  1301.         #else
  1302.         const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
  1303.         #endif
  1304.         ScratchpadMix(X, V, idx);
  1305.         neoscrypt_blkmix_salsa(X);
  1306.     }
  1307. }
  1308.  
  1309. void SMix_Chacha(uint16 X[4], __global uint16 *V)
  1310. {
  1311.     #pragma unroll 1
  1312.     for(int i = 0; i < 128; ++i)
  1313.     {
  1314.         ScratchpadStore(V, X, i);
  1315.         neoscrypt_blkmix_chacha(X);
  1316.     }
  1317.  
  1318.     #pragma unroll 1
  1319.     for(int i = 0; i < 128; ++i)
  1320.     {
  1321.         const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
  1322.         ScratchpadMix(X, V, idx);
  1323.         neoscrypt_blkmix_chacha(X);
  1324.     }
  1325. }
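// In both SMix variants the scratchpad read index is the first uint of the fourth
// 64-byte block of X, reduced mod N = 128; with SHITMAIN the state is held in
// SALSA_PERM order, which moves that word to uint offset 60.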
  1326.  
  1330. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1331. __kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
  1332. {
  1333. #define CONSTANT_N 128
  1334. #define CONSTANT_r 2
  1335.     // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
  1336.     uint16 X[4], Z[4];
  1337.     #ifdef WIDE_STRIPE
  1338.     __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1);
  1339.     #else
  1340.     __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0)));
  1341.     #endif
  1342.     //uchar outbuf[32];
  1343.     uchar data[PASSWORD_LEN];
  1344.  
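    // Pull the 80-byte block header into private memory and substitute the nonce
    // (uint word 19, i.e. bytes 76..79) with this work-item's global id.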
  1345.     ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
  1346.     ((ulong *)data)[8] = ((__global const ulong *)input)[8];
  1347.     ((uint *)data)[18] = ((__global const uint *)input)[18];
  1348.     ((uint *)data)[19] = get_global_id(0);
  1349.  
  1350.     // X = KDF(password, salt)
  1351.     //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
  1352.     fastkdf1(data, (uchar *)X);
  1353.    
  1354.     #ifndef SHITMAIN
  1355.     // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2
  1356.     CopyBytes128(Z, X, 2);
  1357.     #else
  1358.    
  1359.     #pragma unroll
  1360.     for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] = shuffle(((uint16 *)X)[i], SALSA_PERM);
  1361.    
  1362.     #endif
  1363.    
  1364.     // X = SMix(X); X & Z are swapped, repeat.
  1365.     for(int i = 0;; ++i)
  1366.     {
  1367.         #ifdef SWAP
  1368.         if (i) SMix_Salsa(X,V); else SMix_Chacha(X,V);
  1369.         if(i) break;
  1370.         SwapBytes128(X, Z, 256);
  1371.         #else
  1372.         if (i) SMix_Chacha(X,V); else SMix_Salsa(Z,V);
  1373.         if(i) break;
  1374.         #endif
  1375.     }
  1376.    
  1377.     #if defined(SWAP) && defined(SHITMAIN)
  1378.     #pragma unroll
  1379.     for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] ^= shuffle(((uint16 *)X)[i], SALSA_INV_PERM);
  1380.     fastkdf2(data, (uchar *)Z, output, target);
  1381.     #elif defined(SHITMAIN)
  1382.     #pragma unroll
  1383.     for(int i = 0; i < 4; ++i) ((uint16 *)X)[i] ^= shuffle(((uint16 *)Z)[i], SALSA_INV_PERM);
  1384.     fastkdf2(data, (uchar *)X, output, target);
  1385.     #else
  1386.     // blkxor(X, Z)
  1387.     ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
  1388.     ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
  1389.  
  1390.     // output = KDF(password, X)
  1391.     //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
  1392.     fastkdf2(data, (uchar *)X, output, target);
  1393.     #endif
  1394. }
  1395.  
  1396.  
  1397. /*
  1398. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1399. __kernel void search(__global const uchar* restrict input, __global uint16 *XZOutput)
  1400. {
  1401. #define CONSTANT_N 128
  1402. #define CONSTANT_r 2
  1403.     // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
  1404.     uint16 X[4];
  1405.     XZOutput += (4 * 2 * get_global_id(0));
  1406.    
  1407.     //uchar outbuf[32];
  1408.     uchar data[PASSWORD_LEN];
  1409.  
  1410.     ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
  1411.     ((ulong *)data)[8] = ((__global const ulong *)input)[8];
  1412.     ((uint *)data)[18] = ((__global const uint *)input)[18];
  1413.     ((uint *)data)[19] = get_global_id(0);
  1414.  
  1415.     // X = KDF(password, salt)
  1416.     //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
  1417.     fastkdf1(data, (uchar *)X);
  1418.    
  1419.     for(int i = 0; i < 4; ++i) XZOutput[i] = X[i];
  1420.     for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i];
  1421.     mem_fence(CLK_GLOBAL_MEM_FENCE);
  1422. }
  1423.  
  1424. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1425. __kernel void search1(__global uint16 *XZOutput, __global uchar *padcache)
  1426. {
  1427. #define CONSTANT_N 128
  1428. #define CONSTANT_r 2
  1429.     // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
  1430.     uint16 X[4], Z[4];
  1431.     #ifdef WIDE_STRIPE
  1432.     __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1);
  1433.     #else
  1434.     __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0)));
  1435.     #endif
  1436.     //uchar outbuf[32];
  1437.    
  1438.     XZOutput += (4 * 2 * get_global_id(0));
  1439.    
  1440.     for(int i = 0; i < 4; ++i) X[i] = XZOutput[i];
  1441.    
  1442.     SMix_Salsa(X,V);
  1443.    
  1444.     for(int i = 0; i < 4; ++i) XZOutput[i] = X[i];
  1445.     mem_fence(CLK_GLOBAL_MEM_FENCE);
  1446. }
  1447.  
  1448. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1449. __kernel void search2(__global uint16 *XZOutput, __global uchar *padcache)
  1450. {
  1451. #define CONSTANT_N 128
  1452. #define CONSTANT_r 2
  1453.     // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
  1454.     uint16 X[4], Z[4];
  1455.     #ifdef WIDE_STRIPE
  1456.     __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1);
  1457.     #else
  1458.     __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0)));
  1459.     #endif
  1460.     //uchar outbuf[32];
  1461.    
  1462.     XZOutput += (4 * 2 * get_global_id(0));
  1463.    
  1464.     for(int i = 0; i < 4; ++i) X[i] = XZOutput[i + 4];
  1465.    
  1466.     SMix_Chacha(X,V);
  1467.    
  1468.     for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i];
  1469.     mem_fence(CLK_GLOBAL_MEM_FENCE);
  1470. }
  1471.  
  1472. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1473. __kernel void search3(__global const uchar* restrict input, __global uint16 *XZOutput, __global uint* restrict output, const uint target)
  1474. {
  1475.     uint16 X[4], Z[4];
  1476.     uchar data[PASSWORD_LEN];
  1477.  
  1478.     ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
  1479.     ((ulong *)data)[8] = ((__global const ulong *)input)[8];
  1480.     ((uint *)data)[18] = ((__global const uint *)input)[18];
  1481.     ((uint *)data)[19] = get_global_id(0);
  1482.    
  1483.     XZOutput += (4 * 2 * get_global_id(0));
  1484.    
  1485.     for(int i = 0; i < 4; ++i) X[i] = XZOutput[i];
  1486.     for(int i = 0; i < 4; ++i) Z[i] = XZOutput[i + 4];
  1487.    
  1488.     // blkxor(X, Z)
  1489.     ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
  1490.     ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
  1491.  
  1492.     // output = KDF(password, X)
  1493.     //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
  1494.     fastkdf2(data, (uchar *)X, output, target);
  1495. }
  1496. */