Advertisement
Lantis

cgminer scrypt optimization

Sep 20th, 2013
5,642
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 27.07 KB | None | 0 0
  1. /*-
  2.  * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
  3.  * 2012-2013 Con Kolivas.
  4.  * 2013 Adam Villena.
  5.  * All rights reserved.
  6.  *
  7.  * Redistribution and use in source and binary forms, with or without
  8.  * modification, are permitted provided that the following conditions
  9.  * are met:
  10.  * 1. Redistributions of source code must retain the above copyright
  11.  *    notice, this list of conditions and the following disclaimer.
  12.  * 2. Redistributions in binary form must reproduce the above copyright
  13.  *    notice, this list of conditions and the following disclaimer in the
  14.  *    documentation and/or other materials provided with the distribution.
  15.  *
  16.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17.  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18.  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19.  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20.  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21.  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22.  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23.  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24.  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25.  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26.  * SUCH DAMAGE.
  27.  *
  28.  * This file was originally written by Colin Percival as part of the Tarsnap
  29.  * online backup system.
  30.  */
  31.  
/* Bit masks for EndianSwap: bytes 0/2 and bytes 1/3 of each 32-bit word. */
__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };

/*
 * Combined constant pool.  Layout (verify indices against the round code):
 *   K[0..62]  - the SHA-256 round constants k[0..63] EXCEPT k[15]
 *               (0xc19bf174), which is stored at K[76]; k[16..63] are
 *               therefore packed down at K[15..62].  This is why round 15
 *               of the compression functions below adds K[76].
 *   K[63..75] - partially pre-evaluated values for the first four rounds of
 *               SHA256_fresh (round constants folded with the SHA-256 IV),
 *               including IV words reused directly: K[66]/K[67] = H4/H5
 *               (0x510e527f/0x9b05688c), K[73] = H0 (0x6a09e667).
 *   K[77..81] - remaining SHA-256 IV words H1, H2, H3, H6, H7.
 *   K[82]     - 0x5c5c5c5c, the HMAC opad byte pattern.
 *   K[83]     - 0x36363636, the HMAC ipad byte pattern.
 *   K[84]     - 0x80000000, the SHA-256 padding marker bit.
 *   K[85]     - 0x3FF, index mask for the N = 1024 scrypt scratchpad.
 *   K[86..88] - message bit lengths used in padding words:
 *               0x280 = 640, 0x4a0 = 1184, 0x300 = 768 bits.
 */
__constant uint K[] = {
    0x428a2f98U,
    0x71374491U,
    0xb5c0fbcfU,
    0xe9b5dba5U,
    0x3956c25bU,
    0x59f111f1U,
    0x923f82a4U,
    0xab1c5ed5U,
    0xd807aa98U,
    0x12835b01U,
    0x243185beU, // 10
    0x550c7dc3U,
    0x72be5d74U,
    0x80deb1feU,
    0x9bdc06a7U,
    0xe49b69c1U,
    0xefbe4786U,
    0x0fc19dc6U,
    0x240ca1ccU,
    0x2de92c6fU,
    0x4a7484aaU, // 20
    0x5cb0a9dcU,
    0x76f988daU,
    0x983e5152U,
    0xa831c66dU,
    0xb00327c8U,
    0xbf597fc7U,
    0xc6e00bf3U,
    0xd5a79147U,
    0x06ca6351U,
    0x14292967U, // 30
    0x27b70a85U,
    0x2e1b2138U,
    0x4d2c6dfcU,
    0x53380d13U,
    0x650a7354U,
    0x766a0abbU,
    0x81c2c92eU,
    0x92722c85U,
    0xa2bfe8a1U,
    0xa81a664bU, // 40
    0xc24b8b70U,
    0xc76c51a3U,
    0xd192e819U,
    0xd6990624U,
    0xf40e3585U,
    0x106aa070U,
    0x19a4c116U,
    0x1e376c08U,
    0x2748774cU,
    0x34b0bcb5U, // 50
    0x391c0cb3U,
    0x4ed8aa4aU,
    0x5b9cca4fU,
    0x682e6ff3U,
    0x748f82eeU,
    0x78a5636fU,
    0x84c87814U,
    0x8cc70208U,
    0x90befffaU,
    0xa4506cebU, // 60
    0xbef9a3f7U,
    0xc67178f2U,
    0x98c7e2a2U,
    0xfc08884dU,
    0xcd2a11aeU,
    0x510e527fU,
    0x9b05688cU,
    0xC3910C8EU,
    0xfb6feee7U,
    0x2a01a605U, // 70
    0x0c2e12e0U,
    0x4498517BU,
    0x6a09e667U,
    0xa4ce148bU,
    0x95F61999U,
    0xc19bf174U,
    0xBB67AE85U,
    0x3C6EF372U,
    0xA54FF53AU,
    0x1F83D9ABU, // 80
    0x5BE0CD19U,
    0x5C5C5C5CU,
    0x36363636U,
    0x80000000U,
    0x000003FFU,
    0x00000280U,
    0x000004a0U,
    0x00000300U
};
  124.  
/* Left rotate via the OpenCL builtin. */
#define rotl(x,y) rotate(x,y)
/* SHA-256 Ch(x,y,z) = (x & y) ^ (~x & z), as a single bitselect. */
#define Ch(x,y,z) bitselect(z,y,x)
/* SHA-256 Maj(x,y,z) via the identity Maj(x,y,z) = Ch(x^z, y, z). */
#define Maj(x,y,z) Ch((x^z),y,z)

/* 32-bit byteswap: bytes 0/2 rotate left 24, bytes 1/3 rotate left 8. */
#define EndianSwap(n) (rotl(n & ES[0], 24U)|rotl(n & ES[1], 8U))

/* Sigma0: rotr(x,2) ^ rotr(x,13) ^ rotr(x,22), written as left rotates. */
#define Tr2(x)      (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
/* Sigma1: rotr(x,6) ^ rotr(x,11) ^ rotr(x,25). */
#define Tr1(x)      (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
/* sigma0: rotr(x,7) ^ rotr(x,18) ^ (x >> 3). */
#define Wr2(x)      (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
/* sigma1: rotr(x,17) ^ rotr(x,19) ^ (x >> 10). */
#define Wr1(x)      (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

/*
 * One SHA-256 round; callers rotate the register names instead of
 * shuffling values, and k is the pre-summed W[t] + K[t].
 * NOTE(review): this expands to six bare statements with no do/while(0)
 * guard, so it is only safe as a full standalone statement -- all call
 * sites in this file use it that way.
 */
#define RND(a, b, c, d, e, f, g, h, k)  \
    h += Tr1(e);            \
    h += Ch(e, f, g);       \
    h += k;             \
    d += h;             \
    h += Tr2(a);            \
    h += Maj(a, b, c);
  143.  
/*
 * SHA256: one application of the SHA-256 compression function.
 *
 * block0..block3 carry a 64-byte message block as 16 big-endian words;
 * *state0 holds working variables a..d and *state1 holds e..h.  The
 * Davies-Meyer feed-forward at the end (*state += working copy) makes
 * this a complete chaining step, so callers invoke it once per block.
 *
 * The message schedule uses a rolling 16-word window W[0..3] (4 x uint4),
 * expanded in place from round 16 on as
 *     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
 *
 * NOTE: round 15 adds K[76], not K[15] -- the K table stores the
 * standard k[15] (0xc19bf174) at index 76 and packs k[16..63] at
 * K[15..62]; all indices below follow that layout.
 */
void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
    uint4 S0 = *state0;
    uint4 S1 = *state1;

/* Alias the eight working variables; RND rotates the names per round. */
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w

    uint4 W[4];

    /* Rounds 0..15: the schedule is just the message words. */
    W[ 0].x = block0.x;
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]);
    W[ 0].y = block0.y;
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]);
    W[ 0].z = block0.z;
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]);
    W[ 0].w = block0.w;
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]);

    W[ 1].x = block1.x;
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
    W[ 1].y = block1.y;
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
    W[ 1].z = block1.z;
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
    W[ 1].w = block1.w;
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);

    W[ 2].x = block2.x;
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
    W[ 2].y = block2.y;
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
    W[ 2].z = block2.z;
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
    W[ 2].w = block2.w;
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);

    W[ 3].x = block3.x;
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
    W[ 3].y = block3.y;
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
    W[ 3].z = block3.z;
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
    W[ 3].w = block3.w;
    /* k[15] lives at K[76] -- see the K table comment. */
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]);

    /* Rounds 16..31: expand the schedule in place. */
    W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);

    W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);

    W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);

    W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);

    W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);

    W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);

    W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);

    W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);

    W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);

    W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);

    W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);

    W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);

    W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);

    W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);

    W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);

    W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);

    /* Rounds 32..47. */
    W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);

    W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);

    W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);

    W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);

    W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);

    W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);

    W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);

    W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);

    W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);

    W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);

    W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);

    W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);

    W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);

    W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);

    W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);

    W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);

    /* Rounds 48..63. */
    W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);

    W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);

    W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);

    W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);

    W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);

    W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);

    W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);

    W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);

    W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);

    W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);

    W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);

    W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);

    W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);

    W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);

    W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);

    W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);

#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

    /* Davies-Meyer feed-forward into the caller's chaining state. */
    *state0 += S0;
    *state1 += S1;
}
  352.  
/*
 * SHA256_fresh: SHA-256 compression of one 64-byte block starting from
 * the standard SHA-256 initial value (IV), writing the result directly
 * into *state0 (a..d) and *state1 (e..h).
 *
 * Because the starting state is a compile-time constant, the first four
 * rounds are partially pre-evaluated: K[63..75] hold the folded
 * constant terms (round constants combined with IV words), and the
 * remaining IV words appear inline as Ch/Maj operands.  Rounds 4..63
 * are identical to SHA256() above, including the K[76]-for-k[15]
 * table quirk at round 15.
 *
 * The final "+=" of SHA256 is replaced by adding the IV directly:
 * K[73],K[77..79] are H0..H3 and K[66],K[67],K[80],K[81] are H4..H7.
 */
void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
/* Work straight in the output state -- no feed-forward copy is needed. */
#define A (*state0).x
#define B (*state0).y
#define C (*state0).z
#define D (*state0).w
#define E (*state1).x
#define F (*state1).y
#define G (*state1).z
#define H (*state1).w

    uint4 W[4];

    /* Rounds 0..3 with the IV folded into precomputed constants. */
    W[0].x = block0.x;
    D= K[63] +W[0].x;
    H= K[64] +W[0].x;

    W[0].y = block0.y;
    C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y;
    G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]);

    W[0].z = block0.z;
    B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z;
    F= K[72] +B+Tr2(G)+Maj(G,H, K[73]);

    W[0].w = block0.w;
    A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w;
    E= K[75] +A+Tr2(F)+Maj(F,G,H);

    /* Rounds 4..15: schedule is the raw message words. */
    W[1].x = block1.x;
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
    W[1].y = block1.y;
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
    W[1].z = block1.z;
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
    W[1].w = block1.w;
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);

    W[2].x = block2.x;
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
    W[2].y = block2.y;
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
    W[2].z = block2.z;
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
    W[2].w = block2.w;
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);

    W[3].x = block3.x;
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
    W[3].y = block3.y;
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
    W[3].z = block3.z;
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
    W[3].w = block3.w;
    /* k[15] lives at K[76] -- see the K table comment. */
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]);

    /* Rounds 16..31: in-place schedule expansion. */
    W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);

    W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);

    W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);

    W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);

    W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);

    W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);

    W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);

    W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);

    W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);

    W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);

    W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);

    W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);

    W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);

    W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);

    W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);

    W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);

    /* Rounds 32..47. */
    W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);

    W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);

    W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);

    W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);

    W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);

    W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);

    W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);

    W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);

    W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);

    W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);

    W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);

    W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);

    W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);

    W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);

    W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);

    W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);

    /* Rounds 48..63. */
    W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
    RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);

    W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
    RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);

    W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
    RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);

    W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
    RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);

    W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
    RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);

    W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
    RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);

    W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
    RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);

    W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
    RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);

    W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
    RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);

    W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
    RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);

    W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
    RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);

    W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
    RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);

    W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
    RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);

    W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
    RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);

    W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
    RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);

    W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
    RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);

#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

    /* Feed-forward of the constant IV: H0..H3 then H4..H7. */
    *state0 += (uint4)(K[73], K[77], K[78], K[79]);
    *state1 += (uint4)(K[66], K[67], K[80], K[81]);
}
  565.  
/*
 * fixedW: fully precomputed W[t] + k[t] for the one message block that
 * never varies -- the tail of the final inner PBKDF2 hash, i.e. the
 * big-endian block counter 0x00000001, the 0x80000000 pad bit, zeros,
 * and the 0x620-bit (196 = 64 ipad + 128 data + 4 counter bytes)
 * length word.  Since the whole schedule is constant it is folded with
 * the round constants at build time and consumed by SHA256_fixed.
 * Spot-checkable: fixedW[0] = k0 + 1 = 0x428a2f99,
 * fixedW[1] = k1 + 0x80000000 = 0xf1374491,
 * fixedW[15] = k15 + 0x620 = 0xc19bf794.
 */
__constant uint fixedW[64] =
{
    0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
    0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
    0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
    0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
    0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
    0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
    0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
};
  577.  
/*
 * SHA256_fixed: SHA-256 compression of the constant final padding block
 * whose schedule-plus-round-constants are precomputed in fixedW[] --
 * no message words or K[] lookups are needed, just 64 RND steps and the
 * feed-forward.  Used once per hash in search() after the two data
 * blocks of the final inner PBKDF2 hash.
 */
void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
{
    uint4 S0 = *state0;
    uint4 S1 = *state1;

/* Working-variable aliases, rotated by RND's argument order. */
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w

    RND(A,B,C,D,E,F,G,H, fixedW[0]);
    RND(H,A,B,C,D,E,F,G, fixedW[1]);
    RND(G,H,A,B,C,D,E,F, fixedW[2]);
    RND(F,G,H,A,B,C,D,E, fixedW[3]);
    RND(E,F,G,H,A,B,C,D, fixedW[4]);
    RND(D,E,F,G,H,A,B,C, fixedW[5]);
    RND(C,D,E,F,G,H,A,B, fixedW[6]);
    RND(B,C,D,E,F,G,H,A, fixedW[7]);
    RND(A,B,C,D,E,F,G,H, fixedW[8]);
    RND(H,A,B,C,D,E,F,G, fixedW[9]);
    RND(G,H,A,B,C,D,E,F, fixedW[10]);
    RND(F,G,H,A,B,C,D,E, fixedW[11]);
    RND(E,F,G,H,A,B,C,D, fixedW[12]);
    RND(D,E,F,G,H,A,B,C, fixedW[13]);
    RND(C,D,E,F,G,H,A,B, fixedW[14]);
    RND(B,C,D,E,F,G,H,A, fixedW[15]);
    RND(A,B,C,D,E,F,G,H, fixedW[16]);
    RND(H,A,B,C,D,E,F,G, fixedW[17]);
    RND(G,H,A,B,C,D,E,F, fixedW[18]);
    RND(F,G,H,A,B,C,D,E, fixedW[19]);
    RND(E,F,G,H,A,B,C,D, fixedW[20]);
    RND(D,E,F,G,H,A,B,C, fixedW[21]);
    RND(C,D,E,F,G,H,A,B, fixedW[22]);
    RND(B,C,D,E,F,G,H,A, fixedW[23]);
    RND(A,B,C,D,E,F,G,H, fixedW[24]);
    RND(H,A,B,C,D,E,F,G, fixedW[25]);
    RND(G,H,A,B,C,D,E,F, fixedW[26]);
    RND(F,G,H,A,B,C,D,E, fixedW[27]);
    RND(E,F,G,H,A,B,C,D, fixedW[28]);
    RND(D,E,F,G,H,A,B,C, fixedW[29]);
    RND(C,D,E,F,G,H,A,B, fixedW[30]);
    RND(B,C,D,E,F,G,H,A, fixedW[31]);
    RND(A,B,C,D,E,F,G,H, fixedW[32]);
    RND(H,A,B,C,D,E,F,G, fixedW[33]);
    RND(G,H,A,B,C,D,E,F, fixedW[34]);
    RND(F,G,H,A,B,C,D,E, fixedW[35]);
    RND(E,F,G,H,A,B,C,D, fixedW[36]);
    RND(D,E,F,G,H,A,B,C, fixedW[37]);
    RND(C,D,E,F,G,H,A,B, fixedW[38]);
    RND(B,C,D,E,F,G,H,A, fixedW[39]);
    RND(A,B,C,D,E,F,G,H, fixedW[40]);
    RND(H,A,B,C,D,E,F,G, fixedW[41]);
    RND(G,H,A,B,C,D,E,F, fixedW[42]);
    RND(F,G,H,A,B,C,D,E, fixedW[43]);
    RND(E,F,G,H,A,B,C,D, fixedW[44]);
    RND(D,E,F,G,H,A,B,C, fixedW[45]);
    RND(C,D,E,F,G,H,A,B, fixedW[46]);
    RND(B,C,D,E,F,G,H,A, fixedW[47]);
    RND(A,B,C,D,E,F,G,H, fixedW[48]);
    RND(H,A,B,C,D,E,F,G, fixedW[49]);
    RND(G,H,A,B,C,D,E,F, fixedW[50]);
    RND(F,G,H,A,B,C,D,E, fixedW[51]);
    RND(E,F,G,H,A,B,C,D, fixedW[52]);
    RND(D,E,F,G,H,A,B,C, fixedW[53]);
    RND(C,D,E,F,G,H,A,B, fixedW[54]);
    RND(B,C,D,E,F,G,H,A, fixedW[55]);
    RND(A,B,C,D,E,F,G,H, fixedW[56]);
    RND(H,A,B,C,D,E,F,G, fixedW[57]);
    RND(G,H,A,B,C,D,E,F, fixedW[58]);
    RND(F,G,H,A,B,C,D,E, fixedW[59]);
    RND(E,F,G,H,A,B,C,D, fixedW[60]);
    RND(D,E,F,G,H,A,B,C, fixedW[61]);
    RND(C,D,E,F,G,H,A,B, fixedW[62]);
    RND(B,C,D,E,F,G,H,A, fixedW[63]);

#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
    /* Davies-Meyer feed-forward. */
    *state0 += S0;
    *state1 += S1;
}
  668.  
  669. void shittify(uint4 B[8])
  670. {
  671.     uint4 tmp[4];
  672.     tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
  673.     tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
  674.     tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
  675.     tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
  676.    
  677.     B[0] = EndianSwap(tmp[0]);
  678.     B[1] = EndianSwap(tmp[1]);
  679.     B[2] = EndianSwap(tmp[2]);
  680.     B[3] = EndianSwap(tmp[3]);
  681.  
  682.     tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
  683.     tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
  684.     tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
  685.     tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
  686.    
  687.     B[4] = EndianSwap(tmp[0]);
  688.     B[5] = EndianSwap(tmp[1]);
  689.     B[6] = EndianSwap(tmp[2]);
  690.     B[7] = EndianSwap(tmp[3]);
  691.  
  692. }
  693.  
  694. void unshittify(uint4 B[8])
  695. {
  696.     uint4 tmp[4];
  697.     tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
  698.     tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
  699.     tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
  700.     tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
  701.    
  702.     B[0] = EndianSwap(tmp[0]);
  703.     B[1] = EndianSwap(tmp[1]);
  704.     B[2] = EndianSwap(tmp[2]);
  705.     B[3] = EndianSwap(tmp[3]);
  706.  
  707.     tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
  708.     tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
  709.     tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
  710.     tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
  711.    
  712.     B[4] = EndianSwap(tmp[0]);
  713.     B[5] = EndianSwap(tmp[1]);
  714.     B[6] = EndianSwap(tmp[2]);
  715.     B[7] = EndianSwap(tmp[3]);
  716. }
  717.  
  718. void salsa(uint4 B[8])
  719. {
  720.     uint4 w[4];
  721.  
  722.     w[0] = (B[0]^=B[4]);
  723.     w[1] = (B[1]^=B[5]);
  724.     w[2] = (B[2]^=B[6]);
  725.     w[3] = (B[3]^=B[7]);
  726.  
  727.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  728.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  729.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  730.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  731.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  732.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  733.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  734.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  735.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  736.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  737.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  738.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  739.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  740.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  741.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  742.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  743.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  744.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  745.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  746.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  747.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  748.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  749.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  750.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  751.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  752.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  753.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  754.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  755.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  756.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  757.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  758.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  759.  
  760.     w[0] = (B[4]^=(B[0]+=w[0]));
  761.     w[1] = (B[5]^=(B[1]+=w[1]));
  762.     w[2] = (B[6]^=(B[2]+=w[2]));
  763.     w[3] = (B[7]^=(B[3]+=w[3]));
  764.  
  765.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  766.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  767.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  768.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  769.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  770.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  771.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  772.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  773.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  774.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  775.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  776.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  777.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  778.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  779.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  780.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  781.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  782.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  783.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  784.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  785.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  786.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  787.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  788.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  789.     w[0] ^= rotl(w[3]     +w[2]     , 7U);
  790.     w[1] ^= rotl(w[0]     +w[3]     , 9U);
  791.     w[2] ^= rotl(w[1]     +w[0]     ,13U);
  792.     w[3] ^= rotl(w[2]     +w[1]     ,18U);
  793.     w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  794.     w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  795.     w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  796.     w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  797.  
  798.     B[4] += w[0];
  799.     B[5] += w[1];
  800.     B[6] += w[2];
  801.     B[7] += w[3];
  802. }
  803.  
  804. void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
  805. {
  806.     shittify(X);
  807.     uint xs = get_global_id(0)%24000*8;     //replace 24000 with your thread concurrency
  808.    
  809.     for(uint y=0; y<512; ++y)               //512 value was 1024 divided by LOOKUP_GAP (2 at this value).  2 is the best, if you have a different lookup gap  
  810.     {                                       //consider using 4 or 8 not other numbers, because it will hit the conditional preprocessor below, and you don't want that.
  811.    
  812.         lookup[0+xs+y*192000U] = X[0];      //192000U is thread concurrency * 8 (in this case 24000 * 8).  8 is a fixed value for salsa.
  813.         lookup[1+xs+y*192000U] = X[1];      //replace all 192000U with your thread concurrency * 8.
  814.         lookup[2+xs+y*192000U] = X[2];
  815.         lookup[3+xs+y*192000U] = X[3];
  816.         lookup[4+xs+y*192000U] = X[4];
  817.         lookup[5+xs+y*192000U] = X[5];
  818.         lookup[6+xs+y*192000U] = X[6];
  819.         lookup[7+xs+y*192000U] = X[7];
  820.            
  821.         for(uint i=2; i--;)                 //replace value 2 with you LOOKUP_GAP value.
  822.             salsa(X);
  823.     }
  824. #if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
  825.     {
  826.         uint y = 512;
  827. #pragma unroll
  828.         for(uint z=0; z<zSIZE; ++z)
  829.             lookup[CO] = X[z];
  830.         for(uint i=0; i<1024%LOOKUP_GAP; ++i)
  831.             salsa(X);
  832.     }
  833. #endif
  834.     for (uint i=1024; i--; )                //this loop is the main scrypt 1024 mining value.  The for loop is optimized to remove the condition check, and relies on the i-- for the condition check.
  835.     {
  836.         uint4 V[8];
  837.         uint j = X[7].x & K[85];
  838.         uint y = (j>>1);                    //this is (j / LOOKUP_GAP).  Since our lookup gap is 2, it's better to shift than to do a division.  Replace this with (j / LOOKUP_GAP) if your lookup gap is not 2.
  839.         uint ys = y*192000U;                //replace all 192000U with your thread concurrency * 8.
  840.        
  841.         V[0] = lookup[0+xs+ys];
  842.         V[1] = lookup[1+xs+ys];
  843.         V[2] = lookup[2+xs+ys];
  844.         V[3] = lookup[3+xs+ys];
  845.         V[4] = lookup[4+xs+ys];
  846.         V[5] = lookup[5+xs+ys];
  847.         V[6] = lookup[6+xs+ys];
  848.         V[7] = lookup[7+xs+ys];
  849.  
  850. #if (LOOKUP_GAP == 1)
  851. #elif (LOOKUP_GAP == 2)
  852.         if (j&1)
  853.             salsa(V);
  854. #else
  855.         uint val = j%LOOKUP_GAP;
  856.         for (uint z=0; z<val; ++z)
  857.             salsa(V);
  858. #endif
  859.  
  860.         X[0] ^= V[0];
  861.         X[1] ^= V[1];
  862.         X[2] ^= V[2];
  863.         X[3] ^= V[3];
  864.         X[4] ^= V[4];
  865.         X[5] ^= V[5];
  866.         X[6] ^= V[6];
  867.         X[7] ^= V[7];
  868.         salsa(X);
  869.     }
  870.     unshittify(X);
  871. }
  872.  
/* output[0xFF] is the found-nonce counter; winning nonces are appended
 * at the front of the buffer before it.  NOTE(review): the read-
 * increment-write on the counter is not atomic, so two work-items
 * finding shares in one launch could race -- confirm the host side
 * tolerates this (it is the original kernel's behavior). */
#define SCRYPT_FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
  875.  
/*
 * search: one scrypt(N=1024, r=1, p=1) proof-of-work attempt per
 * work-item; the global work-item id is the trial nonce.
 *
 *   input[0..3]   first 64 bytes of the 80-byte block header
 *   input[4]      header bytes 64..75; the nonce fills bytes 76..79
 *   midstate0/16  SHA-256 state after input[0..3] (host-precomputed)
 *   padcache      global scratchpad (slot layout: see scrypt_core)
 *   output        result buffer; winning nonces appended via SETFOUND
 *   target        32-bit share target compared against the top hash word
 *
 * Data constants: K[82]/K[83] are the HMAC opad/ipad byte patterns,
 * K[84] = 0x80000000 is the SHA-256 pad bit, and K[86..88] are padding
 * bit-length words (0x280 = 80-byte header, 0x4a0 = 148-byte inner
 * message, 0x300 = 96-byte outer message).
 */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 * restrict input,
volatile __global uint*restrict output, __global uint4*restrict padcache,
const uint4 midstate0, const uint4 midstate16, const uint target)
{
    uint gid = get_global_id(0);
    uint4 X[8];
    uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
    uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
    uint4 pad0 = midstate0, pad1 = midstate16;

    /* Finish SHA-256 of the 80-byte header (pad bit + 640-bit length). */
    SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86]));
    /* HMAC key schedule: hash the header-hash XOR opad / ipad blocks. */
    SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]);
    SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]);

    /* Keep the bare ipad state for the final PBKDF2 pass. */
    tmp0 = tstate0;
    tmp1 = tstate1;
    SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);

    /* PBKDF2 blocks 1..4: each pair of SHA256 calls is the inner-hash
     * tail (counter i, pad, 0x4a0-bit length) then the outer hash
     * (digest, pad, 0x300-bit length), yielding 32 bytes of X apiece. */
    pad0 = tstate0;
    pad1 = tstate1;
    X[0] = ostate0;
    X[1] = ostate1;

    SHA256(&pad0,&pad1, data, (uint4)(1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
    SHA256(X,X+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));

    pad0 = tstate0;
    pad1 = tstate1;
    X[2] = ostate0;
    X[3] = ostate1;

    SHA256(&pad0,&pad1, data, (uint4)(2,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
    SHA256(X+2,X+3, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));

    pad0 = tstate0;
    pad1 = tstate1;
    X[4] = ostate0;
    X[5] = ostate1;

    SHA256(&pad0,&pad1, data, (uint4)(3,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
    SHA256(X+4,X+5, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));

    pad0 = tstate0;
    pad1 = tstate1;
    X[6] = ostate0;
    X[7] = ostate1;

    SHA256(&pad0,&pad1, data, (uint4)(4,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
    SHA256(X+6,X+7, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));

    /* ROMix over the 128-byte X, then the final PBKDF2(P, X, 1, 32):
     * two data blocks, the constant counter/padding block, and the
     * outer opad hash. */
    scrypt_core(X,padcache);
    SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
    SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
    SHA256_fixed(&tmp0,&tmp1);
    SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));

    /* Share test: byte-swapped top word of the hash against the target. */
    bool result = (EndianSwap(ostate1.w) <= target);
    if (result)
        SETFOUND(gid);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement