Advertisement
ig0rb

latest

Dec 17th, 2013
581
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 29.67 KB | None | 0 0
  1. /*-
  2.  * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
  3.  * 2012-2013 Con Kolivas.
  4.  * 2013-9999 Ig0rb
  5.  * All rights reserved.
  6.  *
  7.  * Redistribution and use in source and binary forms, with or without
  8.  * modification, are permitted provided that the following conditions
  9.  * are met:
  10.  * 1. Redistributions of source code must retain the above copyright
  11.  *    notice, this list of conditions and the following disclaimer.
  12.  * 2. Redistributions in binary form must reproduce the above copyright
  13.  *    notice, this list of conditions and the following disclaimer in the
  14.  *    documentation and/or other materials provided with the distribution.
  15.  *
  16.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17.  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18.  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19.  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20.  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21.  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22.  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23.  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24.  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25.  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26.  * SUCH DAMAGE.
  27.  *
  28.  * This file was originally written by Colin Percival as part of the Tarsnap
  29.  * online backup system.
  30.  */
  31.  
  // __constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; -- no longer a table: the two masks are written as literal constants in EndianSwap
  33. /*
  34. __constant uint K[] = {
  35.     0x428a2f98U,
  36.     0x71374491U,
  37.     0xb5c0fbcfU,
  38.     0xe9b5dba5U,
  39.     0x3956c25bU,
  40.     0x59f111f1U,
  41.     0x923f82a4U,
  42.     0xab1c5ed5U,
  43.     0xd807aa98U,
  44.     0x12835b01U,
  45.     0x243185beU, // 10
  46.     0x550c7dc3U,
  47.     0x72be5d74U,
  48.     0x80deb1feU,
  49.     0x9bdc06a7U,
  50.     0xe49b69c1U,
  51.     0xefbe4786U,
  52.     0x0fc19dc6U,
  53.     0x240ca1ccU,
  54.     0x2de92c6fU,
  55.     0x4a7484aaU, // 20
  56.     0x5cb0a9dcU,
  57.     0x76f988daU,
  58.     0x983e5152U,
  59.     0xa831c66dU,
  60.     0xb00327c8U,
  61.     0xbf597fc7U,
  62.     0xc6e00bf3U,
  63.     0xd5a79147U,
  64.     0x06ca6351U,
  65.     0x14292967U, // 30
  66.     0x27b70a85U,
  67.     0x2e1b2138U,
  68.     0x4d2c6dfcU,
  69.     0x53380d13U,
  70.     0x650a7354U,
  71.     0x766a0abbU,
  72.     0x81c2c92eU,
  73.     0x92722c85U,
  74.     0xa2bfe8a1U,
  75.     0xa81a664bU, // 40
  76.     0xc24b8b70U,
  77.     0xc76c51a3U,
  78.     0xd192e819U,
  79.     0xd6990624U,
  80.     0xf40e3585U,
  81.     0x106aa070U,
  82.     0x19a4c116U,
  83.     0x1e376c08U,
  84.     0x2748774cU,
  85.     0x34b0bcb5U, // 50
  86.     0x391c0cb3U,
  87.     0x4ed8aa4aU,
  88.     0x5b9cca4fU,
  89.     0x682e6ff3U,
  90.     0x748f82eeU,
  91.     0x78a5636fU,
  92.     0x84c87814U,
  93.     0x8cc70208U,
  94.     0x90befffaU,
  95.     0xa4506cebU, // 60
  96.     0xbef9a3f7U,
  97.     0xc67178f2U,
  98.     0x98c7e2a2U,
  99.     0xfc08884dU,
  100.     0xcd2a11aeU,
  101.     0x510e527fU,
  102.     0x9b05688cU,
  103.     0xC3910C8EU,
  104.     0xfb6feee7U,
  105.     0x2a01a605U, // 70
  106.     0x0c2e12e0U,
  107.     0x4498517BU,
  108.     0x6a09e667U,
  109.     0xa4ce148bU,
  110.     0x95F61999U,
  111.     0xc19bf174U,
  112.     0xBB67AE85U,
  113.     0x3C6EF372U,
  114.     0xA54FF53AU,
  115.     0x1F83D9ABU, // 80
  116.     0x5BE0CD19U,
  117.     0x5C5C5C5CU,
  118.     0x36363636U,
  119.     0x80000000U,
  120.     0x000003FFU,
  121.     0x00000280U,
  122.     0x000004a0U,
  123.     0x00000300U
  124. };
  125. */
  126.  
  /*
   * Bit-manipulation helpers used by the SHA-256 routines below.
   *
   * Every macro argument is parenthesised so that compound expressions
   * (for example `a | b`) can be passed without changing the result.
   */

  /* Left rotation, delegated to the OpenCL built-in rotate(). */
  #define rotl(x, y) rotate(x, y)

  /*
   * Choose: each result bit comes from y where x is 1 and from z where x is 0.
   * An equivalent built-in form would be bitselect(z, y, x).
   */
  #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))

  /* Majority of the three inputs, expressed through Ch. */
  #define Maj(x, y, z) Ch(((x) ^ (z)), (y), (z))

  /* Reverses the byte order of each 32-bit word of n. */
  #define EndianSwap(n) (rotl((n) & 0x00FF00FF, 24U) | rotl((n) & 0xFF00FF00, 8U))

  /*
   * SHA-256 mixing functions written with left rotations.  On 32-bit words a
   * left rotation by r equals a right rotation by 32 - r, so Tr2 rotates right
   * by 2/13/22, Tr1 by 6/11/25, Wr2 by 7/18 (plus >>3), Wr1 by 17/19 (plus >>10).
   */
  #define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
  #define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
  #define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ ((x) >> 3U))
  #define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ ((x) >> 10U))

  /*
   * One SHA-256 round.
   *
   * a..h are the eight working variables (lvalues) in the order that applies
   * to this round; k is the round constant already added to the message word.
   * Only d and h are written: callers rotate the argument order from round to
   * round instead of moving values between variables.
   *
   * Wrapped in do { } while (0) so that `RND(...);` is a single statement.
   */
  #define RND(a, b, c, d, e, f, g, h, k) \
      do {                               \
          h += Tr1(e);                   \
          h += Ch(e, f, g);              \
          h += (k);                      \
          d += h;                        \
          h += Tr2(a);                   \
          h += Maj(a, b, c);             \
      } while (0)
  147.  
  148. void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
  149. {
  150.     uint4 S0 = *state0;
  151.     uint4 S1 = *state1;
  152.    
  153. #define A S0.x
  154. #define B S0.y
  155. #define C S0.z
  156. #define D S0.w
  157. #define E S1.x
  158. #define F S1.y
  159. #define G S1.z
  160. #define H S1.w
  161.  
  162.     uint4 W[4];
  163.  
  164.     W[ 0].x = block0.x;
  165.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0x428a2f98U);
  166.     W[ 0].y = block0.y;
  167.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0x71374491U);
  168.     W[ 0].z = block0.z;
  169.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0xb5c0fbcfU);
  170.     W[ 0].w = block0.w;
  171.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0xe9b5dba5U);
  172.  
  173.     W[ 1].x = block1.x;
  174.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x3956c25bU);
  175.     W[ 1].y = block1.y;
  176.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x59f111f1U);
  177.     W[ 1].z = block1.z;
  178.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x923f82a4U);
  179.     W[ 1].w = block1.w;
  180.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0xab1c5ed5U);
  181.  
  182.     W[ 2].x = block2.x;
  183.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0xd807aa98U);
  184.     W[ 2].y = block2.y;
  185.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0x12835b01U);
  186.     W[ 2].z = block2.z;
  187.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0x243185beU);
  188.     W[ 2].w = block2.w;
  189.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0x550c7dc3U);
  190.  
  191.     W[ 3].x = block3.x;
  192.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0x72be5d74U);
  193.     W[ 3].y = block3.y;
  194.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0x80deb1feU);
  195.     W[ 3].z = block3.z;
  196.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0x9bdc06a7U);
  197.     W[ 3].w = block3.w;
  198.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0xc19bf174U);
  199.  
  200.     W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
  201.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0xe49b69c1U);
  202.  
  203.     W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
  204.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0xefbe4786U);
  205.  
  206.     W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
  207.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x0fc19dc6U);
  208.  
  209.     W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
  210.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x240ca1ccU);
  211.  
  212.     W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
  213.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x2de92c6fU);
  214.  
  215.     W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
  216.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x4a7484aaU);
  217.  
  218.     W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
  219.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x5cb0a9dcU);
  220.  
  221.     W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
  222.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x76f988daU);
  223.  
  224.     W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
  225.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0x983e5152U);
  226.  
  227.     W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
  228.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0xa831c66dU);
  229.  
  230.     W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
  231.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0xb00327c8U);
  232.  
  233.     W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
  234.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0xbf597fc7U);
  235.  
  236.     W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
  237.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0xc6e00bf3U);
  238.  
  239.     W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
  240.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xd5a79147U);
  241.  
  242.     W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
  243.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0x06ca6351U);
  244.  
  245.     W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
  246.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0x14292967U);
  247.  
  248.     W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
  249.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0x27b70a85U);
  250.  
  251.     W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
  252.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0x2e1b2138U);
  253.  
  254.     W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
  255.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x4d2c6dfcU);
  256.  
  257.     W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
  258.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x53380d13U);
  259.  
  260.     W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
  261.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x650a7354U);
  262.  
  263.     W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
  264.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x766a0abbU);
  265.  
  266.     W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
  267.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x81c2c92eU);
  268.  
  269.     W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
  270.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x92722c85U);
  271.  
  272.     W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
  273.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0xa2bfe8a1U);
  274.  
  275.     W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
  276.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0xa81a664bU);
  277.  
  278.     W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
  279.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0xc24b8b70U);
  280.  
  281.     W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
  282.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0xc76c51a3U);
  283.  
  284.     W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
  285.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0xd192e819U);
  286.  
  287.     W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
  288.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xd6990624U);
  289.  
  290.     W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
  291.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0xf40e3585U);
  292.  
  293.     W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
  294.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0x106aa070U);
  295.  
  296.     W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
  297.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0x19a4c116U);
  298.  
  299.     W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
  300.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0x1e376c08U);
  301.  
  302.     W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
  303.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x2748774cU);
  304.  
  305.     W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
  306.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x34b0bcb5U);
  307.  
  308.     W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
  309.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x391c0cb3U);
  310.  
  311.     W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
  312.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x4ed8aa4aU);
  313.  
  314.     W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
  315.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x5b9cca4fU);
  316.  
  317.     W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
  318.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x682e6ff3U);
  319.  
  320.     W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
  321.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0x748f82eeU);
  322.  
  323.     W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
  324.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0x78a5636fU);
  325.  
  326.     W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
  327.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0x84c87814U);
  328.  
  329.     W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
  330.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0x8cc70208U);
  331.  
  332.     W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
  333.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0x90befffaU);
  334.  
  335.     W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
  336.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xa4506cebU);
  337.  
  338.     W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
  339.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0xbef9a3f7U);
  340.  
  341.     W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
  342.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0xc67178f2U);
  343.    
  344. #undef A
  345. #undef B
  346. #undef C
  347. #undef D
  348. #undef E
  349. #undef F
  350. #undef G
  351. #undef H
  352.  
  353.     *state0 += S0;
  354.     *state1 += S1;
  355. }
  356.  
  357. void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
  358. {
  359. #define A (*state0).x
  360. #define B (*state0).y
  361. #define C (*state0).z
  362. #define D (*state0).w
  363. #define E (*state1).x
  364. #define F (*state1).y
  365. #define G (*state1).z
  366. #define H (*state1).w
  367.  
  368.     uint4 W[4];
  369.  
  370.     W[0].x = block0.x;
  371.     D= 0x98c7e2a2U +W[0].x;
  372.     H= 0xfc08884dU +W[0].x;
  373.  
  374.     W[0].y = block0.y;
  375.     C= 0xcd2a11aeU +Tr1(D)+Ch(D, 0x510e527fU, 0x9b05688cU)+W[0].y;
  376.     G= 0xc3910c8eU +C+Tr2(H)+Ch(H, 0xfb6feee7U ,0x2a01a605U);
  377.  
  378.     W[0].z = block0.z;
  379.     B= 0x0c2e12e0U +Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z;
  380.     F= 0x4498517bU +B+Tr2(G)+Maj(G,H, 0x6a09e667U);
  381.  
  382.     W[0].w = block0.w;
  383.     A= 0xa4ce148bU +Tr1(B)+Ch(B,C,D)+W[0].w;
  384.     E= 0x95f61999U +A+Tr2(F)+Maj(F,G,H);
  385.  
  386.     W[1].x = block1.x;
  387.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x3956c25bU);
  388.     W[1].y = block1.y;
  389.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x59f111f1U);
  390.     W[1].z = block1.z;
  391.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x923f82a4U);
  392.     W[1].w = block1.w;
  393.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0xab1c5ed5U);
  394.    
  395.     W[2].x = block2.x;
  396.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0xd807aa98U);
  397.     W[2].y = block2.y;
  398.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0x12835b01U);
  399.     W[2].z = block2.z;
  400.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0x243185beU);
  401.     W[2].w = block2.w;
  402.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0x550c7dc3U);
  403.    
  404.     W[3].x = block3.x;
  405.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0x72be5d74U);
  406.     W[3].y = block3.y;
  407.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0x80deb1feU);
  408.     W[3].z = block3.z;
  409.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0x9bdc06a7U);
  410.     W[3].w = block3.w;
  411.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0xc19bf174U);
  412.  
  413.     W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
  414.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0xe49b69c1U);
  415.  
  416.     W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
  417.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0xefbe4786U);
  418.  
  419.     W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
  420.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x0fc19dc6U);
  421.  
  422.     W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
  423.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x240ca1ccU);
  424.  
  425.     W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
  426.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x2de92c6fU);
  427.  
  428.     W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
  429.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x4a7484aaU);
  430.  
  431.     W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
  432.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x5cb0a9dcU);
  433.  
  434.     W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
  435.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x76f988daU);
  436.  
  437.     W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
  438.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0x983e5152U);
  439.  
  440.     W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
  441.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0xa831c66dU);
  442.  
  443.     W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
  444.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0xb00327c8U);
  445.  
  446.     W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
  447.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0xbf597fc7U);
  448.  
  449.     W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
  450.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0xc6e00bf3U);
  451.  
  452.     W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
  453.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xd5a79147U);
  454.  
  455.     W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
  456.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0x06ca6351U);
  457.  
  458.     W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
  459.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0x14292967U);
  460.  
  461.     W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
  462.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0x27b70a85U);
  463.  
  464.     W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
  465.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0x2e1b2138U);
  466.  
  467.     W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
  468.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x4d2c6dfcU);
  469.  
  470.     W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
  471.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x53380d13U);
  472.  
  473.     W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
  474.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x650a7354U);
  475.  
  476.     W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
  477.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x766a0abbU);
  478.  
  479.     W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
  480.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x81c2c92eU);
  481.  
  482.     W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
  483.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x92722c85U);
  484.  
  485.     W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
  486.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0xa2bfe8a1U);
  487.  
  488.     W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
  489.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0xa81a664bU);
  490.  
  491.     W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
  492.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0xc24b8b70U);
  493.  
  494.     W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
  495.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0xc76c51a3U);
  496.  
  497.     W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
  498.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0xd192e819U);
  499.  
  500.     W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
  501.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xd6990624U);
  502.  
  503.     W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
  504.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0xf40e3585U);
  505.  
  506.     W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
  507.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0x106aa070U);
  508.  
  509.     W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
  510.     RND(A,B,C,D,E,F,G,H, W[0].x+ 0x19a4c116U);
  511.  
  512.     W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
  513.     RND(H,A,B,C,D,E,F,G, W[0].y+ 0x1e376c08U);
  514.  
  515.     W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
  516.     RND(G,H,A,B,C,D,E,F, W[0].z+ 0x2748774cU);
  517.  
  518.     W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
  519.     RND(F,G,H,A,B,C,D,E, W[0].w+ 0x34b0bcb5U);
  520.  
  521.     W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
  522.     RND(E,F,G,H,A,B,C,D, W[1].x+ 0x391c0cb3U);
  523.  
  524.     W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
  525.     RND(D,E,F,G,H,A,B,C, W[1].y+ 0x4ed8aa4aU);
  526.  
  527.     W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
  528.     RND(C,D,E,F,G,H,A,B, W[1].z+ 0x5b9cca4fU);
  529.  
  530.     W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
  531.     RND(B,C,D,E,F,G,H,A, W[1].w+ 0x682e6ff3U);
  532.  
  533.     W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
  534.     RND(A,B,C,D,E,F,G,H, W[2].x+ 0x748f82eeU);
  535.  
  536.     W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
  537.     RND(H,A,B,C,D,E,F,G, W[2].y+ 0x78a5636fU);
  538.  
  539.     W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
  540.     RND(G,H,A,B,C,D,E,F, W[2].z+ 0x84c87814U);
  541.  
  542.     W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
  543.     RND(F,G,H,A,B,C,D,E, W[2].w+ 0x8cc70208U);
  544.  
  545.     W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
  546.     RND(E,F,G,H,A,B,C,D, W[3].x+ 0x90befffaU);
  547.  
  548.     W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
  549.     RND(D,E,F,G,H,A,B,C, W[3].y+ 0xa4506cebU);
  550.  
  551.     W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
  552.     RND(C,D,E,F,G,H,A,B, W[3].z+ 0xbef9a3f7U);
  553.  
  554.     W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
  555.     RND(B,C,D,E,F,G,H,A, W[3].w+ 0xc67178f2U);
  556.    
  557. #undef A
  558. #undef B
  559. #undef C
  560. #undef D
  561. #undef E
  562. #undef F
  563. #undef G
  564. #undef H
  565.  
  566.     *state0 += (uint4)(0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU);
  567.     *state1 += (uint4)(0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U);
  568. }
  569.  
  570. /*
  571. __constant uint fixedW[64] =
  572. {
  573.     0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
  574.     0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
  575.     0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
  576.     0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
  577.     0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
  578.     0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
  579.     0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
  580.     0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
  581. };
  582. */
  583.  
  584. void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
  585. {
  586.     uint4 S0 = *state0;
  587.     uint4 S1 = *state1;
  588.  
  589. #define A S0.x
  590. #define B S0.y
  591. #define C S0.z
  592. #define D S0.w
  593. #define E S1.x
  594. #define F S1.y
  595. #define G S1.z
  596. #define H S1.w
  597.  
  598.     RND(A,B,C,D,E,F,G,H, 0x428a2f99U);
  599.     RND(H,A,B,C,D,E,F,G, 0xf1374491U);
  600.     RND(G,H,A,B,C,D,E,F, 0xb5c0fbcfU);
  601.     RND(F,G,H,A,B,C,D,E, 0xe9b5dba5U);
  602.     RND(E,F,G,H,A,B,C,D, 0x3956c25bU);
  603.     RND(D,E,F,G,H,A,B,C, 0x59f111f1U);
  604.     RND(C,D,E,F,G,H,A,B, 0x923f82a4U);
  605.     RND(B,C,D,E,F,G,H,A, 0xab1c5ed5U);
  606.     RND(A,B,C,D,E,F,G,H, 0xd807aa98U);
  607.     RND(H,A,B,C,D,E,F,G, 0x12835b01U);
  608.     RND(G,H,A,B,C,D,E,F, 0x243185beU);
  609.     RND(F,G,H,A,B,C,D,E, 0x550c7dc3U);
  610.     RND(E,F,G,H,A,B,C,D, 0x72be5d74U);
  611.     RND(D,E,F,G,H,A,B,C, 0x80deb1feU);
  612.     RND(C,D,E,F,G,H,A,B, 0x9bdc06a7U);
  613.     RND(B,C,D,E,F,G,H,A, 0xc19bf794U);
  614.     RND(A,B,C,D,E,F,G,H, 0xf59b89c2U);
  615.     RND(H,A,B,C,D,E,F,G, 0x73924787U);
  616.     RND(G,H,A,B,C,D,E,F, 0x23c6886eU);
  617.     RND(F,G,H,A,B,C,D,E, 0xa42ca65cU);
  618.     RND(E,F,G,H,A,B,C,D, 0x15ed3627U);
  619.     RND(D,E,F,G,H,A,B,C, 0x4d6edcbfU);
  620.     RND(C,D,E,F,G,H,A,B, 0xe28217fcU);
  621.     RND(B,C,D,E,F,G,H,A, 0xef02488fU);
  622.     RND(A,B,C,D,E,F,G,H, 0xb707775cU);
  623.     RND(H,A,B,C,D,E,F,G, 0x0468c23fU);
  624.     RND(G,H,A,B,C,D,E,F, 0xe7e72b4cU);
  625.     RND(F,G,H,A,B,C,D,E, 0x49e1f1a2U);
  626.     RND(E,F,G,H,A,B,C,D, 0x4b99c816U);
  627.     RND(D,E,F,G,H,A,B,C, 0x926d1570U);
  628.     RND(C,D,E,F,G,H,A,B, 0xaa0fc072U);
  629.     RND(B,C,D,E,F,G,H,A, 0xadb36e2cU);
  630.     RND(A,B,C,D,E,F,G,H, 0xad87a3eaU);
  631.     RND(H,A,B,C,D,E,F,G, 0xbcb1d3a3U);
  632.     RND(G,H,A,B,C,D,E,F, 0x7b993186U);
  633.     RND(F,G,H,A,B,C,D,E, 0x562b9420U);
  634.     RND(E,F,G,H,A,B,C,D, 0xbff3ca0cU);
  635.     RND(D,E,F,G,H,A,B,C, 0xda4b0c23U);
  636.     RND(C,D,E,F,G,H,A,B, 0x6cd8711aU);
  637.     RND(B,C,D,E,F,G,H,A, 0x8f337caaU);
  638.     RND(A,B,C,D,E,F,G,H, 0xc91b1417U);
  639.     RND(H,A,B,C,D,E,F,G, 0xc359dce1U);
  640.     RND(G,H,A,B,C,D,E,F, 0xa83253a7U);
  641.     RND(F,G,H,A,B,C,D,E, 0x3b13c12dU);
  642.     RND(E,F,G,H,A,B,C,D, 0x9d3d725dU);
  643.     RND(D,E,F,G,H,A,B,C, 0xd9031a84U);
  644.     RND(C,D,E,F,G,H,A,B, 0xb1a03340U);
  645.     RND(B,C,D,E,F,G,H,A, 0x16f58012U);
  646.     RND(A,B,C,D,E,F,G,H, 0xe64fb6a2U);
  647.     RND(H,A,B,C,D,E,F,G, 0xe84d923aU);
  648.     RND(G,H,A,B,C,D,E,F, 0xe93a5730U);
  649.     RND(F,G,H,A,B,C,D,E, 0x09837686U);
  650.     RND(E,F,G,H,A,B,C,D, 0x078ff753U);
  651.     RND(D,E,F,G,H,A,B,C, 0x29833341U);
  652.     RND(C,D,E,F,G,H,A,B, 0xd5de0b7eU);
  653.     RND(B,C,D,E,F,G,H,A, 0x6948ccf4U);
  654.     RND(A,B,C,D,E,F,G,H, 0xe0a1adbeU);
  655.     RND(H,A,B,C,D,E,F,G, 0x7c728e11U);
  656.     RND(G,H,A,B,C,D,E,F, 0x511c78e4U);
  657.     RND(F,G,H,A,B,C,D,E, 0x315b45bdU);
  658.     RND(E,F,G,H,A,B,C,D, 0xfca71413U);
  659.     RND(D,E,F,G,H,A,B,C, 0xea28f96aU);
  660.     RND(C,D,E,F,G,H,A,B, 0x79703128U);
  661.     RND(B,C,D,E,F,G,H,A, 0x4e1ef848U);
  662.    
  663. #undef A
  664. #undef B
  665. #undef C
  666. #undef D
  667. #undef E
  668. #undef F
  669. #undef G
  670. #undef H
  671.     *state0 += S0;
  672.     *state1 += S1;
  673. }
  674.  
  675. void shittify(uint4 B[8])
  676. {
  677.     uint4 tmp[4];
  678.     tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
  679.     tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
  680.     tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
  681.     tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
  682.  
  683.    
  684. #pragma unroll
  685.     //for(uint i=0; i<4; ++i)
  686.     for(uint i=0; i!=4; i++)
  687.         B[i] = EndianSwap(tmp[i]);
  688. /*
  689.     B[0] = EndianSwap(tmp[0]);
  690.     B[1] = EndianSwap(tmp[1]);
  691.     B[2] = EndianSwap(tmp[2]);
  692.     B[3] = EndianSwap(tmp[3]);
  693. */
  694.     tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
  695.     tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
  696.     tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
  697.     tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
  698.  
  699.    
  700. #pragma unroll
  701.     //for(uint i=0; i<4; ++i)
  702.     for(uint i=0; i!=4; i++)
  703.         B[i+4] = EndianSwap(tmp[i]);
  704. /*
  705.     B[4] = EndianSwap(tmp[0]);
  706.     B[5] = EndianSwap(tmp[1]);
  707.     B[6] = EndianSwap(tmp[2]);
  708.     B[7] = EndianSwap(tmp[3]);
  709. */
  710. }
  711.  
  712. void unshittify(uint4 B[8])
  713. {
  714.     uint4 tmp[4];
  715.     tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
  716.     tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
  717.     tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
  718.     tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
  719.  
  720.    
  721. #pragma unroll
  722.     //for(uint i=0; i<4; ++i)
  723.     for(uint i=0; i!=4; i++)
  724.         B[i] = EndianSwap(tmp[i]);
  725. /*
  726.     B[0] = EndianSwap(tmp[0]);
  727.     B[1] = EndianSwap(tmp[1]);
  728.     B[2] = EndianSwap(tmp[2]);
  729.     B[3] = EndianSwap(tmp[3]);
  730. */
  731.     tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
  732.     tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
  733.     tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
  734.     tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
  735.    
  736. #pragma unroll
  737.     //for(uint i=0; i<4; ++i)
  738.     for(uint i=0; i!=4; i++)
  739.         B[i+4] = EndianSwap(tmp[i]);
  740. /*
  741.     B[4] = EndianSwap(tmp[0]);
  742.         B[5] = EndianSwap(tmp[1]);
  743.         B[6] = EndianSwap(tmp[2]);
  744.         B[7] = EndianSwap(tmp[3]);
  745. */
  746. }
  747.  
  748. void salsa(uint4 B[8])
  749. {
  750.     uint4 w[4];
  751.  
  752. #pragma unroll
  753.     //for(uint i=0; i<4; ++i)
  754.     for(uint i=0; i!=4; i++)
  755.         w[i] = (B[i]^=B[i+4]);
  756. /*
  757.     w[0] = (B[0]^=B[4]);
  758.     w[1] = (B[1]^=B[5]);
  759.     w[2] = (B[2]^=B[6]);
  760.     w[3] = (B[3]^=B[7]);
  761. */
  762.  
  763. #pragma unroll
  764.     //for(uint i=0; i<4; ++i)
  765.     for(uint i=0; i != 4; ++i)
  766.     {
  767.         w[0] ^= rotl(w[3]     +w[2]     , 7U);
  768.         w[1] ^= rotl(w[0]     +w[3]     , 9U);
  769.         w[2] ^= rotl(w[1]     +w[0]     ,13U);
  770.         w[3] ^= rotl(w[2]     +w[1]     ,18U);
  771.         w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  772.         w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  773.         w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  774.         w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  775.     }
  776. /*
  777.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  778.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  779.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  780.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  781.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  782.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  783.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  784.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  785.  
  786.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  787.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  788.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  789.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  790.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  791.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  792.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  793.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  794.  
  795.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  796.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  797.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  798.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  799.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  800.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  801.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  802.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  803.  
  804.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  805.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  806.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  807.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  808.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  809.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  810.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  811.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  812.  
  813.  
  814.  
  815.  
  816. */
  817. #pragma unroll
  818.     //for(uint i=0; i<4; ++i)
  819.     for(uint i=0; i != 4; i++)
  820.         w[i] = (B[i+4]^=(B[i]+=w[i]));
  821. /*
  822.     w[0] = (B[4]^=(B[0]+=w[0]));
  823.     w[1] = (B[5]^=(B[1]+=w[1]));
  824.     w[2] = (B[6]^=(B[2]+=w[2]));
  825.     w[3] = (B[7]^=(B[3]+=w[3]));
  826. */
  827. #pragma unroll
  828.     for(uint i=0; i != 4; i++)
  829.     {
  830.         w[0] ^= rotl(w[3]     +w[2]     , 7U);
  831.         w[1] ^= rotl(w[0]     +w[3]     , 9U);
  832.         w[2] ^= rotl(w[1]     +w[0]     ,13U);
  833.         w[3] ^= rotl(w[2]     +w[1]     ,18U);
  834.         w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  835.         w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  836.         w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  837.         w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  838.     }
  839. /*
  840.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  841.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  842.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  843.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  844.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  845.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  846.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  847.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  848.  
  849.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  850.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  851.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  852.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  853.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  854.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  855.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  856.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  857.  
  858.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  859.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  860.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  861.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  862.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  863.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  864.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  865.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  866.  
  867.          w[0] ^= rotl(w[3]     +w[2]     , 7U);
  868.          w[1] ^= rotl(w[0]     +w[3]     , 9U);
  869.          w[2] ^= rotl(w[1]     +w[0]     ,13U);
  870.          w[3] ^= rotl(w[2]     +w[1]     ,18U);
  871.          w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
  872.          w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
  873.          w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
  874.          w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
  875.  
  876.  
  877.  
  878. */
  879. #pragma unroll
  880.     //for(uint i=0; i<4; ++i)
  881.     for(uint i=0; i != 4; i++)
  882.         B[i+4] += w[i];
  883. /*
  884.     B[4] += w[0];
  885.     B[5] += w[1];
  886.     B[6] += w[2];
  887.     B[7] += w[3];
  888. */
  889. }
  890.  
  /*
   * Coord(x,y,z): flatten a 3-D coordinate into a linear index.
   *
   * Each argument must be the NAME of a variable for which a matching
   * "<name>SIZE" extent is in scope (e.g. x together with xSIZE); the extent
   * of a dimension is obtained by token-pasting SIZE onto the argument name.
   * The first argument is the fastest-varying dimension.
   *
   * The whole expansion and every value use of an argument are parenthesised
   * so the macro yields the same index when it appears inside a larger
   * expression (for example 2*CO or base+CO).
   */
  #define Coord(x,y,z) ((x)+(y)*(x ## SIZE)+(z)*(y ## SIZE)*(x ## SIZE))
  /*
   * CO: index for the variables z, x, y of the enclosing scope, i.e.
   * z + x*zSIZE + y*xSIZE*zSIZE (requires zSIZE and xSIZE to be in scope).
   */
  #define CO Coord(z,x,y)
  894.  
  895. void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
  896. {
  897.     shittify(X);
  898.     const uint zSIZE = 8;
  899.     //const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
  900.     //const uint ySIZE = (512);
  901.     const uint xSIZE = CONCURRENT_THREADS;
  902.     uint x = get_global_id(0)%xSIZE;
  903.  
  904.  
  905.     //for(uint y=0; y<1024/LOOKUP_GAP; y++)
  906.     for(uint y=0; y != 512; y++)
  907. //  for(uint y=0; y<512; y++)
  908. //  uint y=0;
  909. //  while(y != 512)
  910.     {
  911. #pragma unroll
  912.         //for(uint z=0; z<8; z++)
  913.         for(uint z=0; z != 8; z++)
  914.         {
  915.             lookup[CO] = X[z];
  916.         }
  917. /*
  918. #pragma unroll
  919.                 for(uint i=0; i<LOOKUP_GAP; i++)
  920.                         salsa(X);
  921. */
  922.         salsa(X);
  923.         salsa(X);
  924.     }
  925.  
  926.         for (uint i=0; i != 1024; i++)
  927.         {
  928.                 uint4 V[8];
  929.                 uint j = X[7].x & 0x000003ffU;
  930.                 //uint y = (j/LOOKUP_GAP);
  931.                 uint y = (j >> 1);
  932.  
  933. #pragma unroll
  934.                 //for(uint z=0; z<8; ++z)
  935.                 for(uint z=0; z != 8; ++z)
  936.         {
  937.                         V[z] = lookup[CO];
  938.         }
  939.  
  940. //                if (j&1)
  941.         if (X[7].x&1)
  942.                         salsa(V);
  943.  
  944.  
  945. #pragma unroll
  946.                 for(uint z=0; z != 8; ++z)
  947.                         X[z] ^= V[z];
  948.                 salsa(X);
  949.         }
  950. /*
  951.  
  952.     for(uint y=0; y<1024/LOOKUP_GAP; ++y)
  953.     {
  954. //#pragma unroll
  955.         for(uint z=0; z<zSIZE; ++z)
  956.             lookup[CO] = X[z];
  957.         for(uint i=0; i<LOOKUP_GAP; ++i)
  958.             salsa(X);
  959.     }
  960. #if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
  961.     {
  962.         uint y = (1024/LOOKUP_GAP);
  963. //#pragma unroll
  964.         for(uint z=0; z<zSIZE; ++z)
  965.             lookup[CO] = X[z];
  966.         for(uint i=0; i<1024%LOOKUP_GAP; ++i)
  967.             salsa(X);
  968.     }
  969. #endif
  970.     for (uint i=0; i<1024; ++i)
  971.     {
  972.         uint4 V[8];
  973.         uint j = X[7].x & 0x000003ffU;
  974.         uint y = (j/LOOKUP_GAP);
  975. //#pragma unroll
  976.         for(uint z=0; z<zSIZE; ++z)
  977.             V[z] = lookup[CO];
  978.  
  979. #if (LOOKUP_GAP == 1)
  980. #elif (LOOKUP_GAP == 2)
  981.         if (j&1)
  982.             salsa(V);
  983. #else
  984.         uint val = j%LOOKUP_GAP;
  985.         for (uint z=0; z<val; ++z)
  986.             salsa(V);
  987. #endif
  988.  
  989.  
  990. //#pragma unroll
  991.         for(uint z=0; z<zSIZE; ++z)
  992.             X[z] ^= V[z];
  993.         salsa(X);
  994.     }
  995. */
  996.     unshittify(X);
  997. }
  998.  
  /* Slot of output[] that holds the number of nonces recorded so far. */
  #define SCRYPT_FOUND (0xFF)
  /*
   * Record Xnonce in the next free slot of the array named `output` in the
   * calling scope and bump the counter kept in output[SCRYPT_FOUND].
   * NOTE(review): the increment is a plain read-modify-write, not an atomic
   * operation.
   */
  #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = (Xnonce)
  1001.  
  1002. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  1003. __kernel void search(__global const uint4 * restrict input,
  1004. volatile __global uint*restrict output, __global uint4*restrict padcache,
  1005. const uint4 midstate0, const uint4 midstate16, const uint target)
  1006. {
  1007.     uint gid = get_global_id(0);
  1008.     uint4 X[8];
  1009.     uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
  1010.     uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
  1011.     uint4 pad0 = midstate0, pad1 = midstate16;
  1012.  
  1013.     SHA256(&pad0,&pad1, data, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, 0x00000280U));
  1014.     SHA256_fresh(&ostate0,&ostate1, pad0^ 0x5c5c5c5cU, pad1^ 0x5c5c5c5cU, 0x5c5c5c5cU, 0x5c5c5c5cU);
  1015.     SHA256_fresh(&tstate0,&tstate1, pad0^ 0x36363636U, pad1^ 0x36363636U, 0x36363636U, 0x36363636U);
  1016.  
  1017.     tmp0 = tstate0;
  1018.     tmp1 = tstate1;
  1019.     SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);
  1020.  
  1021. #pragma unroll
  1022.     //for (uint i=0; i<4; i++)
  1023.     for (uint i=0; i!=4; i++)
  1024.     {
  1025.         pad0 = tstate0;
  1026.         pad1 = tstate1;
  1027.         X[i*2 ] = ostate0;
  1028.         X[i*2+1] = ostate1;
  1029.  
  1030.         SHA256(&pad0,&pad1, data, (uint4)(i+1,0x80000000U,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, 0x000004a0U));
  1031.         SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x00000300U));
  1032.     }
  1033.  
  1034.     scrypt_core(X,padcache);
  1035.     SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
  1036.     SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
  1037.     SHA256_fixed(&tmp0,&tmp1);
  1038.     SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x00000300U));
  1039.  
  1040.     bool result = (EndianSwap(ostate1.w) <= target);
  1041.     if (result)
  1042.         SETFOUND(gid);
  1043. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement