  1. /* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
  2. /*
  3. * Keccak implementation.
  4. *
  5. * ==========================(LICENSE BEGIN)============================
  6. *
  7. * Copyright (c) 2007-2010 Projet RNRT SAPHIR
  8. *
  9. * Permission is hereby granted, free of charge, to any person obtaining
  10. * a copy of this software and associated documentation files (the
  11. * "Software"), to deal in the Software without restriction, including
  12. * without limitation the rights to use, copy, modify, merge, publish,
  13. * distribute, sublicense, and/or sell copies of the Software, and to
  14. * permit persons to whom the Software is furnished to do so, subject to
  15. * the following conditions:
  16. *
  17. * The above copyright notice and this permission notice shall be
  18. * included in all copies or substantial portions of the Software.
  19. *
  20. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  24. * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  25. * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  26. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. *
  28. * ===========================(LICENSE END)=============================
  29. *
  30. * @author Thomas Pornin <thomas.pornin@cryptolog.com>
  31. */
  32.  
  33. #include <stddef.h>
  34. #include <string.h>
  35.  
  36. #include "sph_keccak.h"
  37.  
  38. #ifdef __cplusplus
  39. extern "C"{
  40. #endif
  41.  
  42. /*
  43. * Parameters:
  44. *
  45. * SPH_KECCAK_64 use a 64-bit type
  46. * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll)
  47. * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only)
  48. * SPH_KECCAK_NOCOPY do not copy the state into local variables
  49. *
  50. * If there is no usable 64-bit type, the code automatically switches
  51. * back to the 32-bit implementation.
  52. *
  53. * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
  54. * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
  55. * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
  56. * 8 kB L1 code cache), seem to show that the following are optimal:
  57. *
  58. * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
  59. * do not copy the state; unrolling 2, 6 or all rounds also provides
  60. * near-optimal performance.
  61. * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
  62. * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
  63. * also provides near-optimal performance.
  64. * -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
  65. * copy the state. Unrolling 4 or 6 rounds is near-optimal.
  66. * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
  67. * copy the state.
  68. * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
  69. * the state. Unrolling only 1 round is also near-optimal.
  70. *
  71. * Also, interleaving does not always yield actual improvements when
  72. * using a 32-bit implementation; in particular when the architecture
  73. * does not offer a native rotation opcode (interleaving replaces one
  74. * 64-bit rotation with two 32-bit rotations, which is a gain only if
  75. * there is a native 32-bit rotation opcode and not a native 64-bit
  76. * rotation opcode; also, interleaving implies a small overhead when
  77. * processing input words).
  78. *
  79. * To sum up:
  80. * -- when possible, use the 64-bit code
  81. * -- exception: on 32-bit x86, use 32-bit code
  82. * -- when using 32-bit code, use interleaving
  83. * -- copy the state, except on x86
  84. * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines
  85. */
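/*
 * Illustrative configuration sketch (an assumption, not part of the
 * original build): the tuning macros documented above are intended to
 * be pinned at compile time, e.g. on the compiler command line or in a
 * local configuration header included before this point.  The disabled
 * block below shows what forcing the compact 32-bit interleaved code
 * path could look like; leaving everything undefined keeps the
 * defaults selected further down.
 */
#if 0
#define SPH_KECCAK_64          0
#define SPH_KECCAK_INTERLEAVE  1
#define SPH_KECCAK_UNROLL      2
#define SPH_KECCAK_NOCOPY      0
#endif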
  86.  
  87. #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK
  88. #define SPH_SMALL_FOOTPRINT_KECCAK 1
  89. #endif
  90.  
  91. /*
  92. * By default, we select the 64-bit implementation if a 64-bit type
  93. * is available, unless a 32-bit x86 is detected.
  94. */
  95. #if !defined SPH_KECCAK_64 && SPH_64 \
  96. && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC)
  97. #define SPH_KECCAK_64 1
  98. #endif
  99.  
  100. /*
  101. * If using a 32-bit implementation, we prefer to interleave.
  102. */
  103. #if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE
  104. #define SPH_KECCAK_INTERLEAVE 1
  105. #endif
  106.  
  107. /*
  108. * Unroll 8 rounds on big systems, 2 rounds on small systems.
  109. */
  110. #ifndef SPH_KECCAK_UNROLL
  111. #if SPH_SMALL_FOOTPRINT_KECCAK
  112. #define SPH_KECCAK_UNROLL 2
  113. #else
  114. #define SPH_KECCAK_UNROLL 8
  115. #endif
  116. #endif
  117.  
  118. /*
  119. * We do not want to copy the state to local variables on x86 (32-bit
  120. * and 64-bit alike).
  121. */
  122. #ifndef SPH_KECCAK_NOCOPY
  123. #if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC
  124. #define SPH_KECCAK_NOCOPY 1
  125. #else
  126. #define SPH_KECCAK_NOCOPY 0
  127. #endif
  128. #endif
  129.  
  130. #ifdef _MSC_VER
  131. #pragma warning (disable: 4146)
  132. #endif
  133.  
  134. #if SPH_KECCAK_64
  135.  
  136. static const sph_u64 RC[] = {
  137. SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
  138. SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
  139. SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
  140. SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
  141. SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
  142. SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
  143. SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
  144. SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
  145. SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
  146. SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
  147. SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
  148. SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
  149. };
  150.  
  151. #if SPH_KECCAK_NOCOPY
  152.  
  153. #define a00 (kc->u.wide[ 0])
  154. #define a10 (kc->u.wide[ 1])
  155. #define a20 (kc->u.wide[ 2])
  156. #define a30 (kc->u.wide[ 3])
  157. #define a40 (kc->u.wide[ 4])
  158. #define a01 (kc->u.wide[ 5])
  159. #define a11 (kc->u.wide[ 6])
  160. #define a21 (kc->u.wide[ 7])
  161. #define a31 (kc->u.wide[ 8])
  162. #define a41 (kc->u.wide[ 9])
  163. #define a02 (kc->u.wide[10])
  164. #define a12 (kc->u.wide[11])
  165. #define a22 (kc->u.wide[12])
  166. #define a32 (kc->u.wide[13])
  167. #define a42 (kc->u.wide[14])
  168. #define a03 (kc->u.wide[15])
  169. #define a13 (kc->u.wide[16])
  170. #define a23 (kc->u.wide[17])
  171. #define a33 (kc->u.wide[18])
  172. #define a43 (kc->u.wide[19])
  173. #define a04 (kc->u.wide[20])
  174. #define a14 (kc->u.wide[21])
  175. #define a24 (kc->u.wide[22])
  176. #define a34 (kc->u.wide[23])
  177. #define a44 (kc->u.wide[24])
  178.  
  179. #define DECL_STATE
  180. #define READ_STATE(sc)
  181. #define WRITE_STATE(sc)
  182.  
  183. #define INPUT_BUF(size) do { \
  184. size_t j; \
  185. for (j = 0; j < (size); j += 8) { \
  186. kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \
  187. } \
  188. } while (0)
  189.  
  190. #define INPUT_BUF144 INPUT_BUF(144)
  191. #define INPUT_BUF136 INPUT_BUF(136)
  192. #define INPUT_BUF104 INPUT_BUF(104)
  193. #define INPUT_BUF72 INPUT_BUF(72)
  194.  
  195. #else
  196.  
  197. #define DECL_STATE \
  198. sph_u64 a00, a01, a02, a03, a04; \
  199. sph_u64 a10, a11, a12, a13, a14; \
  200. sph_u64 a20, a21, a22, a23, a24; \
  201. sph_u64 a30, a31, a32, a33, a34; \
  202. sph_u64 a40, a41, a42, a43, a44;
  203.  
  204. #define READ_STATE(state) do { \
  205. a00 = (state)->u.wide[ 0]; \
  206. a10 = (state)->u.wide[ 1]; \
  207. a20 = (state)->u.wide[ 2]; \
  208. a30 = (state)->u.wide[ 3]; \
  209. a40 = (state)->u.wide[ 4]; \
  210. a01 = (state)->u.wide[ 5]; \
  211. a11 = (state)->u.wide[ 6]; \
  212. a21 = (state)->u.wide[ 7]; \
  213. a31 = (state)->u.wide[ 8]; \
  214. a41 = (state)->u.wide[ 9]; \
  215. a02 = (state)->u.wide[10]; \
  216. a12 = (state)->u.wide[11]; \
  217. a22 = (state)->u.wide[12]; \
  218. a32 = (state)->u.wide[13]; \
  219. a42 = (state)->u.wide[14]; \
  220. a03 = (state)->u.wide[15]; \
  221. a13 = (state)->u.wide[16]; \
  222. a23 = (state)->u.wide[17]; \
  223. a33 = (state)->u.wide[18]; \
  224. a43 = (state)->u.wide[19]; \
  225. a04 = (state)->u.wide[20]; \
  226. a14 = (state)->u.wide[21]; \
  227. a24 = (state)->u.wide[22]; \
  228. a34 = (state)->u.wide[23]; \
  229. a44 = (state)->u.wide[24]; \
  230. } while (0)
  231.  
  232. #define WRITE_STATE(state) do { \
  233. (state)->u.wide[ 0] = a00; \
  234. (state)->u.wide[ 1] = a10; \
  235. (state)->u.wide[ 2] = a20; \
  236. (state)->u.wide[ 3] = a30; \
  237. (state)->u.wide[ 4] = a40; \
  238. (state)->u.wide[ 5] = a01; \
  239. (state)->u.wide[ 6] = a11; \
  240. (state)->u.wide[ 7] = a21; \
  241. (state)->u.wide[ 8] = a31; \
  242. (state)->u.wide[ 9] = a41; \
  243. (state)->u.wide[10] = a02; \
  244. (state)->u.wide[11] = a12; \
  245. (state)->u.wide[12] = a22; \
  246. (state)->u.wide[13] = a32; \
  247. (state)->u.wide[14] = a42; \
  248. (state)->u.wide[15] = a03; \
  249. (state)->u.wide[16] = a13; \
  250. (state)->u.wide[17] = a23; \
  251. (state)->u.wide[18] = a33; \
  252. (state)->u.wide[19] = a43; \
  253. (state)->u.wide[20] = a04; \
  254. (state)->u.wide[21] = a14; \
  255. (state)->u.wide[22] = a24; \
  256. (state)->u.wide[23] = a34; \
  257. (state)->u.wide[24] = a44; \
  258. } while (0)
  259.  
  260. #define INPUT_BUF144 do { \
  261. a00 ^= sph_dec64le_aligned(buf + 0); \
  262. a10 ^= sph_dec64le_aligned(buf + 8); \
  263. a20 ^= sph_dec64le_aligned(buf + 16); \
  264. a30 ^= sph_dec64le_aligned(buf + 24); \
  265. a40 ^= sph_dec64le_aligned(buf + 32); \
  266. a01 ^= sph_dec64le_aligned(buf + 40); \
  267. a11 ^= sph_dec64le_aligned(buf + 48); \
  268. a21 ^= sph_dec64le_aligned(buf + 56); \
  269. a31 ^= sph_dec64le_aligned(buf + 64); \
  270. a41 ^= sph_dec64le_aligned(buf + 72); \
  271. a02 ^= sph_dec64le_aligned(buf + 80); \
  272. a12 ^= sph_dec64le_aligned(buf + 88); \
  273. a22 ^= sph_dec64le_aligned(buf + 96); \
  274. a32 ^= sph_dec64le_aligned(buf + 104); \
  275. a42 ^= sph_dec64le_aligned(buf + 112); \
  276. a03 ^= sph_dec64le_aligned(buf + 120); \
  277. a13 ^= sph_dec64le_aligned(buf + 128); \
  278. a23 ^= sph_dec64le_aligned(buf + 136); \
  279. } while (0)
  280.  
  281. #define INPUT_BUF136 do { \
  282. a00 ^= sph_dec64le_aligned(buf + 0); \
  283. a10 ^= sph_dec64le_aligned(buf + 8); \
  284. a20 ^= sph_dec64le_aligned(buf + 16); \
  285. a30 ^= sph_dec64le_aligned(buf + 24); \
  286. a40 ^= sph_dec64le_aligned(buf + 32); \
  287. a01 ^= sph_dec64le_aligned(buf + 40); \
  288. a11 ^= sph_dec64le_aligned(buf + 48); \
  289. a21 ^= sph_dec64le_aligned(buf + 56); \
  290. a31 ^= sph_dec64le_aligned(buf + 64); \
  291. a41 ^= sph_dec64le_aligned(buf + 72); \
  292. a02 ^= sph_dec64le_aligned(buf + 80); \
  293. a12 ^= sph_dec64le_aligned(buf + 88); \
  294. a22 ^= sph_dec64le_aligned(buf + 96); \
  295. a32 ^= sph_dec64le_aligned(buf + 104); \
  296. a42 ^= sph_dec64le_aligned(buf + 112); \
  297. a03 ^= sph_dec64le_aligned(buf + 120); \
  298. a13 ^= sph_dec64le_aligned(buf + 128); \
  299. } while (0)
  300.  
  301. #define INPUT_BUF104 do { \
  302. a00 ^= sph_dec64le_aligned(buf + 0); \
  303. a10 ^= sph_dec64le_aligned(buf + 8); \
  304. a20 ^= sph_dec64le_aligned(buf + 16); \
  305. a30 ^= sph_dec64le_aligned(buf + 24); \
  306. a40 ^= sph_dec64le_aligned(buf + 32); \
  307. a01 ^= sph_dec64le_aligned(buf + 40); \
  308. a11 ^= sph_dec64le_aligned(buf + 48); \
  309. a21 ^= sph_dec64le_aligned(buf + 56); \
  310. a31 ^= sph_dec64le_aligned(buf + 64); \
  311. a41 ^= sph_dec64le_aligned(buf + 72); \
  312. a02 ^= sph_dec64le_aligned(buf + 80); \
  313. a12 ^= sph_dec64le_aligned(buf + 88); \
  314. a22 ^= sph_dec64le_aligned(buf + 96); \
  315. } while (0)
  316.  
  317. #define INPUT_BUF72 do { \
  318. a00 ^= sph_dec64le_aligned(buf + 0); \
  319. a10 ^= sph_dec64le_aligned(buf + 8); \
  320. a20 ^= sph_dec64le_aligned(buf + 16); \
  321. a30 ^= sph_dec64le_aligned(buf + 24); \
  322. a40 ^= sph_dec64le_aligned(buf + 32); \
  323. a01 ^= sph_dec64le_aligned(buf + 40); \
  324. a11 ^= sph_dec64le_aligned(buf + 48); \
  325. a21 ^= sph_dec64le_aligned(buf + 56); \
  326. a31 ^= sph_dec64le_aligned(buf + 64); \
  327. } while (0)
  328.  
  329. #define INPUT_BUF(lim) do { \
  330. a00 ^= sph_dec64le_aligned(buf + 0); \
  331. a10 ^= sph_dec64le_aligned(buf + 8); \
  332. a20 ^= sph_dec64le_aligned(buf + 16); \
  333. a30 ^= sph_dec64le_aligned(buf + 24); \
  334. a40 ^= sph_dec64le_aligned(buf + 32); \
  335. a01 ^= sph_dec64le_aligned(buf + 40); \
  336. a11 ^= sph_dec64le_aligned(buf + 48); \
  337. a21 ^= sph_dec64le_aligned(buf + 56); \
  338. a31 ^= sph_dec64le_aligned(buf + 64); \
  339. if ((lim) == 72) \
  340. break; \
  341. a41 ^= sph_dec64le_aligned(buf + 72); \
  342. a02 ^= sph_dec64le_aligned(buf + 80); \
  343. a12 ^= sph_dec64le_aligned(buf + 88); \
  344. a22 ^= sph_dec64le_aligned(buf + 96); \
  345. if ((lim) == 104) \
  346. break; \
  347. a32 ^= sph_dec64le_aligned(buf + 104); \
  348. a42 ^= sph_dec64le_aligned(buf + 112); \
  349. a03 ^= sph_dec64le_aligned(buf + 120); \
  350. a13 ^= sph_dec64le_aligned(buf + 128); \
  351. if ((lim) == 136) \
  352. break; \
  353. a23 ^= sph_dec64le_aligned(buf + 136); \
  354. } while (0)
  355.  
  356. #endif
  357.  
  358. #define DECL64(x) sph_u64 x
  359. #define MOV64(d, s) (d = s)
  360. #define XOR64(d, a, b) (d = a ^ b)
  361. #define AND64(d, a, b) (d = a & b)
  362. #define OR64(d, a, b) (d = a | b)
  363. #define NOT64(d, s) (d = SPH_T64(~s))
  364. #define ROL64(d, v, n) (d = SPH_ROTL64(v, n))
  365. #define XOR64_IOTA XOR64
  366.  
  367. #else
  368.  
  369. static const struct {
  370. sph_u32 high, low;
  371. } RC[] = {
  372. #if SPH_KECCAK_INTERLEAVE
  373. { SPH_C32(0x00000000), SPH_C32(0x00000001) },
  374. { SPH_C32(0x00000089), SPH_C32(0x00000000) },
  375. { SPH_C32(0x8000008B), SPH_C32(0x00000000) },
  376. { SPH_C32(0x80008080), SPH_C32(0x00000000) },
  377. { SPH_C32(0x0000008B), SPH_C32(0x00000001) },
  378. { SPH_C32(0x00008000), SPH_C32(0x00000001) },
  379. { SPH_C32(0x80008088), SPH_C32(0x00000001) },
  380. { SPH_C32(0x80000082), SPH_C32(0x00000001) },
  381. { SPH_C32(0x0000000B), SPH_C32(0x00000000) },
  382. { SPH_C32(0x0000000A), SPH_C32(0x00000000) },
  383. { SPH_C32(0x00008082), SPH_C32(0x00000001) },
  384. { SPH_C32(0x00008003), SPH_C32(0x00000000) },
  385. { SPH_C32(0x0000808B), SPH_C32(0x00000001) },
  386. { SPH_C32(0x8000000B), SPH_C32(0x00000001) },
  387. { SPH_C32(0x8000008A), SPH_C32(0x00000001) },
  388. { SPH_C32(0x80000081), SPH_C32(0x00000001) },
  389. { SPH_C32(0x80000081), SPH_C32(0x00000000) },
  390. { SPH_C32(0x80000008), SPH_C32(0x00000000) },
  391. { SPH_C32(0x00000083), SPH_C32(0x00000000) },
  392. { SPH_C32(0x80008003), SPH_C32(0x00000000) },
  393. { SPH_C32(0x80008088), SPH_C32(0x00000001) },
  394. { SPH_C32(0x80000088), SPH_C32(0x00000000) },
  395. { SPH_C32(0x00008000), SPH_C32(0x00000001) },
  396. { SPH_C32(0x80008082), SPH_C32(0x00000000) }
  397. #else
  398. { SPH_C32(0x00000000), SPH_C32(0x00000001) },
  399. { SPH_C32(0x00000000), SPH_C32(0x00008082) },
  400. { SPH_C32(0x80000000), SPH_C32(0x0000808A) },
  401. { SPH_C32(0x80000000), SPH_C32(0x80008000) },
  402. { SPH_C32(0x00000000), SPH_C32(0x0000808B) },
  403. { SPH_C32(0x00000000), SPH_C32(0x80000001) },
  404. { SPH_C32(0x80000000), SPH_C32(0x80008081) },
  405. { SPH_C32(0x80000000), SPH_C32(0x00008009) },
  406. { SPH_C32(0x00000000), SPH_C32(0x0000008A) },
  407. { SPH_C32(0x00000000), SPH_C32(0x00000088) },
  408. { SPH_C32(0x00000000), SPH_C32(0x80008009) },
  409. { SPH_C32(0x00000000), SPH_C32(0x8000000A) },
  410. { SPH_C32(0x00000000), SPH_C32(0x8000808B) },
  411. { SPH_C32(0x80000000), SPH_C32(0x0000008B) },
  412. { SPH_C32(0x80000000), SPH_C32(0x00008089) },
  413. { SPH_C32(0x80000000), SPH_C32(0x00008003) },
  414. { SPH_C32(0x80000000), SPH_C32(0x00008002) },
  415. { SPH_C32(0x80000000), SPH_C32(0x00000080) },
  416. { SPH_C32(0x00000000), SPH_C32(0x0000800A) },
  417. { SPH_C32(0x80000000), SPH_C32(0x8000000A) },
  418. { SPH_C32(0x80000000), SPH_C32(0x80008081) },
  419. { SPH_C32(0x80000000), SPH_C32(0x00008080) },
  420. { SPH_C32(0x00000000), SPH_C32(0x80000001) },
  421. { SPH_C32(0x80000000), SPH_C32(0x80008008) }
  422. #endif
  423. };
  424.  
  425. #if SPH_KECCAK_INTERLEAVE
  426.  
  427. #define INTERLEAVE(xl, xh) do { \
  428. sph_u32 l, h, t; \
  429. l = (xl); h = (xh); \
  430. t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
  431. t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
  432. t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
  433. t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
  434. t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
  435. t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
  436. t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
  437. t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
  438. t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
  439. l ^= t; h ^= t >> 16; \
  440. (xl) = l; (xh) = h; \
  441. } while (0)
  442.  
  443. #define UNINTERLEAVE(xl, xh) do { \
  444. sph_u32 l, h, t; \
  445. l = (xl); h = (xh); \
  446. t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
  447. l ^= t; h ^= t >> 16; \
  448. t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
  449. t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
  450. t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
  451. t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
  452. t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
  453. t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
  454. t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
  455. t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
  456. (xl) = l; (xh) = h; \
  457. } while (0)
  458.  
  459. #else
  460.  
  461. #define INTERLEAVE(l, h)
  462. #define UNINTERLEAVE(l, h)
  463.  
  464. #endif
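/*
 * Illustration only (an assumption, not used by the library): the
 * INTERLEAVE transform above gathers the even-numbered bits of a 64-bit
 * lane into one 32-bit word and the odd-numbered bits into the other,
 * so that a 64-bit rotation by an even amount becomes two independent
 * 32-bit rotations.  The plain-loop reference below computes the same
 * mapping and could be used to cross-check the macro.
 */
#if 0
static void
interleave_ref(sph_u32 *xl, sph_u32 *xh)
{
	sph_u32 l = *xl, h = *xh, el = 0, eh = 0;
	int i;

	/* bit 2*i of the lane -> bit i of the "even" word,
	   bit 2*i+1 of the lane -> bit i of the "odd" word */
	for (i = 0; i < 16; i ++) {
		el |= ((l >> (2 * i)) & 1) << i;
		eh |= ((l >> (2 * i + 1)) & 1) << i;
		el |= ((h >> (2 * i)) & 1) << (i + 16);
		eh |= ((h >> (2 * i + 1)) & 1) << (i + 16);
	}
	*xl = el;
	*xh = eh;
}
#endif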
  465.  
  466. #if SPH_KECCAK_NOCOPY
  467.  
  468. #define a00l (kc->u.narrow[2 * 0 + 0])
  469. #define a00h (kc->u.narrow[2 * 0 + 1])
  470. #define a10l (kc->u.narrow[2 * 1 + 0])
  471. #define a10h (kc->u.narrow[2 * 1 + 1])
  472. #define a20l (kc->u.narrow[2 * 2 + 0])
  473. #define a20h (kc->u.narrow[2 * 2 + 1])
  474. #define a30l (kc->u.narrow[2 * 3 + 0])
  475. #define a30h (kc->u.narrow[2 * 3 + 1])
  476. #define a40l (kc->u.narrow[2 * 4 + 0])
  477. #define a40h (kc->u.narrow[2 * 4 + 1])
  478. #define a01l (kc->u.narrow[2 * 5 + 0])
  479. #define a01h (kc->u.narrow[2 * 5 + 1])
  480. #define a11l (kc->u.narrow[2 * 6 + 0])
  481. #define a11h (kc->u.narrow[2 * 6 + 1])
  482. #define a21l (kc->u.narrow[2 * 7 + 0])
  483. #define a21h (kc->u.narrow[2 * 7 + 1])
  484. #define a31l (kc->u.narrow[2 * 8 + 0])
  485. #define a31h (kc->u.narrow[2 * 8 + 1])
  486. #define a41l (kc->u.narrow[2 * 9 + 0])
  487. #define a41h (kc->u.narrow[2 * 9 + 1])
  488. #define a02l (kc->u.narrow[2 * 10 + 0])
  489. #define a02h (kc->u.narrow[2 * 10 + 1])
  490. #define a12l (kc->u.narrow[2 * 11 + 0])
  491. #define a12h (kc->u.narrow[2 * 11 + 1])
  492. #define a22l (kc->u.narrow[2 * 12 + 0])
  493. #define a22h (kc->u.narrow[2 * 12 + 1])
  494. #define a32l (kc->u.narrow[2 * 13 + 0])
  495. #define a32h (kc->u.narrow[2 * 13 + 1])
  496. #define a42l (kc->u.narrow[2 * 14 + 0])
  497. #define a42h (kc->u.narrow[2 * 14 + 1])
  498. #define a03l (kc->u.narrow[2 * 15 + 0])
  499. #define a03h (kc->u.narrow[2 * 15 + 1])
  500. #define a13l (kc->u.narrow[2 * 16 + 0])
  501. #define a13h (kc->u.narrow[2 * 16 + 1])
  502. #define a23l (kc->u.narrow[2 * 17 + 0])
  503. #define a23h (kc->u.narrow[2 * 17 + 1])
  504. #define a33l (kc->u.narrow[2 * 18 + 0])
  505. #define a33h (kc->u.narrow[2 * 18 + 1])
  506. #define a43l (kc->u.narrow[2 * 19 + 0])
  507. #define a43h (kc->u.narrow[2 * 19 + 1])
  508. #define a04l (kc->u.narrow[2 * 20 + 0])
  509. #define a04h (kc->u.narrow[2 * 20 + 1])
  510. #define a14l (kc->u.narrow[2 * 21 + 0])
  511. #define a14h (kc->u.narrow[2 * 21 + 1])
  512. #define a24l (kc->u.narrow[2 * 22 + 0])
  513. #define a24h (kc->u.narrow[2 * 22 + 1])
  514. #define a34l (kc->u.narrow[2 * 23 + 0])
  515. #define a34h (kc->u.narrow[2 * 23 + 1])
  516. #define a44l (kc->u.narrow[2 * 24 + 0])
  517. #define a44h (kc->u.narrow[2 * 24 + 1])
  518.  
  519. #define DECL_STATE
  520. #define READ_STATE(state)
  521. #define WRITE_STATE(state)
  522.  
  523. #define INPUT_BUF(size) do { \
  524. size_t j; \
  525. for (j = 0; j < (size); j += 8) { \
  526. sph_u32 tl, th; \
  527. tl = sph_dec32le_aligned(buf + j + 0); \
  528. th = sph_dec32le_aligned(buf + j + 4); \
  529. INTERLEAVE(tl, th); \
  530. kc->u.narrow[(j >> 2) + 0] ^= tl; \
  531. kc->u.narrow[(j >> 2) + 1] ^= th; \
  532. } \
  533. } while (0)
  534.  
  535. #define INPUT_BUF144 INPUT_BUF(144)
  536. #define INPUT_BUF136 INPUT_BUF(136)
  537. #define INPUT_BUF104 INPUT_BUF(104)
  538. #define INPUT_BUF72 INPUT_BUF(72)
  539.  
  540. #else
  541.  
  542. #define DECL_STATE \
  543. sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \
  544. sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \
  545. sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \
  546. sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \
  547. sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h;
  548.  
  549. #define READ_STATE(state) do { \
  550. a00l = (state)->u.narrow[2 * 0 + 0]; \
  551. a00h = (state)->u.narrow[2 * 0 + 1]; \
  552. a10l = (state)->u.narrow[2 * 1 + 0]; \
  553. a10h = (state)->u.narrow[2 * 1 + 1]; \
  554. a20l = (state)->u.narrow[2 * 2 + 0]; \
  555. a20h = (state)->u.narrow[2 * 2 + 1]; \
  556. a30l = (state)->u.narrow[2 * 3 + 0]; \
  557. a30h = (state)->u.narrow[2 * 3 + 1]; \
  558. a40l = (state)->u.narrow[2 * 4 + 0]; \
  559. a40h = (state)->u.narrow[2 * 4 + 1]; \
  560. a01l = (state)->u.narrow[2 * 5 + 0]; \
  561. a01h = (state)->u.narrow[2 * 5 + 1]; \
  562. a11l = (state)->u.narrow[2 * 6 + 0]; \
  563. a11h = (state)->u.narrow[2 * 6 + 1]; \
  564. a21l = (state)->u.narrow[2 * 7 + 0]; \
  565. a21h = (state)->u.narrow[2 * 7 + 1]; \
  566. a31l = (state)->u.narrow[2 * 8 + 0]; \
  567. a31h = (state)->u.narrow[2 * 8 + 1]; \
  568. a41l = (state)->u.narrow[2 * 9 + 0]; \
  569. a41h = (state)->u.narrow[2 * 9 + 1]; \
  570. a02l = (state)->u.narrow[2 * 10 + 0]; \
  571. a02h = (state)->u.narrow[2 * 10 + 1]; \
  572. a12l = (state)->u.narrow[2 * 11 + 0]; \
  573. a12h = (state)->u.narrow[2 * 11 + 1]; \
  574. a22l = (state)->u.narrow[2 * 12 + 0]; \
  575. a22h = (state)->u.narrow[2 * 12 + 1]; \
  576. a32l = (state)->u.narrow[2 * 13 + 0]; \
  577. a32h = (state)->u.narrow[2 * 13 + 1]; \
  578. a42l = (state)->u.narrow[2 * 14 + 0]; \
  579. a42h = (state)->u.narrow[2 * 14 + 1]; \
  580. a03l = (state)->u.narrow[2 * 15 + 0]; \
  581. a03h = (state)->u.narrow[2 * 15 + 1]; \
  582. a13l = (state)->u.narrow[2 * 16 + 0]; \
  583. a13h = (state)->u.narrow[2 * 16 + 1]; \
  584. a23l = (state)->u.narrow[2 * 17 + 0]; \
  585. a23h = (state)->u.narrow[2 * 17 + 1]; \
  586. a33l = (state)->u.narrow[2 * 18 + 0]; \
  587. a33h = (state)->u.narrow[2 * 18 + 1]; \
  588. a43l = (state)->u.narrow[2 * 19 + 0]; \
  589. a43h = (state)->u.narrow[2 * 19 + 1]; \
  590. a04l = (state)->u.narrow[2 * 20 + 0]; \
  591. a04h = (state)->u.narrow[2 * 20 + 1]; \
  592. a14l = (state)->u.narrow[2 * 21 + 0]; \
  593. a14h = (state)->u.narrow[2 * 21 + 1]; \
  594. a24l = (state)->u.narrow[2 * 22 + 0]; \
  595. a24h = (state)->u.narrow[2 * 22 + 1]; \
  596. a34l = (state)->u.narrow[2 * 23 + 0]; \
  597. a34h = (state)->u.narrow[2 * 23 + 1]; \
  598. a44l = (state)->u.narrow[2 * 24 + 0]; \
  599. a44h = (state)->u.narrow[2 * 24 + 1]; \
  600. } while (0)
  601.  
  602. #define WRITE_STATE(state) do { \
  603. (state)->u.narrow[2 * 0 + 0] = a00l; \
  604. (state)->u.narrow[2 * 0 + 1] = a00h; \
  605. (state)->u.narrow[2 * 1 + 0] = a10l; \
  606. (state)->u.narrow[2 * 1 + 1] = a10h; \
  607. (state)->u.narrow[2 * 2 + 0] = a20l; \
  608. (state)->u.narrow[2 * 2 + 1] = a20h; \
  609. (state)->u.narrow[2 * 3 + 0] = a30l; \
  610. (state)->u.narrow[2 * 3 + 1] = a30h; \
  611. (state)->u.narrow[2 * 4 + 0] = a40l; \
  612. (state)->u.narrow[2 * 4 + 1] = a40h; \
  613. (state)->u.narrow[2 * 5 + 0] = a01l; \
  614. (state)->u.narrow[2 * 5 + 1] = a01h; \
  615. (state)->u.narrow[2 * 6 + 0] = a11l; \
  616. (state)->u.narrow[2 * 6 + 1] = a11h; \
  617. (state)->u.narrow[2 * 7 + 0] = a21l; \
  618. (state)->u.narrow[2 * 7 + 1] = a21h; \
  619. (state)->u.narrow[2 * 8 + 0] = a31l; \
  620. (state)->u.narrow[2 * 8 + 1] = a31h; \
  621. (state)->u.narrow[2 * 9 + 0] = a41l; \
  622. (state)->u.narrow[2 * 9 + 1] = a41h; \
  623. (state)->u.narrow[2 * 10 + 0] = a02l; \
  624. (state)->u.narrow[2 * 10 + 1] = a02h; \
  625. (state)->u.narrow[2 * 11 + 0] = a12l; \
  626. (state)->u.narrow[2 * 11 + 1] = a12h; \
  627. (state)->u.narrow[2 * 12 + 0] = a22l; \
  628. (state)->u.narrow[2 * 12 + 1] = a22h; \
  629. (state)->u.narrow[2 * 13 + 0] = a32l; \
  630. (state)->u.narrow[2 * 13 + 1] = a32h; \
  631. (state)->u.narrow[2 * 14 + 0] = a42l; \
  632. (state)->u.narrow[2 * 14 + 1] = a42h; \
  633. (state)->u.narrow[2 * 15 + 0] = a03l; \
  634. (state)->u.narrow[2 * 15 + 1] = a03h; \
  635. (state)->u.narrow[2 * 16 + 0] = a13l; \
  636. (state)->u.narrow[2 * 16 + 1] = a13h; \
  637. (state)->u.narrow[2 * 17 + 0] = a23l; \
  638. (state)->u.narrow[2 * 17 + 1] = a23h; \
  639. (state)->u.narrow[2 * 18 + 0] = a33l; \
  640. (state)->u.narrow[2 * 18 + 1] = a33h; \
  641. (state)->u.narrow[2 * 19 + 0] = a43l; \
  642. (state)->u.narrow[2 * 19 + 1] = a43h; \
  643. (state)->u.narrow[2 * 20 + 0] = a04l; \
  644. (state)->u.narrow[2 * 20 + 1] = a04h; \
  645. (state)->u.narrow[2 * 21 + 0] = a14l; \
  646. (state)->u.narrow[2 * 21 + 1] = a14h; \
  647. (state)->u.narrow[2 * 22 + 0] = a24l; \
  648. (state)->u.narrow[2 * 22 + 1] = a24h; \
  649. (state)->u.narrow[2 * 23 + 0] = a34l; \
  650. (state)->u.narrow[2 * 23 + 1] = a34h; \
  651. (state)->u.narrow[2 * 24 + 0] = a44l; \
  652. (state)->u.narrow[2 * 24 + 1] = a44h; \
  653. } while (0)
  654.  
  655. #define READ64(d, off) do { \
  656. sph_u32 tl, th; \
  657. tl = sph_dec32le_aligned(buf + (off)); \
  658. th = sph_dec32le_aligned(buf + (off) + 4); \
  659. INTERLEAVE(tl, th); \
  660. d ## l ^= tl; \
  661. d ## h ^= th; \
  662. } while (0)
  663.  
  664. #define INPUT_BUF144 do { \
  665. READ64(a00, 0); \
  666. READ64(a10, 8); \
  667. READ64(a20, 16); \
  668. READ64(a30, 24); \
  669. READ64(a40, 32); \
  670. READ64(a01, 40); \
  671. READ64(a11, 48); \
  672. READ64(a21, 56); \
  673. READ64(a31, 64); \
  674. READ64(a41, 72); \
  675. READ64(a02, 80); \
  676. READ64(a12, 88); \
  677. READ64(a22, 96); \
  678. READ64(a32, 104); \
  679. READ64(a42, 112); \
  680. READ64(a03, 120); \
  681. READ64(a13, 128); \
  682. READ64(a23, 136); \
  683. } while (0)
  684.  
  685. #define INPUT_BUF136 do { \
  686. READ64(a00, 0); \
  687. READ64(a10, 8); \
  688. READ64(a20, 16); \
  689. READ64(a30, 24); \
  690. READ64(a40, 32); \
  691. READ64(a01, 40); \
  692. READ64(a11, 48); \
  693. READ64(a21, 56); \
  694. READ64(a31, 64); \
  695. READ64(a41, 72); \
  696. READ64(a02, 80); \
  697. READ64(a12, 88); \
  698. READ64(a22, 96); \
  699. READ64(a32, 104); \
  700. READ64(a42, 112); \
  701. READ64(a03, 120); \
  702. READ64(a13, 128); \
  703. } while (0)
  704.  
  705. #define INPUT_BUF104 do { \
  706. READ64(a00, 0); \
  707. READ64(a10, 8); \
  708. READ64(a20, 16); \
  709. READ64(a30, 24); \
  710. READ64(a40, 32); \
  711. READ64(a01, 40); \
  712. READ64(a11, 48); \
  713. READ64(a21, 56); \
  714. READ64(a31, 64); \
  715. READ64(a41, 72); \
  716. READ64(a02, 80); \
  717. READ64(a12, 88); \
  718. READ64(a22, 96); \
  719. } while (0)
  720.  
  721. #define INPUT_BUF72 do { \
  722. READ64(a00, 0); \
  723. READ64(a10, 8); \
  724. READ64(a20, 16); \
  725. READ64(a30, 24); \
  726. READ64(a40, 32); \
  727. READ64(a01, 40); \
  728. READ64(a11, 48); \
  729. READ64(a21, 56); \
  730. READ64(a31, 64); \
  731. } while (0)
  732.  
  733. #define INPUT_BUF(lim) do { \
  734. READ64(a00, 0); \
  735. READ64(a10, 8); \
  736. READ64(a20, 16); \
  737. READ64(a30, 24); \
  738. READ64(a40, 32); \
  739. READ64(a01, 40); \
  740. READ64(a11, 48); \
  741. READ64(a21, 56); \
  742. READ64(a31, 64); \
  743. if ((lim) == 72) \
  744. break; \
  745. READ64(a41, 72); \
  746. READ64(a02, 80); \
  747. READ64(a12, 88); \
  748. READ64(a22, 96); \
  749. if ((lim) == 104) \
  750. break; \
  751. READ64(a32, 104); \
  752. READ64(a42, 112); \
  753. READ64(a03, 120); \
  754. READ64(a13, 128); \
  755. if ((lim) == 136) \
  756. break; \
  757. READ64(a23, 136); \
  758. } while (0)
  759.  
  760. #endif
  761.  
  762. #define DECL64(x) sph_u64 x ## l, x ## h
  763. #define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h)
  764. #define XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h)
  765. #define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h)
  766. #define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h)
  767. #define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h))
  768. #define ROL64(d, v, n) ROL64_ ## n(d, v)
  769.  
  770. #if SPH_KECCAK_INTERLEAVE
  771.  
  772. #define ROL64_odd1(d, v) do { \
  773. sph_u32 tmp; \
  774. tmp = v ## l; \
  775. d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \
  776. d ## h = tmp; \
  777. } while (0)
  778.  
  779. #define ROL64_odd63(d, v) do { \
  780. sph_u32 tmp; \
  781. tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \
  782. d ## l = v ## h; \
  783. d ## h = tmp; \
  784. } while (0)
  785.  
  786. #define ROL64_odd(d, v, n) do { \
  787. sph_u32 tmp; \
  788. tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \
  789. d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \
  790. d ## h = tmp; \
  791. } while (0)
  792.  
  793. #define ROL64_even(d, v, n) do { \
  794. d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \
  795. d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \
  796. } while (0)
  797.  
  798. #define ROL64_0(d, v)
  799. #define ROL64_1(d, v) ROL64_odd1(d, v)
  800. #define ROL64_2(d, v) ROL64_even(d, v, 1)
  801. #define ROL64_3(d, v) ROL64_odd( d, v, 2)
  802. #define ROL64_4(d, v) ROL64_even(d, v, 2)
  803. #define ROL64_5(d, v) ROL64_odd( d, v, 3)
  804. #define ROL64_6(d, v) ROL64_even(d, v, 3)
  805. #define ROL64_7(d, v) ROL64_odd( d, v, 4)
  806. #define ROL64_8(d, v) ROL64_even(d, v, 4)
  807. #define ROL64_9(d, v) ROL64_odd( d, v, 5)
  808. #define ROL64_10(d, v) ROL64_even(d, v, 5)
  809. #define ROL64_11(d, v) ROL64_odd( d, v, 6)
  810. #define ROL64_12(d, v) ROL64_even(d, v, 6)
  811. #define ROL64_13(d, v) ROL64_odd( d, v, 7)
  812. #define ROL64_14(d, v) ROL64_even(d, v, 7)
  813. #define ROL64_15(d, v) ROL64_odd( d, v, 8)
  814. #define ROL64_16(d, v) ROL64_even(d, v, 8)
  815. #define ROL64_17(d, v) ROL64_odd( d, v, 9)
  816. #define ROL64_18(d, v) ROL64_even(d, v, 9)
  817. #define ROL64_19(d, v) ROL64_odd( d, v, 10)
  818. #define ROL64_20(d, v) ROL64_even(d, v, 10)
  819. #define ROL64_21(d, v) ROL64_odd( d, v, 11)
  820. #define ROL64_22(d, v) ROL64_even(d, v, 11)
  821. #define ROL64_23(d, v) ROL64_odd( d, v, 12)
  822. #define ROL64_24(d, v) ROL64_even(d, v, 12)
  823. #define ROL64_25(d, v) ROL64_odd( d, v, 13)
  824. #define ROL64_26(d, v) ROL64_even(d, v, 13)
  825. #define ROL64_27(d, v) ROL64_odd( d, v, 14)
  826. #define ROL64_28(d, v) ROL64_even(d, v, 14)
  827. #define ROL64_29(d, v) ROL64_odd( d, v, 15)
  828. #define ROL64_30(d, v) ROL64_even(d, v, 15)
  829. #define ROL64_31(d, v) ROL64_odd( d, v, 16)
  830. #define ROL64_32(d, v) ROL64_even(d, v, 16)
  831. #define ROL64_33(d, v) ROL64_odd( d, v, 17)
  832. #define ROL64_34(d, v) ROL64_even(d, v, 17)
  833. #define ROL64_35(d, v) ROL64_odd( d, v, 18)
  834. #define ROL64_36(d, v) ROL64_even(d, v, 18)
  835. #define ROL64_37(d, v) ROL64_odd( d, v, 19)
  836. #define ROL64_38(d, v) ROL64_even(d, v, 19)
  837. #define ROL64_39(d, v) ROL64_odd( d, v, 20)
  838. #define ROL64_40(d, v) ROL64_even(d, v, 20)
  839. #define ROL64_41(d, v) ROL64_odd( d, v, 21)
  840. #define ROL64_42(d, v) ROL64_even(d, v, 21)
  841. #define ROL64_43(d, v) ROL64_odd( d, v, 22)
  842. #define ROL64_44(d, v) ROL64_even(d, v, 22)
  843. #define ROL64_45(d, v) ROL64_odd( d, v, 23)
  844. #define ROL64_46(d, v) ROL64_even(d, v, 23)
  845. #define ROL64_47(d, v) ROL64_odd( d, v, 24)
  846. #define ROL64_48(d, v) ROL64_even(d, v, 24)
  847. #define ROL64_49(d, v) ROL64_odd( d, v, 25)
  848. #define ROL64_50(d, v) ROL64_even(d, v, 25)
  849. #define ROL64_51(d, v) ROL64_odd( d, v, 26)
  850. #define ROL64_52(d, v) ROL64_even(d, v, 26)
  851. #define ROL64_53(d, v) ROL64_odd( d, v, 27)
  852. #define ROL64_54(d, v) ROL64_even(d, v, 27)
  853. #define ROL64_55(d, v) ROL64_odd( d, v, 28)
  854. #define ROL64_56(d, v) ROL64_even(d, v, 28)
  855. #define ROL64_57(d, v) ROL64_odd( d, v, 29)
  856. #define ROL64_58(d, v) ROL64_even(d, v, 29)
  857. #define ROL64_59(d, v) ROL64_odd( d, v, 30)
  858. #define ROL64_60(d, v) ROL64_even(d, v, 30)
  859. #define ROL64_61(d, v) ROL64_odd( d, v, 31)
  860. #define ROL64_62(d, v) ROL64_even(d, v, 31)
  861. #define ROL64_63(d, v) ROL64_odd63(d, v)
  862.  
  863. #else
  864.  
  865. #define ROL64_small(d, v, n) do { \
  866. sph_u32 tmp; \
  867. tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \
  868. d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \
  869. d ## l = tmp; \
  870. } while (0)
  871.  
  872. #define ROL64_0(d, v) 0
  873. #define ROL64_1(d, v) ROL64_small(d, v, 1)
  874. #define ROL64_2(d, v) ROL64_small(d, v, 2)
  875. #define ROL64_3(d, v) ROL64_small(d, v, 3)
  876. #define ROL64_4(d, v) ROL64_small(d, v, 4)
  877. #define ROL64_5(d, v) ROL64_small(d, v, 5)
  878. #define ROL64_6(d, v) ROL64_small(d, v, 6)
  879. #define ROL64_7(d, v) ROL64_small(d, v, 7)
  880. #define ROL64_8(d, v) ROL64_small(d, v, 8)
  881. #define ROL64_9(d, v) ROL64_small(d, v, 9)
  882. #define ROL64_10(d, v) ROL64_small(d, v, 10)
  883. #define ROL64_11(d, v) ROL64_small(d, v, 11)
  884. #define ROL64_12(d, v) ROL64_small(d, v, 12)
  885. #define ROL64_13(d, v) ROL64_small(d, v, 13)
  886. #define ROL64_14(d, v) ROL64_small(d, v, 14)
  887. #define ROL64_15(d, v) ROL64_small(d, v, 15)
  888. #define ROL64_16(d, v) ROL64_small(d, v, 16)
  889. #define ROL64_17(d, v) ROL64_small(d, v, 17)
  890. #define ROL64_18(d, v) ROL64_small(d, v, 18)
  891. #define ROL64_19(d, v) ROL64_small(d, v, 19)
  892. #define ROL64_20(d, v) ROL64_small(d, v, 20)
  893. #define ROL64_21(d, v) ROL64_small(d, v, 21)
  894. #define ROL64_22(d, v) ROL64_small(d, v, 22)
  895. #define ROL64_23(d, v) ROL64_small(d, v, 23)
  896. #define ROL64_24(d, v) ROL64_small(d, v, 24)
  897. #define ROL64_25(d, v) ROL64_small(d, v, 25)
  898. #define ROL64_26(d, v) ROL64_small(d, v, 26)
  899. #define ROL64_27(d, v) ROL64_small(d, v, 27)
  900. #define ROL64_28(d, v) ROL64_small(d, v, 28)
  901. #define ROL64_29(d, v) ROL64_small(d, v, 29)
  902. #define ROL64_30(d, v) ROL64_small(d, v, 30)
  903. #define ROL64_31(d, v) ROL64_small(d, v, 31)
  904.  
  905. #define ROL64_32(d, v) do { \
  906. sph_u32 tmp; \
  907. tmp = v ## l; \
  908. d ## l = v ## h; \
  909. d ## h = tmp; \
  910. } while (0)
  911.  
  912. #define ROL64_big(d, v, n) do { \
  913. sph_u32 trl, trh; \
  914. ROL64_small(tr, v, n); \
  915. d ## h = trl; \
  916. d ## l = trh; \
  917. } while (0)
  918.  
  919. #define ROL64_33(d, v) ROL64_big(d, v, 1)
  920. #define ROL64_34(d, v) ROL64_big(d, v, 2)
  921. #define ROL64_35(d, v) ROL64_big(d, v, 3)
  922. #define ROL64_36(d, v) ROL64_big(d, v, 4)
  923. #define ROL64_37(d, v) ROL64_big(d, v, 5)
  924. #define ROL64_38(d, v) ROL64_big(d, v, 6)
  925. #define ROL64_39(d, v) ROL64_big(d, v, 7)
  926. #define ROL64_40(d, v) ROL64_big(d, v, 8)
  927. #define ROL64_41(d, v) ROL64_big(d, v, 9)
  928. #define ROL64_42(d, v) ROL64_big(d, v, 10)
  929. #define ROL64_43(d, v) ROL64_big(d, v, 11)
  930. #define ROL64_44(d, v) ROL64_big(d, v, 12)
  931. #define ROL64_45(d, v) ROL64_big(d, v, 13)
  932. #define ROL64_46(d, v) ROL64_big(d, v, 14)
  933. #define ROL64_47(d, v) ROL64_big(d, v, 15)
  934. #define ROL64_48(d, v) ROL64_big(d, v, 16)
  935. #define ROL64_49(d, v) ROL64_big(d, v, 17)
  936. #define ROL64_50(d, v) ROL64_big(d, v, 18)
  937. #define ROL64_51(d, v) ROL64_big(d, v, 19)
  938. #define ROL64_52(d, v) ROL64_big(d, v, 20)
  939. #define ROL64_53(d, v) ROL64_big(d, v, 21)
  940. #define ROL64_54(d, v) ROL64_big(d, v, 22)
  941. #define ROL64_55(d, v) ROL64_big(d, v, 23)
  942. #define ROL64_56(d, v) ROL64_big(d, v, 24)
  943. #define ROL64_57(d, v) ROL64_big(d, v, 25)
  944. #define ROL64_58(d, v) ROL64_big(d, v, 26)
  945. #define ROL64_59(d, v) ROL64_big(d, v, 27)
  946. #define ROL64_60(d, v) ROL64_big(d, v, 28)
  947. #define ROL64_61(d, v) ROL64_big(d, v, 29)
  948. #define ROL64_62(d, v) ROL64_big(d, v, 30)
  949. #define ROL64_63(d, v) ROL64_big(d, v, 31)
  950.  
  951. #endif
  952.  
  953. #define XOR64_IOTA(d, s, k) \
  954. (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high)
  955.  
  956. #endif
  957.  
  958. #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
  959. DECL64(tt0); \
  960. DECL64(tt1); \
  961. DECL64(tt2); \
  962. DECL64(tt3); \
  963. XOR64(tt0, d0, d1); \
  964. XOR64(tt1, d2, d3); \
  965. XOR64(tt0, tt0, d4); \
  966. XOR64(tt0, tt0, tt1); \
  967. ROL64(tt0, tt0, 1); \
  968. XOR64(tt2, c0, c1); \
  969. XOR64(tt3, c2, c3); \
  970. XOR64(tt0, tt0, c4); \
  971. XOR64(tt2, tt2, tt3); \
  972. XOR64(t, tt0, tt2); \
  973. } while (0)
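/*
 * (Explanatory note.)  TH_ELT computes one "theta" D word: the XOR of
 * the five lanes of one column (c0..c4) combined with the XOR of the
 * five lanes of another column (d0..d4) rotated left by one bit.  In
 * the usual Keccak notation, the uses below yield D[x] = C[x-1] XOR
 * ROTL64(C[x+1], 1), which THETA then XORs into every lane of column x.
 */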
  974.  
  975. #define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
  976. b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
  977. b40, b41, b42, b43, b44) \
  978. do { \
  979. DECL64(t0); \
  980. DECL64(t1); \
  981. DECL64(t2); \
  982. DECL64(t3); \
  983. DECL64(t4); \
  984. TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
  985. TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
  986. TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
  987. TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
  988. TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
  989. XOR64(b00, b00, t0); \
  990. XOR64(b01, b01, t0); \
  991. XOR64(b02, b02, t0); \
  992. XOR64(b03, b03, t0); \
  993. XOR64(b04, b04, t0); \
  994. XOR64(b10, b10, t1); \
  995. XOR64(b11, b11, t1); \
  996. XOR64(b12, b12, t1); \
  997. XOR64(b13, b13, t1); \
  998. XOR64(b14, b14, t1); \
  999. XOR64(b20, b20, t2); \
  1000. XOR64(b21, b21, t2); \
  1001. XOR64(b22, b22, t2); \
  1002. XOR64(b23, b23, t2); \
  1003. XOR64(b24, b24, t2); \
  1004. XOR64(b30, b30, t3); \
  1005. XOR64(b31, b31, t3); \
  1006. XOR64(b32, b32, t3); \
  1007. XOR64(b33, b33, t3); \
  1008. XOR64(b34, b34, t3); \
  1009. XOR64(b40, b40, t4); \
  1010. XOR64(b41, b41, t4); \
  1011. XOR64(b42, b42, t4); \
  1012. XOR64(b43, b43, t4); \
  1013. XOR64(b44, b44, t4); \
  1014. } while (0)
  1015.  
  1016. #define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
  1017. b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
  1018. b40, b41, b42, b43, b44) \
  1019. do { \
  1020. /* ROL64(b00, b00, 0); */ \
  1021. ROL64(b01, b01, 36); \
  1022. ROL64(b02, b02, 3); \
  1023. ROL64(b03, b03, 41); \
  1024. ROL64(b04, b04, 18); \
  1025. ROL64(b10, b10, 1); \
  1026. ROL64(b11, b11, 44); \
  1027. ROL64(b12, b12, 10); \
  1028. ROL64(b13, b13, 45); \
  1029. ROL64(b14, b14, 2); \
  1030. ROL64(b20, b20, 62); \
  1031. ROL64(b21, b21, 6); \
  1032. ROL64(b22, b22, 43); \
  1033. ROL64(b23, b23, 15); \
  1034. ROL64(b24, b24, 61); \
  1035. ROL64(b30, b30, 28); \
  1036. ROL64(b31, b31, 55); \
  1037. ROL64(b32, b32, 25); \
  1038. ROL64(b33, b33, 21); \
  1039. ROL64(b34, b34, 56); \
  1040. ROL64(b40, b40, 27); \
  1041. ROL64(b41, b41, 20); \
  1042. ROL64(b42, b42, 39); \
  1043. ROL64(b43, b43, 8); \
  1044. ROL64(b44, b44, 14); \
  1045. } while (0)
  1046.  
  1047. /*
  1048. * The KHI macro integrates the "lane complement" optimization. On input,
  1049. * some words are complemented:
  1050. * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
  1051. * On output, the following words are complemented:
  1052. * a04 a10 a20 a22 a23 a31
  1053. *
  1054. * The (implicit) permutation and the theta expansion will bring back
  1055. * the input mask for the next round.
  1056. */
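/*
 * Algebraic sketch of why this works (illustration only): the chi step
 * is d = a ^ (~b & c).  If b is held complemented (bn = ~b), the same
 * value is obtained with a plain AND, d = a ^ (bn & c) -- the KHI_XA
 * pattern.  If instead a and c are held complemented (an = ~a,
 * cn = ~c), De Morgan gives ~b & c = ~(b | cn), hence d = an ^ (b | cn)
 * -- the KHI_XO pattern -- with no NOT at all.  The disabled helper
 * below checks both identities.
 */
#if 0
static int
khi_identity_check(sph_u64 a, sph_u64 b, sph_u64 c)
{
	sph_u64 d = a ^ (~b & c);

	return d == (a ^ ((~b) & c))            /* KHI_XA with bn = ~b */
		&& d == ((~a) ^ (b | (~c)));    /* KHI_XO with an, cn */
}
#endif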
  1057.  
  1058. #define KHI_XO(d, a, b, c) do { \
  1059. DECL64(kt); \
  1060. OR64(kt, b, c); \
  1061. XOR64(d, a, kt); \
  1062. } while (0)
  1063.  
  1064. #define KHI_XA(d, a, b, c) do { \
  1065. DECL64(kt); \
  1066. AND64(kt, b, c); \
  1067. XOR64(d, a, kt); \
  1068. } while (0)
  1069.  
  1070. #define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
  1071. b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
  1072. b40, b41, b42, b43, b44) \
  1073. do { \
  1074. DECL64(c0); \
  1075. DECL64(c1); \
  1076. DECL64(c2); \
  1077. DECL64(c3); \
  1078. DECL64(c4); \
  1079. DECL64(bnn); \
  1080. NOT64(bnn, b20); \
  1081. KHI_XO(c0, b00, b10, b20); \
  1082. KHI_XO(c1, b10, bnn, b30); \
  1083. KHI_XA(c2, b20, b30, b40); \
  1084. KHI_XO(c3, b30, b40, b00); \
  1085. KHI_XA(c4, b40, b00, b10); \
  1086. MOV64(b00, c0); \
  1087. MOV64(b10, c1); \
  1088. MOV64(b20, c2); \
  1089. MOV64(b30, c3); \
  1090. MOV64(b40, c4); \
  1091. NOT64(bnn, b41); \
  1092. KHI_XO(c0, b01, b11, b21); \
  1093. KHI_XA(c1, b11, b21, b31); \
  1094. KHI_XO(c2, b21, b31, bnn); \
  1095. KHI_XO(c3, b31, b41, b01); \
  1096. KHI_XA(c4, b41, b01, b11); \
  1097. MOV64(b01, c0); \
  1098. MOV64(b11, c1); \
  1099. MOV64(b21, c2); \
  1100. MOV64(b31, c3); \
  1101. MOV64(b41, c4); \
  1102. NOT64(bnn, b32); \
  1103. KHI_XO(c0, b02, b12, b22); \
  1104. KHI_XA(c1, b12, b22, b32); \
  1105. KHI_XA(c2, b22, bnn, b42); \
  1106. KHI_XO(c3, bnn, b42, b02); \
  1107. KHI_XA(c4, b42, b02, b12); \
  1108. MOV64(b02, c0); \
  1109. MOV64(b12, c1); \
  1110. MOV64(b22, c2); \
  1111. MOV64(b32, c3); \
  1112. MOV64(b42, c4); \
  1113. NOT64(bnn, b33); \
  1114. KHI_XA(c0, b03, b13, b23); \
  1115. KHI_XO(c1, b13, b23, b33); \
  1116. KHI_XO(c2, b23, bnn, b43); \
  1117. KHI_XA(c3, bnn, b43, b03); \
  1118. KHI_XO(c4, b43, b03, b13); \
  1119. MOV64(b03, c0); \
  1120. MOV64(b13, c1); \
  1121. MOV64(b23, c2); \
  1122. MOV64(b33, c3); \
  1123. MOV64(b43, c4); \
  1124. NOT64(bnn, b14); \
  1125. KHI_XA(c0, b04, bnn, b24); \
  1126. KHI_XO(c1, bnn, b24, b34); \
  1127. KHI_XA(c2, b24, b34, b44); \
  1128. KHI_XO(c3, b34, b44, b04); \
  1129. KHI_XA(c4, b44, b04, b14); \
  1130. MOV64(b04, c0); \
  1131. MOV64(b14, c1); \
  1132. MOV64(b24, c2); \
  1133. MOV64(b34, c3); \
  1134. MOV64(b44, c4); \
  1135. } while (0)
  1136.  
  1137. #define IOTA(r) XOR64_IOTA(a00, a00, r)
  1138.  
  1139. #define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
  1140. a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
  1141. #define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
  1142. a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
  1143. #define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
  1144. a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
  1145. #define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
  1146. a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
  1147. #define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
  1148. a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
  1149. #define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
  1150. a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
  1151. #define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
  1152. a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
  1153. #define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
  1154. a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
  1155. #define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
  1156. a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
  1157. #define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
  1158. a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
  1159. #define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
  1160. a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
  1161. #define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
  1162. a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
  1163. #define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
  1164. a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
  1165. #define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
  1166. a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
  1167. #define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
  1168. a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
  1169. #define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
  1170. a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
  1171. #define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
  1172. a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
  1173. #define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
  1174. a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
  1175. #define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
  1176. a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
  1177. #define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
  1178. a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
  1179. #define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
  1180. a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
  1181. #define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
  1182. a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
  1183. #define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
  1184. a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
  1185. #define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
  1186. a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
  1187.  
  1188. #define P1_TO_P0 do { \
  1189. DECL64(t); \
  1190. MOV64(t, a01); \
  1191. MOV64(a01, a30); \
  1192. MOV64(a30, a33); \
  1193. MOV64(a33, a23); \
  1194. MOV64(a23, a12); \
  1195. MOV64(a12, a21); \
  1196. MOV64(a21, a02); \
  1197. MOV64(a02, a10); \
  1198. MOV64(a10, a11); \
  1199. MOV64(a11, a41); \
  1200. MOV64(a41, a24); \
  1201. MOV64(a24, a42); \
  1202. MOV64(a42, a04); \
  1203. MOV64(a04, a20); \
  1204. MOV64(a20, a22); \
  1205. MOV64(a22, a32); \
  1206. MOV64(a32, a43); \
  1207. MOV64(a43, a34); \
  1208. MOV64(a34, a03); \
  1209. MOV64(a03, a40); \
  1210. MOV64(a40, a44); \
  1211. MOV64(a44, a14); \
  1212. MOV64(a14, a31); \
  1213. MOV64(a31, a13); \
  1214. MOV64(a13, t); \
  1215. } while (0)
  1216.  
  1217. #define P2_TO_P0 do { \
  1218. DECL64(t); \
  1219. MOV64(t, a01); \
  1220. MOV64(a01, a33); \
  1221. MOV64(a33, a12); \
  1222. MOV64(a12, a02); \
  1223. MOV64(a02, a11); \
  1224. MOV64(a11, a24); \
  1225. MOV64(a24, a04); \
  1226. MOV64(a04, a22); \
  1227. MOV64(a22, a43); \
  1228. MOV64(a43, a03); \
  1229. MOV64(a03, a44); \
  1230. MOV64(a44, a31); \
  1231. MOV64(a31, t); \
  1232. MOV64(t, a10); \
  1233. MOV64(a10, a41); \
  1234. MOV64(a41, a42); \
  1235. MOV64(a42, a20); \
  1236. MOV64(a20, a32); \
  1237. MOV64(a32, a34); \
  1238. MOV64(a34, a40); \
  1239. MOV64(a40, a14); \
  1240. MOV64(a14, a13); \
  1241. MOV64(a13, a30); \
  1242. MOV64(a30, a23); \
  1243. MOV64(a23, a21); \
  1244. MOV64(a21, t); \
  1245. } while (0)
  1246.  
  1247. #define P4_TO_P0 do { \
  1248. DECL64(t); \
  1249. MOV64(t, a01); \
  1250. MOV64(a01, a12); \
  1251. MOV64(a12, a11); \
  1252. MOV64(a11, a04); \
  1253. MOV64(a04, a43); \
  1254. MOV64(a43, a44); \
  1255. MOV64(a44, t); \
  1256. MOV64(t, a02); \
  1257. MOV64(a02, a24); \
  1258. MOV64(a24, a22); \
  1259. MOV64(a22, a03); \
  1260. MOV64(a03, a31); \
  1261. MOV64(a31, a33); \
  1262. MOV64(a33, t); \
  1263. MOV64(t, a10); \
  1264. MOV64(a10, a42); \
  1265. MOV64(a42, a32); \
  1266. MOV64(a32, a40); \
  1267. MOV64(a40, a13); \
  1268. MOV64(a13, a23); \
  1269. MOV64(a23, t); \
  1270. MOV64(t, a14); \
  1271. MOV64(a14, a30); \
  1272. MOV64(a30, a21); \
  1273. MOV64(a21, a41); \
  1274. MOV64(a41, a20); \
  1275. MOV64(a20, a34); \
  1276. MOV64(a34, t); \
  1277. } while (0)
  1278.  
  1279. #define P6_TO_P0 do { \
  1280. DECL64(t); \
  1281. MOV64(t, a01); \
  1282. MOV64(a01, a02); \
  1283. MOV64(a02, a04); \
  1284. MOV64(a04, a03); \
  1285. MOV64(a03, t); \
  1286. MOV64(t, a10); \
  1287. MOV64(a10, a20); \
  1288. MOV64(a20, a40); \
  1289. MOV64(a40, a30); \
  1290. MOV64(a30, t); \
  1291. MOV64(t, a11); \
  1292. MOV64(a11, a22); \
  1293. MOV64(a22, a44); \
  1294. MOV64(a44, a33); \
  1295. MOV64(a33, t); \
  1296. MOV64(t, a12); \
  1297. MOV64(a12, a24); \
  1298. MOV64(a24, a43); \
  1299. MOV64(a43, a31); \
  1300. MOV64(a31, t); \
  1301. MOV64(t, a13); \
  1302. MOV64(a13, a21); \
  1303. MOV64(a21, a42); \
  1304. MOV64(a42, a34); \
  1305. MOV64(a34, t); \
  1306. MOV64(t, a14); \
  1307. MOV64(a14, a23); \
  1308. MOV64(a23, a41); \
  1309. MOV64(a41, a32); \
  1310. MOV64(a32, t); \
  1311. } while (0)
  1312.  
  1313. #define P8_TO_P0 do { \
  1314. DECL64(t); \
  1315. MOV64(t, a01); \
  1316. MOV64(a01, a11); \
  1317. MOV64(a11, a43); \
  1318. MOV64(a43, t); \
  1319. MOV64(t, a02); \
  1320. MOV64(a02, a22); \
  1321. MOV64(a22, a31); \
  1322. MOV64(a31, t); \
  1323. MOV64(t, a03); \
  1324. MOV64(a03, a33); \
  1325. MOV64(a33, a24); \
  1326. MOV64(a24, t); \
  1327. MOV64(t, a04); \
  1328. MOV64(a04, a44); \
  1329. MOV64(a44, a12); \
  1330. MOV64(a12, t); \
  1331. MOV64(t, a10); \
  1332. MOV64(a10, a32); \
  1333. MOV64(a32, a13); \
  1334. MOV64(a13, t); \
  1335. MOV64(t, a14); \
  1336. MOV64(a14, a21); \
  1337. MOV64(a21, a20); \
  1338. MOV64(a20, t); \
  1339. MOV64(t, a23); \
  1340. MOV64(a23, a42); \
  1341. MOV64(a42, a40); \
  1342. MOV64(a40, t); \
  1343. MOV64(t, a30); \
  1344. MOV64(a30, a41); \
  1345. MOV64(a41, a34); \
  1346. MOV64(a34, t); \
  1347. } while (0)
  1348.  
  1349. #define P12_TO_P0 do { \
  1350. DECL64(t); \
  1351. MOV64(t, a01); \
  1352. MOV64(a01, a04); \
  1353. MOV64(a04, t); \
  1354. MOV64(t, a02); \
  1355. MOV64(a02, a03); \
  1356. MOV64(a03, t); \
  1357. MOV64(t, a10); \
  1358. MOV64(a10, a40); \
  1359. MOV64(a40, t); \
  1360. MOV64(t, a11); \
  1361. MOV64(a11, a44); \
  1362. MOV64(a44, t); \
  1363. MOV64(t, a12); \
  1364. MOV64(a12, a43); \
  1365. MOV64(a43, t); \
  1366. MOV64(t, a13); \
  1367. MOV64(a13, a42); \
  1368. MOV64(a42, t); \
  1369. MOV64(t, a14); \
  1370. MOV64(a14, a41); \
  1371. MOV64(a41, t); \
  1372. MOV64(t, a20); \
  1373. MOV64(a20, a30); \
  1374. MOV64(a30, t); \
  1375. MOV64(t, a21); \
  1376. MOV64(a21, a34); \
  1377. MOV64(a34, t); \
  1378. MOV64(t, a22); \
  1379. MOV64(a22, a33); \
  1380. MOV64(a33, t); \
  1381. MOV64(t, a23); \
  1382. MOV64(a23, a32); \
  1383. MOV64(a32, t); \
  1384. MOV64(t, a24); \
  1385. MOV64(a24, a31); \
  1386. MOV64(a31, t); \
  1387. } while (0)
  1388.  
  1389. #define LPAR (
  1390. #define RPAR )
  1391.  
  1392. #define KF_ELT(r, s, k) do { \
  1393. THETA LPAR P ## r RPAR; \
  1394. RHO LPAR P ## r RPAR; \
  1395. KHI LPAR P ## s RPAR; \
  1396. IOTA(k); \
  1397. } while (0)
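/*
 * (Explanatory note.)  KF_ELT performs one round of Keccak-f[1600]:
 * theta and rho are applied to the lanes listed in ordering P<r>; khi
 * (chi) then reads them through the next ordering P<s>, which accounts
 * for the pi permutation by renaming rather than by moving data; iota
 * finally mixes the round constant k into lane a00.  The P<n>_TO_P0
 * macros above physically restore the canonical naming once per
 * unrolled group of rounds.
 */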
  1398.  
  1399. #define DO(x) x
  1400.  
  1401. #define KECCAK_F_1600 DO(KECCAK_F_1600_)
  1402.  
  1403. #if SPH_KECCAK_UNROLL == 1
  1404.  
  1405. #define KECCAK_F_1600_ do { \
  1406. int j; \
  1407. for (j = 0; j < 24; j ++) { \
  1408. KF_ELT( 0, 1, RC[j + 0]); \
  1409. P1_TO_P0; \
  1410. } \
  1411. } while (0)
  1412.  
  1413. #elif SPH_KECCAK_UNROLL == 2
  1414.  
  1415. #define KECCAK_F_1600_ do { \
  1416. int j; \
  1417. for (j = 0; j < 24; j += 2) { \
  1418. KF_ELT( 0, 1, RC[j + 0]); \
  1419. KF_ELT( 1, 2, RC[j + 1]); \
  1420. P2_TO_P0; \
  1421. } \
  1422. } while (0)
  1423.  
  1424. #elif SPH_KECCAK_UNROLL == 4
  1425.  
  1426. #define KECCAK_F_1600_ do { \
  1427. int j; \
  1428. for (j = 0; j < 24; j += 4) { \
  1429. KF_ELT( 0, 1, RC[j + 0]); \
  1430. KF_ELT( 1, 2, RC[j + 1]); \
  1431. KF_ELT( 2, 3, RC[j + 2]); \
  1432. KF_ELT( 3, 4, RC[j + 3]); \
  1433. P4_TO_P0; \
  1434. } \
  1435. } while (0)
  1436.  
  1437. #elif SPH_KECCAK_UNROLL == 6
  1438.  
  1439. #define KECCAK_F_1600_ do { \
  1440. int j; \
  1441. for (j = 0; j < 24; j += 6) { \
  1442. KF_ELT( 0, 1, RC[j + 0]); \
  1443. KF_ELT( 1, 2, RC[j + 1]); \
  1444. KF_ELT( 2, 3, RC[j + 2]); \
  1445. KF_ELT( 3, 4, RC[j + 3]); \
  1446. KF_ELT( 4, 5, RC[j + 4]); \
  1447. KF_ELT( 5, 6, RC[j + 5]); \
  1448. P6_TO_P0; \
  1449. } \
  1450. } while (0)
  1451.  
  1452. #elif SPH_KECCAK_UNROLL == 8
  1453.  
  1454. #define KECCAK_F_1600_ do { \
  1455. int j; \
  1456. for (j = 0; j < 24; j += 8) { \
  1457. KF_ELT( 0, 1, RC[j + 0]); \
  1458. KF_ELT( 1, 2, RC[j + 1]); \
  1459. KF_ELT( 2, 3, RC[j + 2]); \
  1460. KF_ELT( 3, 4, RC[j + 3]); \
  1461. KF_ELT( 4, 5, RC[j + 4]); \
  1462. KF_ELT( 5, 6, RC[j + 5]); \
  1463. KF_ELT( 6, 7, RC[j + 6]); \
  1464. KF_ELT( 7, 8, RC[j + 7]); \
  1465. P8_TO_P0; \
  1466. } \
  1467. } while (0)
  1468.  
  1469. #elif SPH_KECCAK_UNROLL == 12
  1470.  
  1471. #define KECCAK_F_1600_ do { \
  1472. int j; \
  1473. for (j = 0; j < 24; j += 12) { \
  1474. KF_ELT( 0, 1, RC[j + 0]); \
  1475. KF_ELT( 1, 2, RC[j + 1]); \
  1476. KF_ELT( 2, 3, RC[j + 2]); \
  1477. KF_ELT( 3, 4, RC[j + 3]); \
  1478. KF_ELT( 4, 5, RC[j + 4]); \
  1479. KF_ELT( 5, 6, RC[j + 5]); \
  1480. KF_ELT( 6, 7, RC[j + 6]); \
  1481. KF_ELT( 7, 8, RC[j + 7]); \
  1482. KF_ELT( 8, 9, RC[j + 8]); \
  1483. KF_ELT( 9, 10, RC[j + 9]); \
  1484. KF_ELT(10, 11, RC[j + 10]); \
  1485. KF_ELT(11, 12, RC[j + 11]); \
  1486. P12_TO_P0; \
  1487. } \
  1488. } while (0)
  1489.  
  1490. #elif SPH_KECCAK_UNROLL == 0
  1491.  
  1492. #define KECCAK_F_1600_ do { \
  1493. KF_ELT( 0, 1, RC[ 0]); \
  1494. KF_ELT( 1, 2, RC[ 1]); \
  1495. KF_ELT( 2, 3, RC[ 2]); \
  1496. KF_ELT( 3, 4, RC[ 3]); \
  1497. KF_ELT( 4, 5, RC[ 4]); \
  1498. KF_ELT( 5, 6, RC[ 5]); \
  1499. KF_ELT( 6, 7, RC[ 6]); \
  1500. KF_ELT( 7, 8, RC[ 7]); \
  1501. KF_ELT( 8, 9, RC[ 8]); \
  1502. KF_ELT( 9, 10, RC[ 9]); \
  1503. KF_ELT(10, 11, RC[10]); \
  1504. KF_ELT(11, 12, RC[11]); \
  1505. KF_ELT(12, 13, RC[12]); \
  1506. KF_ELT(13, 14, RC[13]); \
  1507. KF_ELT(14, 15, RC[14]); \
  1508. KF_ELT(15, 16, RC[15]); \
  1509. KF_ELT(16, 17, RC[16]); \
  1510. KF_ELT(17, 18, RC[17]); \
  1511. KF_ELT(18, 19, RC[18]); \
  1512. KF_ELT(19, 20, RC[19]); \
  1513. KF_ELT(20, 21, RC[20]); \
  1514. KF_ELT(21, 22, RC[21]); \
  1515. KF_ELT(22, 23, RC[22]); \
  1516. KF_ELT(23, 0, RC[23]); \
  1517. } while (0)
  1518.  
  1519. #else
  1520.  
  1521. #error Unimplemented unroll count for Keccak.
  1522.  
  1523. #endif
  1524.  
static void
keccak_init(sph_keccak_context *kc, unsigned out_size)
{
    int i;

#if SPH_KECCAK_64
    for (i = 0; i < 25; i ++)
        kc->u.wide[i] = 0;
    /*
     * Initialization for the "lane complement".
     */
    kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF);
    kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF);
    kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF);
    kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF);
    kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF);
    kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF);
#else

    for (i = 0; i < 50; i ++)
        kc->u.narrow[i] = 0;
    /*
     * Initialization for the "lane complement".
     * Note: since we set to all-one full 64-bit words,
     * interleaving (if applicable) is a no-op.
     */
    kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[16] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[17] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[24] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[25] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[34] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[35] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[40] = SPH_C32(0xFFFFFFFF);
    kc->u.narrow[41] = SPH_C32(0xFFFFFFFF);
#endif
    kc->ptr = 0;
    kc->lim = 200 - (out_size >> 2);
}

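/*
 * Absorb "len" bytes of input. Data is accumulated in the context
 * buffer; whenever a full block of "lim" bytes (the rate) has been
 * gathered, it is injected into the state (INPUT_BUF) and the
 * Keccak-f[1600] permutation is applied. Any remaining partial block
 * stays in the buffer until the next call or until finalization.
 */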
static void
keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
{
    unsigned char *buf;
    size_t ptr;
    DECL_STATE

    buf = kc->buf;
    ptr = kc->ptr;

    if (len < (lim - ptr)) {
        memcpy(buf + ptr, data, len);
        kc->ptr = ptr + len;
        return;
    }

    READ_STATE(kc);
    while (len > 0) {
        size_t clen;

        clen = (lim - ptr);
        if (clen > len)
            clen = len;
        memcpy(buf + ptr, data, clen);
        ptr += clen;
        data = (const unsigned char *)data + clen;
        len -= clen;
        if (ptr == lim) {
            INPUT_BUF(lim);
            KECCAK_F_1600;
            ptr = 0;
        }
    }
    WRITE_STATE(kc);
    kc->ptr = ptr;
}

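/*
 * DEFCLOSE(d, lim) defines the finalization routine for a d-byte digest
 * with a rate of lim bytes. The n extra bits in "ub" (0 <= n <= 7) and
 * the first padding bit are packed into "eb": with Keccak's pad10*1
 * rule, (0x100 | (ub & 0xFF)) >> (8 - n) appends a '1' bit right after
 * the message bits. The block is completed with zero bytes and a final
 * 0x80 byte (the closing '1' bit), the padding block is absorbed via
 * keccak_core(), the lane complement is undone, and the first d bytes
 * of the state are encoded little-endian into the destination before
 * the context is re-initialized.
 */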
#if SPH_KECCAK_64

#define DEFCLOSE(d, lim) \
    static void keccak_close ## d( \
        sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
    { \
        unsigned eb; \
        union { \
            unsigned char tmp[lim + 1]; \
            sph_u64 dummy; /* for alignment */ \
        } u; \
        size_t j; \
 \
        eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
        if (kc->ptr == (lim - 1)) { \
            if (n == 7) { \
                u.tmp[0] = eb; \
                memset(u.tmp + 1, 0, lim - 1); \
                u.tmp[lim] = 0x80; \
                j = 1 + lim; \
            } else { \
                u.tmp[0] = eb | 0x80; \
                j = 1; \
            } \
        } else { \
            j = lim - kc->ptr; \
            u.tmp[0] = eb; \
            memset(u.tmp + 1, 0, j - 2); \
            u.tmp[j - 1] = 0x80; \
        } \
        keccak_core(kc, u.tmp, j, lim); \
        /* Finalize the "lane complement" */ \
        kc->u.wide[ 1] = ~kc->u.wide[ 1]; \
        kc->u.wide[ 2] = ~kc->u.wide[ 2]; \
        kc->u.wide[ 8] = ~kc->u.wide[ 8]; \
        kc->u.wide[12] = ~kc->u.wide[12]; \
        kc->u.wide[17] = ~kc->u.wide[17]; \
        kc->u.wide[20] = ~kc->u.wide[20]; \
        for (j = 0; j < d; j += 8) \
            sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
        memcpy(dst, u.tmp, d); \
        keccak_init(kc, (unsigned)d << 3); \
    } \

#else

#define DEFCLOSE(d, lim) \
    static void keccak_close ## d( \
        sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
    { \
        unsigned eb; \
        union { \
            unsigned char tmp[lim + 1]; \
            sph_u64 dummy; /* for alignment */ \
        } u; \
        size_t j; \
 \
        eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
        if (kc->ptr == (lim - 1)) { \
            if (n == 7) { \
                u.tmp[0] = eb; \
                memset(u.tmp + 1, 0, lim - 1); \
                u.tmp[lim] = 0x80; \
                j = 1 + lim; \
            } else { \
                u.tmp[0] = eb | 0x80; \
                j = 1; \
            } \
        } else { \
            j = lim - kc->ptr; \
            u.tmp[0] = eb; \
            memset(u.tmp + 1, 0, j - 2); \
            u.tmp[j - 1] = 0x80; \
        } \
        keccak_core(kc, u.tmp, j, lim); \
        /* Finalize the "lane complement" */ \
        kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \
        kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \
        kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \
        kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \
        kc->u.narrow[16] = ~kc->u.narrow[16]; \
        kc->u.narrow[17] = ~kc->u.narrow[17]; \
        kc->u.narrow[24] = ~kc->u.narrow[24]; \
        kc->u.narrow[25] = ~kc->u.narrow[25]; \
        kc->u.narrow[34] = ~kc->u.narrow[34]; \
        kc->u.narrow[35] = ~kc->u.narrow[35]; \
        kc->u.narrow[40] = ~kc->u.narrow[40]; \
        kc->u.narrow[41] = ~kc->u.narrow[41]; \
        /* un-interleave */ \
        for (j = 0; j < 50; j += 2) \
            UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \
        for (j = 0; j < d; j += 4) \
            sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \
        memcpy(dst, u.tmp, d); \
        keccak_init(kc, (unsigned)d << 3); \
    } \

#endif

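/*
 * Digest sizes of 28, 32, 48 and 64 bytes (Keccak-224/256/384/512),
 * with the matching rates 144, 136, 104 and 72 bytes
 * (rate = 200 - 2 * digest size).
 */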
DEFCLOSE(28, 144)
DEFCLOSE(32, 136)
DEFCLOSE(48, 104)
DEFCLOSE(64, 72)

/* see sph_keccak.h */
void
sph_keccak224_init(void *cc)
{
    keccak_init(cc, 224);
}

/* see sph_keccak.h */
void
sph_keccak224(void *cc, const void *data, size_t len)
{
    keccak_core(cc, data, len, 144);
}

/* see sph_keccak.h */
void
sph_keccak224_close(void *cc, void *dst)
{
    sph_keccak224_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_keccak.h */
void
sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
    keccak_close28(cc, ub, n, dst);
}

/* see sph_keccak.h */
void
sph_keccak256_init(void *cc)
{
    keccak_init(cc, 256);
}

/* see sph_keccak.h */
void
sph_keccak256(void *cc, const void *data, size_t len)
{
    keccak_core(cc, data, len, 136);
}

/* see sph_keccak.h */
void
sph_keccak256_close(void *cc, void *dst)
{
    sph_keccak256_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_keccak.h */
void
sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
    keccak_close32(cc, ub, n, dst);
}

/* see sph_keccak.h */
void
sph_keccak384_init(void *cc)
{
    keccak_init(cc, 384);
}

/* see sph_keccak.h */
void
sph_keccak384(void *cc, const void *data, size_t len)
{
    keccak_core(cc, data, len, 104);
}

/* see sph_keccak.h */
void
sph_keccak384_close(void *cc, void *dst)
{
    sph_keccak384_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_keccak.h */
void
sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
    keccak_close48(cc, ub, n, dst);
}

/* see sph_keccak.h */
void
sph_keccak512_init(void *cc)
{
    keccak_init(cc, 512);
}

/* see sph_keccak.h */
void
sph_keccak512(void *cc, const void *data, size_t len)
{
    keccak_core(cc, data, len, 72);
}

/* see sph_keccak.h */
void
sph_keccak512_close(void *cc, void *dst)
{
    sph_keccak512_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_keccak.h */
void
sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
    keccak_close64(cc, ub, n, dst);
}
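
/*
 * Minimal usage sketch (illustrative only, not part of the original
 * file): computing a Keccak-512 digest of an in-memory buffer with the
 * streaming API above. The example function name is hypothetical; the
 * code is kept inside "#if 0" so that it does not affect compilation.
 */
#if 0
#include <stdio.h>

static void
example_keccak512(const void *data, size_t len)
{
    sph_keccak_context cc;
    unsigned char digest[64];
    size_t i;

    sph_keccak512_init(&cc);
    sph_keccak512(&cc, data, len);      /* may be called several times */
    sph_keccak512_close(&cc, digest);   /* writes 64 bytes, resets cc */
    for (i = 0; i < sizeof digest; i ++)
        printf("%02x", (unsigned)digest[i]);
    printf("\n");
}
#endif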

#ifdef __cplusplus
}
#endif