Z1maV1

sha2.lua

Aug 18th, 2024
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Lua 275.30 KB | Cybersecurity | 0 0
  1. --------------------------------------------------------------------------------------------------------------------------
  2. -- sha2.lua
  3. --------------------------------------------------------------------------------------------------------------------------
  4. -- VERSION: 12 (2022-02-23)
  5. -- AUTHOR:  Egor Skriptunoff
  6. -- LICENSE: MIT (the same license as Lua itself)
  7. -- URL:     https://github.com/Egor-Skriptunoff/pure_lua_SHA
  8. --
  9. -- DESCRIPTION:
  10. --    This module contains functions to calculate SHA digest:
  11. --       MD5, SHA-1,
  12. --       SHA-224, SHA-256, SHA-512/224, SHA-512/256, SHA-384, SHA-512,
  13. --       SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256,
  14. --       HMAC,
  15. --       BLAKE2b, BLAKE2s, BLAKE2bp, BLAKE2sp, BLAKE2Xb, BLAKE2Xs,
  16. --       BLAKE3, BLAKE3_KDF
  17. --    Written in pure Lua.
  18. --    Compatible with:
  19. --       Lua 5.1, Lua 5.2, Lua 5.3, Lua 5.4, Fengari, LuaJIT 2.0/2.1 (any CPU endianness).
  20. --    Main feature of this module: it was heavily optimized for speed.
  21. --    For every Lua version the module contains particular implementation branch to get benefits from version-specific features.
  22. --       - branch for Lua 5.1 (emulating bitwise operators using look-up table)
  23. --       - branch for Lua 5.2 (using bit32/bit library), suitable for both Lua 5.2 with native "bit32" and Lua 5.1 with external library "bit"
  24. --       - branch for Lua 5.3/5.4 (using native 64-bit bitwise operators)
  25. --       - branch for Lua 5.3/5.4 (using native 32-bit bitwise operators) for Lua built with LUA_INT_TYPE=LUA_INT_INT
  26. --       - branch for LuaJIT without FFI library (useful in a sandboxed environment)
  27. --       - branch for LuaJIT x86 without FFI library (LuaJIT x86 has oddity because of lack of CPU registers)
  28. --       - branch for LuaJIT 2.0 with FFI library (bit.* functions work only with Lua numbers)
  29. --       - branch for LuaJIT 2.1 with FFI library (bit.* functions can work with "int64_t" arguments)
  30. --
  31. --
  32. -- USAGE:
  33. --    Input data should be provided as a binary string: either as a whole string or as a sequence of substrings (chunk-by-chunk loading, total length < 9*10^15 bytes).
  34. --    Result (SHA digest) is returned in hexadecimal representation as a string of lowercase hex digits.
  35. --    Simplest usage example:
  36. --       local sha = require("sha2")
  37. --       local your_hash = sha.sha256("your string")
  38. --    See file "sha2_test.lua" for more examples.
  39. --
  40. --
  41. -- CHANGELOG:
  42. --  version     date      description
  43. --  -------  ----------   -----------
  44. --    12     2022-02-23   Now works in Luau (but NOT optimized for speed)
  45. --    11     2022-01-09   BLAKE3 added
  46. --    10     2022-01-02   BLAKE2 functions added
  47. --     9     2020-05-10   Now works in OpenWrt's Lua (dialect of Lua 5.1 with "double" + "invisible int32")
  48. --     8     2019-09-03   SHA-3 functions added
  49. --     7     2019-03-17   Added functions to convert to/from base64
  50. --     6     2018-11-12   HMAC added
  51. --     5     2018-11-10   SHA-1 added
  52. --     4     2018-11-03   MD5 added
  53. --     3     2018-11-02   Bug fixed: incorrect hashing of long (2 GByte) data streams on Lua 5.3/5.4 built with "int32" integers
  54. --     2     2018-10-07   Decreased module loading time in Lua 5.1 implementation branch (thanks to Peter Melnichenko for giving a hint)
  55. --     1     2018-10-06   First release (only SHA-2 functions)
  56. -----------------------------------------------------------------------------
  57.  
  58.  
  -- Debug switch: set to true to view some messages about your system's abilities
  -- and the implementation branch chosen for your system
  local print_debug_messages = false
  60.  
  61. local unpack, table_concat, byte, char, string_rep, sub, gsub, gmatch, string_format, floor, ceil, math_min, math_max, tonumber, type, math_huge =
  62.    table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.gsub, string.gmatch, string.format, math.floor, math.ceil, math.min, math.max, tonumber, type, math.huge
  63.  
  64.  
  65. --------------------------------------------------------------------------------
  66. -- EXAMINING YOUR SYSTEM
  67. --------------------------------------------------------------------------------
  68.  
  -- Measures the precision (in bits) of the number datatype that "one" belongs to.
  -- one:     must be either float 1.0 or integer 1
  -- returns: bits_precision, is_integer
  --    is_integer == false: low bits started to get lost (n - (n - 1) ~= 1), i.e. floating point rounding was observed
  --    is_integer == true:  n + n + 1 stopped changing n without any loss of low bits
  -- This function works correctly with all floating point datatypes (including non-IEEE-754)
  local function get_precision(one)
     local k, n, m, prev_n = 0, one, one
     while true do
        -- n collects a run of one-bits (1, 3, 7, ...); m collects an alternating bit pattern (2, 5, 10, ...)
        -- note: "k % 2" on the right-hand side still sees the value of k from before this assignment
        k, prev_n, n, m = k + 1, n, n + n + 1, m + m + k % 2
        -- "k > 256" is a hard cap so that the loop always terminates
        if k > 256 or n - (n - 1) ~= 1 or m - (m - 1) ~= 1 or n == m then
           return k, false   -- floating point datatype
        elseif n == prev_n then
           return k, true    -- integer datatype
        end
     end
  end
  83.  
  -- Make sure Lua has "double" numbers
  -- Two checks are combined: 2/3 must be precise enough that x*5 > 3 and x*4 < 3,
  -- and get_precision(1.0) must report at least 53 bits (only its first return value is used here)
  local x = 2/3
  local Lua_has_double = x * 5 > 3 and x * 4 < 3 and get_precision(1.0) >= 53
  assert(Lua_has_double, "at least 53-bit floating point numbers are required")
  88.  
  89. -- Q:
  90. --    SHA2 was designed for FPU-less machines.
  91. --    So why are floating point numbers needed for this module?
  92. -- A:
  93. --    53-bit "double" numbers are useful to calculate "magic numbers" used in SHA.
  94. --    I prefer to write 50 LOC "magic numbers calculator" instead of storing more than 200 constants explicitly in this source file.
  95.  
  -- Probe the datatype of the literal 1: bit width and whether it behaves as an integer
  local int_prec, Lua_has_integers = get_precision(1)
  local Lua_has_int64 = Lua_has_integers and int_prec == 64
  local Lua_has_int32 = Lua_has_integers and int_prec == 32
  -- An integer datatype of any other width is rejected at load time
  assert(Lua_has_int64 or Lua_has_int32 or not Lua_has_integers, "Lua integers must be either 32-bit or 64-bit")
  100.  
  101. -- Q:
  102. --    Does it mean that almost all non-standard configurations are not supported?
  103. -- A:
  104. --    Yes.  Sorry, too many problems to support all possible Lua numbers configurations.
  105. --       Lua 5.1/5.2    with "int32"               will not work.
  106. --       Lua 5.1/5.2    with "int64"               will not work.
  107. --       Lua 5.1/5.2    with "int128"              will not work.
  108. --       Lua 5.1/5.2    with "float"               will not work.
  109. --       Lua 5.1/5.2    with "double"              is OK.          (default config for Lua 5.1, Lua 5.2, LuaJIT)
  110. --       Lua 5.3/5.4    with "int32"  + "float"    will not work.
  111. --       Lua 5.3/5.4    with "int64"  + "float"    will not work.
  112. --       Lua 5.3/5.4    with "int128" + "float"    will not work.
  113. --       Lua 5.3/5.4    with "int32"  + "double"   is OK.          (config used by Fengari)
  114. --       Lua 5.3/5.4    with "int64"  + "double"   is OK.          (default config for Lua 5.3, Lua 5.4)
  115. --       Lua 5.3/5.4    with "int128" + "double"   will not work.
  116. --   Using floating point numbers with more precision than "double" instead of "double" is OK (non-IEEE-754 floating point implementations are allowed).
  117. --   Using "int128" instead of "int64" is not OK: "int128" would require different branch of implementation for optimized SHA512.
  118.  
  -- Check for LuaJIT and 32-bit bitwise libraries
  -- Results are left in the locals below:
  --    is_LuaJIT, is_LuaJIT_21, LuaJIT_arch, ffi   describe the LuaJIT runtime (if any)
  --    b, library_name                             the 32-bit bitwise library table and its name (nil when none was found)
  -- NOTE(review): the ({false, [1] = true})[1] probe presumably yields true only on LuaJIT-style parsers -- confirm
  local is_LuaJIT = ({false, [1] = true})[1] and _VERSION ~= "Luau" and (type(jit) ~= "table" or jit.version_num >= 20000)  -- LuaJIT 1.x.x and Luau are treated as vanilla Lua 5.1/5.2
  local is_LuaJIT_21  -- LuaJIT 2.1+
  local LuaJIT_arch
  local ffi           -- LuaJIT FFI library (as a table)
  local b             -- 32-bit bitwise library (as a table)
  local library_name

  if is_LuaJIT then
     -- Assuming "bit" library is always available on LuaJIT
     b = require"bit"
     library_name = "bit"
     -- "ffi" is intentionally disabled on some systems for safety reason
     local LuaJIT_has_FFI, result = pcall(require, "ffi")
     if LuaJIT_has_FFI then
        ffi = result
     end
     -- the chunk is only compiled, never run: the test is whether the parser accepts the binary literal "0b0"
     is_LuaJIT_21 = not not loadstring"b=0b0"
     LuaJIT_arch = type(jit) == "table" and jit.arch or ffi and ffi.arch or nil
  else
     -- For vanilla Lua, "bit"/"bit32" libraries are searched in global namespace only.  No attempt is made to load a library if it's not loaded yet.
     -- On Lua 5.2 "bit32" is tried first, everywhere else "bit" is tried first; the first table having a "bxor" field wins
     for _, libname in ipairs(_VERSION == "Lua 5.2" and {"bit32", "bit"} or {"bit", "bit32"}) do
        if type(_G[libname]) == "table" and _G[libname].bxor then
           b = _G[libname]
           library_name = libname
           break
        end
     end
  end
  148.  
  149. --------------------------------------------------------------------------------
  150. -- You can disable here some of your system's abilities (for testing purposes)
  151. --------------------------------------------------------------------------------
  152. -- is_LuaJIT = nil
  153. -- is_LuaJIT_21 = nil
  154. -- ffi = nil
  155. -- Lua_has_int32 = nil
  156. -- Lua_has_int64 = nil
  157. -- b, library_name = nil
  158. --------------------------------------------------------------------------------
  159.  
  -- Optional diagnostics: report what was detected about the runtime
  if print_debug_messages then
     -- Printing list of abilities of your system
     print("Abilities:")
     print("   Lua version:               "..(is_LuaJIT and "LuaJIT "..(is_LuaJIT_21 and "2.1 " or "2.0 ")..(LuaJIT_arch or "")..(ffi and " with FFI" or " without FFI") or _VERSION))
     print("   Integer bitwise operators: "..(Lua_has_int64 and "int64" or Lua_has_int32 and "int32" or "no"))
     print("   32-bit bitwise library:    "..(library_name or "not found"))
  end
  167.  
  -- Selecting the most suitable implementation for given set of abilities
  -- The checks are ordered by priority; the first one that matches decides:
  --    "FFI"    LuaJIT with the FFI library
  --    "LJ"     LuaJIT without FFI
  --    "INT64"  native 64-bit integers
  --    "INT32"  native 32-bit integers
  --    "LIB32"  a "bit"/"bit32" library was found
  --    "EMUL"   nothing of the above
  -- "method" is the human-readable description of the same choice (used only for the debug printout)
  local method, branch
  if is_LuaJIT and ffi then
     method = "Using 'ffi' library of LuaJIT"
     branch = "FFI"
  elseif is_LuaJIT then
     method = "Using special code for sandboxed LuaJIT (no FFI)"
     branch = "LJ"
  elseif Lua_has_int64 then
     method = "Using native int64 bitwise operators"
     branch = "INT64"
  elseif Lua_has_int32 then
     method = "Using native int32 bitwise operators"
     branch = "INT32"
  elseif library_name then   -- when bitwise library is available (Lua 5.2 with native library "bit32" or Lua 5.1 with external library "bit")
     method = "Using '"..library_name.."' library"
     branch = "LIB32"
  else
     method = "Emulating bitwise operators using look-up table"
     branch = "EMUL"
  end
  189.  
  -- Optional diagnostics: report which implementation was picked
  if print_debug_messages then
     -- Printing the implementation selected to be used on your system
     print("Implementation selected:")
     print("   "..method)
  end
  195.  
  196.  
  197. --------------------------------------------------------------------------------
  198. -- BASIC 32-BIT BITWISE FUNCTIONS
  199. --------------------------------------------------------------------------------
  200.  
  -- Forward declarations of the 32-bit primitives; they are assigned later, depending on the selected branch
  local AND, OR, XOR, SHL, SHR, ROL, ROR, NOT, NORM, HEX, XOR_BYTE
  -- Only low 32 bits of function arguments matter, high bits are ignored
  -- The result of all functions (except HEX) is an integer inside "correct range":
  --    for "bit" library:    (-2^31)..(2^31-1)
  --    for "bit32" library:        0..(2^32-1)
  206.  
  207. if branch == "FFI" or branch == "LJ" or branch == "LIB32" then
  208.  
  -- Your system has 32-bit bitwise library (either "bit" or "bit32")
  -- The primitives are bound directly to the functions of library table "b"

  AND  = b.band                -- 2 arguments
  OR   = b.bor                 -- 2 arguments
  XOR  = b.bxor                -- 2..5 arguments
  SHL  = b.lshift              -- second argument is integer 0..31
  SHR  = b.rshift              -- second argument is integer 0..31
  ROL  = b.rol or b.lrotate    -- second argument is integer 0..31
  ROR  = b.ror or b.rrotate    -- second argument is integer 0..31
  NOT  = b.bnot                -- only for LuaJIT
  NORM = b.tobit               -- only for LuaJIT
  HEX  = b.tohex               -- returns string of 8 lowercase hexadecimal digits
  -- NORM and HEX are not part of this check, so they may still be nil after it
  assert(AND and OR and XOR and SHL and SHR and ROL and ROR and NOT, "Library '"..library_name.."' is incomplete")
  XOR_BYTE = XOR               -- XOR of two bytes (0..255)
  223.  
  224. elseif branch == "EMUL" then
  225.  
  226.    -- Emulating 32-bit bitwise operations using 53-bit floating point arithmetic
  227.  
  -- Emulated 32-bit shift left: multiplies by 2^n and keeps the result modulo 2^32
  -- x: number,  n: shift distance
  function SHL(x, n)
     return (x * 2^n) % 2^32
  end
  231.  
  -- Emulated 32-bit logical shift right: reduces x modulo 2^32, divides by 2^n and drops the fractional part
  -- x: number,  n: shift distance
  function SHR(x, n)
     x = x % 2^32 / 2^n
     return x - x % 1
  end
  236.  
  -- Emulated 32-bit rotate left
  -- x: number,  n: rotation distance
  function ROL(x, n)
     x = x % 2^32 * 2^n
     local r = x % 2^32          -- the bits that stayed inside the low 32 bits
     return r + (x - r) / 2^32   -- the bits shifted out at the top are brought back at the bottom
  end
  242.  
  -- Emulated 32-bit rotate right
  -- x: number,  n: rotation distance
  function ROR(x, n)
     x = x % 2^32 / 2^n
     local r = x % 1             -- the bits shifted out at the bottom, as a fraction
     return r * 2^32 + (x - r)   -- the fraction is scaled up so that those bits become the top bits
  end
  248.  
  -- Emulated AND / OR / XOR built on a byte-wise AND look-up table.

  -- AND_of_two_bytes[x + y * 256] holds (x AND y) for bytes x, y (this is how XOR_BYTE below indexes it).
  -- Every pass takes one existing entry, doubles it, and stores the results for the four
  -- combinations of a new lowest bit in each operand (only "both bits set" adds 1).
  local AND_of_two_bytes = {[0] = 0}  -- look-up table (256*256 entries)
  local idx = 0
  for y = 0, 127 * 256, 256 do
     for x = y, y + 127 do
        -- the doubled entry gets its own local: the numeric-for control variable must not be assigned to
        local doubled = AND_of_two_bytes[x] * 2
        AND_of_two_bytes[idx] = doubled
        AND_of_two_bytes[idx + 1] = doubled
        AND_of_two_bytes[idx + 256] = doubled
        AND_of_two_bytes[idx + 257] = doubled + 1
        idx = idx + 2
     end
     idx = idx + 256
  end

  -- Common worker for the three two-argument operations.
  -- x, y:      numbers, reduced modulo 2^32 before use
  -- operation: nil = AND, 1 = OR, 2 = XOR
  -- returns:   the result as a non-negative number
  local function and_or_xor(x, y, operation)
     local x0 = x % 2^32
     local y0 = y % 2^32
     -- byte 0
     local rx = x0 % 256
     local ry = y0 % 256
     local res = AND_of_two_bytes[rx + ry * 256]
     -- byte 1: x keeps its byte in place (a multiple of 256), y is shifted down, so rx + ry is a ready table index
     x = x0 - rx
     y = (y0 - ry) / 256
     rx = x % 65536
     ry = y % 256
     res = res + AND_of_two_bytes[rx + ry] * 256
     -- byte 2
     x = (x - rx) / 256
     y = (y - ry) / 256
     rx = x % 65536 + y % 256
     res = res + AND_of_two_bytes[rx] * 65536
     -- byte 3: whatever is left of x and y after removing byte 2
     res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216
     if operation then
        -- OR  = x + y - 1 * AND
        -- XOR = x + y - 2 * AND
        res = x0 + y0 - operation * res
     end
     return res
  end

  -- 32-bit AND of two numbers
  function AND(x, y)
     return and_or_xor(x, y)
  end

  -- 32-bit OR of two numbers
  function OR(x, y)
     return and_or_xor(x, y, 1)
  end

  -- 32-bit XOR of 2..5 numbers; the optional arguments are folded in from the right
  function XOR(x, y, z, t, u)          -- 2..5 arguments
     if z then
        if t then
           if u then
              t = and_or_xor(t, u, 2)
           end
           z = and_or_xor(z, t, 2)
        end
        y = and_or_xor(y, z, 2)
     end
     return and_or_xor(x, y, 2)
  end

  -- XOR of two bytes (0..255), a single table look-up
  function XOR_BYTE(x, y)
     return x + y - 2 * AND_of_two_bytes[x + y * 256]
  end
  310.  
  311. end
  312.  
  -- HEX(x) returns the low 32 bits of x as a string of 8 lowercase hexadecimal digits.
  -- A HEX that is already set is kept as is.
  -- Otherwise one of two string.format based implementations is chosen,
  -- depending on whether string.format("%x", 2^31) succeeds on this system.
  HEX = HEX
     or
        pcall(string_format, "%x", 2^31) and
        function (x)  -- returns string of 8 lowercase hexadecimal digits
           return string_format("%08x", x % 4294967296)
        end
     or
        function (x)  -- for OpenWrt's dialect of Lua
           -- the value is mapped into (-2^31)..(2^31-1) before formatting
           return string_format("%08x", (x + 2^31) % 2^32 - 2^31)
        end
  323.  
  -- XORs x with y (or with the constant 0xA5A5A5A5 when y is omitted)
  -- and reduces the result modulo 2^32, so it is never negative
  local function XORA5(x, y)
     return XOR(x, y or 0xA5A5A5A5) % 4294967296
  end
  327.  
  -- Returns a new Lua table holding 25 zeroes (indices 1..25); every call creates a separate table
  local function create_array_of_lanes()
     return {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  end
  331.  
  332.  
  333. --------------------------------------------------------------------------------
  334. -- CREATING OPTIMIZED INNER LOOP
  335. --------------------------------------------------------------------------------
  336.  
  -- Inner loop functions
  -- (forward declarations; the implementations are assigned by the branch-specific code)
  local sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64

  -- Arrays of SHA-2 "magic numbers" (in "INT64" and "FFI" branches "*_lo" arrays contain 64-bit values)
  local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi, sha3_RC_lo, sha3_RC_hi = {}, {}, {}, {}, {}, {}
  -- The [256] and [512] entries are the very same tables as sha2_H_hi / sha2_H_lo (not copies)
  local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi}
  local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi}
  local md5_K, md5_sha1_H = {}, {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
  local md5_next_shift = {0, 0, 0, 0, 0, 0, 0, 0, 28, 25, 26, 27, 0, 0, 10, 9, 11, 12, 0, 15, 16, 17, 18, 0, 20, 22, 23, 21}
  local HEX64, lanes_index_base  -- defined only for branches that internally use 64-bit integers: "INT64" and "FFI"
  local common_W = {}    -- temporary table shared between all calculations (to avoid creating new temporary table every time)
  -- Initially both BLAKE2 work arrays are aliases of common_W
  local common_W_blake2b, common_W_blake2s, v_for_blake2s_feed_64 = common_W, common_W, {}
  local K_lo_modulo, hi_factor, hi_factor_keccak = 4294967296, 0, 0
  -- Table of 1-based index permutations, 16 entries per row; rows 11 and 12 are aliases of rows 1 and 2
  local sigma = {
     {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
     { 15, 11,  5,  9, 10, 16, 14,  7,  2, 13,  1,  3, 12,  8,  6,  4 },
     { 12,  9, 13,  1,  6,  3, 16, 14, 11, 15,  4,  7,  8,  2, 10,  5 },
     {  8, 10,  4,  2, 14, 13, 12, 15,  3,  7,  6, 11,  5,  1, 16,  9 },
     { 10,  1,  6,  8,  3,  5, 11, 16, 15,  2, 12, 13,  7,  9,  4, 14 },
     {  3, 13,  7, 11,  1, 12,  9,  4,  5, 14,  8,  6, 16, 15,  2, 10 },
     { 13,  6,  2, 16, 15, 14,  5, 11,  1,  8,  7,  4, 10,  3,  9, 12 },
     { 14, 12,  8, 15, 13,  2,  4, 10,  6,  1, 16,  5,  9,  7,  3, 11 },
     {  7, 16, 15, 10, 12,  4,  1,  9, 13,  3, 14,  8,  2,  5, 11,  6 },
     { 11,  3,  9,  5,  8,  7,  2,  6, 16, 12, 10, 15,  4, 13, 14,  1 },
  };  sigma[11], sigma[12] = sigma[1], sigma[2]
  local perm_blake3 = {
     1, 3, 4, 11, 13, 10, 12, 6,
     1, 3, 4, 11, 13, 10,
     2, 7, 5, 8, 14, 15, 16, 9,
     2, 7, 5, 8, 14, 15,
  }
  368.  
  -- Builds a table of format strings for the sizes 1, 9, 13, 17, 18 and 21.
  -- elem:    string, one format item
  -- returns: table in which t[size] == "<" followed by "elem" repeated "size" times
  local function build_keccak_format(elem)
     local keccak_format = {}
     for _, size in ipairs{1, 9, 13, 17, 18, 21} do
        keccak_format[size] = "<"..string_rep(elem, size)
     end
     return keccak_format
  end
  376.  
  377.  
  378. if branch == "FFI" then
  379.  
  -- FFI branch: the shared work arrays and index tables are replaced with FFI arrays
  local common_W_FFI_int32 = ffi.new("int32_t[?]", 80)   -- 64 is enough for SHA256, but 80 is needed for SHA-1
  common_W_blake2s = common_W_FFI_int32
  v_for_blake2s_feed_64 = ffi.new("int32_t[?]", 16)
  -- One element more than the Lua table, with 0 as the first initializer:
  -- NOTE(review): presumably this keeps the 1-based indices valid in the 0-based FFI array -- confirm
  perm_blake3 = ffi.new("uint8_t[?]", #perm_blake3 + 1, 0, unpack(perm_blake3))
  for j = 1, 10 do
     sigma[j] = ffi.new("uint8_t[?]", #sigma[j] + 1, 0, unpack(sigma[j]))
  end;  sigma[11], sigma[12] = sigma[1], sigma[2]   -- the aliases must follow the replaced rows
  387.  
  388.  
  389.    -- SHA256 implementation for "LuaJIT with FFI" branch
  390.  
  -- Feeds whole 64-byte blocks of "str" into the eight-word state H ("LuaJIT with FFI" branch).
  -- H:    state words H[1]..H[8], updated in place
  -- str:  string; the bytes at positions offs+1 .. offs+size are consumed
  -- offs: offs >= 0
  -- size: size >= 0, size is multiple of 64
  function sha256_feed_64(H, str, offs, size)
     local W, K = common_W_FFI_int32, sha2_K_hi
     for pos = offs, offs + size - 1, 64 do
        -- Load 16 big-endian 32-bit words.
        -- "pos" is advanced inside the loop body; this relies on the numeric-for semantics
        -- where the next iteration does not depend on assignments to the visible loop variable.
        for j = 0, 15 do
           pos = pos + 4
           local a, b, c, d = byte(str, pos - 3, pos)   -- slow, but doesn't depend on endianness
           W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
        end
        -- Expand the message schedule to 64 words
        for j = 16, 63 do
           local a, b = W[j-15], W[j-2]
           W[j] = NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] )
        end
        local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        -- 64 rounds, eight per loop iteration; W is indexed from 0, K is indexed from 1
        for j = 0, 63, 8 do  -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
           local z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j] + K[j+1] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+1] + K[j+2] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+2] + K[j+3] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+3] + K[j+4] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+4] + K[j+5] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+5] + K[j+6] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+6] + K[j+7] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
           z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+7] + K[j+8] + h) )
           h, g, f, e = g, f, e, NORM( d + z )
           d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
        end
        -- Add the working variables back into the state
        H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
        H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
     end
  end
  435.  
  436.  
  -- 64-bit work array and the FFI number types used by the 64-bit code of the FFI branch
  local common_W_FFI_int64 = ffi.new("int64_t[?]", 80)
  common_W_blake2b = common_W_FFI_int64
  local int64 = ffi.typeof"int64_t"
  local int32 = ffi.typeof"int32_t"
  local uint32 = ffi.typeof"uint32_t"
  hi_factor = int64(2^32)   -- 2^32 as an int64_t value (replaces the initial value 0)
  443.  
  444.    if is_LuaJIT_21 then   -- LuaJIT 2.1 supports bitwise 64-bit operations
  445.  
  -- The 64-bit names are bound to the very same functions as the 32-bit ones
  local AND64, OR64, XOR64, NOT64, SHL64, SHR64, ROL64, ROR64  -- introducing synonyms for better code readability
      = AND,   OR,   XOR,   NOT,   SHL,   SHR,   ROL,   ROR
  HEX64 = HEX
  449.  
  450.  
  451.       -- BLAKE2b implementation for "LuaJIT 2.1 + FFI" branch
  452.  
  do
     local v = ffi.new("int64_t[?]", 16)   -- 16-word working vector, indexed 0..15
     local W = common_W_blake2b            -- message words, indexed 1..16

     -- Mixing step: updates the four working words v[a], v[b], v[c], v[d] using the message words W[k1] and W[k2]
     local function G(a, b, c, d, k1, k2)
        local va, vb, vc, vd = v[a], v[b], v[c], v[d]
        va = W[k1] + (va + vb)
        vd = ROR64(XOR64(vd, va), 32)
        vc = vc + vd
        vb = ROR64(XOR64(vb, vc), 24)
        va = W[k2] + (va + vb)
        vd = ROR64(XOR64(vd, va), 16)
        vc = vc + vd
        vb = ROL64(XOR64(vb, vc), 1)
        v[a], v[b], v[c], v[d] = va, vb, vc, vd
     end

     -- Compresses whole 128-byte blocks into the eight-word state H ("LuaJIT 2.1 + FFI" branch).
     -- H:                state words H[1]..H[8], updated in place
     -- _:                unused
     -- str:              string to read the blocks from; when nil/false, W is not reloaded and its current content is used
     -- offs, size:       offs >= 0, size >= 0, size is multiple of 128
     -- bytes_compressed: running byte counter before this call
     -- last_block_size:  when given, it is added to the counter instead of 128 and flag f0 is set
     -- is_last_node:     when true, flag f1 is set
     -- returns:          the updated bytes_compressed
     function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
        local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
        for pos = offs, offs + size - 1, 128 do
           if str then
              -- Load 16 little-endian 64-bit words ("pos" is advanced inside the loop body)
              for j = 1, 16 do
                 pos = pos + 8
                 local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)
                 W[j] = XOR64(OR(SHL(h, 24), SHL(g, 16), SHL(f, 8), e) * int64(2^32), uint32(int32(OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))))
              end
           end
           v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
           -- v[0xC] is set separately below, because the byte counter is mixed into it
           v[0x8], v[0x9], v[0xA], v[0xB], v[0xD], v[0xE], v[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
           bytes_compressed = bytes_compressed + (last_block_size or 128)
           v[0xC] = XOR64(sha2_H_lo[5], bytes_compressed)  -- t0 = low_8_bytes(bytes_compressed)
           -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
           if last_block_size then  -- flag f0
              v[0xE] = NOT64(v[0xE])
           end
           if is_last_node then  -- flag f1
              v[0xF] = NOT64(v[0xF])
           end
           -- 12 rounds; every round mixes the four columns and then the four diagonals
           for j = 1, 12 do
              local row = sigma[j]
              G(0, 4,  8, 12, row[ 1], row[ 2])
              G(1, 5,  9, 13, row[ 3], row[ 4])
              G(2, 6, 10, 14, row[ 5], row[ 6])
              G(3, 7, 11, 15, row[ 7], row[ 8])
              G(0, 5, 10, 15, row[ 9], row[10])
              G(1, 6, 11, 12, row[11], row[12])
              G(2, 7,  8, 13, row[13], row[14])
              G(3, 4,  9, 14, row[15], row[16])
           end
           -- Fold both halves of the working vector into the state
           h1 = XOR64(h1, v[0x0], v[0x8])
           h2 = XOR64(h2, v[0x1], v[0x9])
           h3 = XOR64(h3, v[0x2], v[0xA])
           h4 = XOR64(h4, v[0x3], v[0xB])
           h5 = XOR64(h5, v[0x4], v[0xC])
           h6 = XOR64(h6, v[0x5], v[0xD])
           h7 = XOR64(h7, v[0x6], v[0xE])
           h8 = XOR64(h8, v[0x7], v[0xF])
        end
        H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
        return bytes_compressed
     end

  end
  517.  
  518.  
  519.       -- SHA-3 implementation for "LuaJIT 2.1 + FFI" branch
  520.  
  -- Keccak state for the "LuaJIT 2.1 + FFI" branch: an FFI array of 64-bit lanes
  local arr64_t = ffi.typeof"int64_t[?]"
  -- lanes array is indexed from 0
  lanes_index_base = 0
  hi_factor_keccak = int64(2^32)

  -- Replaces the table-based constructor: returns a new FFI array of 30 lanes
  function create_array_of_lanes()
     return arr64_t(30)  -- 25 + 5 for temporary usage
  end
  529.  
  -- Absorbs whole blocks of "str" into the Keccak state and runs 24 rounds after every block ("LuaJIT 2.1 + FFI" branch).
  -- lanes:               array of 64-bit lanes; lanes[0..24] is the state, lanes[25..29] are used as temporaries
  -- _:                   unused
  -- str:                 string; the bytes at positions offs+1 .. offs+size are consumed
  -- offs, size:          offs >= 0, size >= 0, size is multiple of block_size_in_bytes
  -- block_size_in_bytes: positive multiple of 8
  function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
     local RC = sha3_RC_lo
     local qwords_qty = SHR(block_size_in_bytes, 3)
     for pos = offs, offs + size - 1, block_size_in_bytes do
        -- XOR the block into the first qwords_qty lanes, 8 bytes per lane, little-endian
        -- ("pos" is advanced inside the loop body)
        for j = 0, qwords_qty - 1 do
           pos = pos + 8
           local h, g, f, e, d, c, b, a = byte(str, pos - 7, pos)   -- slow, but doesn't depend on endianness
           lanes[j] = XOR64(lanes[j], OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))))
        end
        for round_idx = 1, 24 do
           -- Column parities go to the temporaries lanes[25..29]
           for j = 0, 4 do
              lanes[25 + j] = XOR64(lanes[j], lanes[j+5], lanes[j+10], lanes[j+15], lanes[j+20])
           end
           -- For every column: D is XORed into each lane, the lane is rotated and stored at another index.
           -- All right-hand sides of a multiple assignment are evaluated before anything is stored.
           local D = XOR64(lanes[25], ROL64(lanes[27], 1))
           lanes[1], lanes[6], lanes[11], lanes[16] = ROL64(XOR64(D, lanes[6]), 44), ROL64(XOR64(D, lanes[16]), 45), ROL64(XOR64(D, lanes[1]), 1), ROL64(XOR64(D, lanes[11]), 10)
           lanes[21] = ROL64(XOR64(D, lanes[21]), 2)
           D = XOR64(lanes[26], ROL64(lanes[28], 1))
           lanes[2], lanes[7], lanes[12], lanes[22] = ROL64(XOR64(D, lanes[12]), 43), ROL64(XOR64(D, lanes[22]), 61), ROL64(XOR64(D, lanes[7]), 6), ROL64(XOR64(D, lanes[2]), 62)
           lanes[17] = ROL64(XOR64(D, lanes[17]), 15)
           D = XOR64(lanes[27], ROL64(lanes[29], 1))
           lanes[3], lanes[8], lanes[18], lanes[23] = ROL64(XOR64(D, lanes[18]), 21), ROL64(XOR64(D, lanes[3]), 28), ROL64(XOR64(D, lanes[23]), 56), ROL64(XOR64(D, lanes[8]), 55)
           lanes[13] = ROL64(XOR64(D, lanes[13]), 25)
           D = XOR64(lanes[28], ROL64(lanes[25], 1))
           lanes[4], lanes[14], lanes[19], lanes[24] = ROL64(XOR64(D, lanes[24]), 14), ROL64(XOR64(D, lanes[19]), 8), ROL64(XOR64(D, lanes[4]), 27), ROL64(XOR64(D, lanes[14]), 39)
           lanes[9] = ROL64(XOR64(D, lanes[9]), 20)
           D = XOR64(lanes[29], ROL64(lanes[26], 1))
           lanes[5], lanes[10], lanes[15], lanes[20] = ROL64(XOR64(D, lanes[10]), 3), ROL64(XOR64(D, lanes[20]), 18), ROL64(XOR64(D, lanes[5]), 36), ROL64(XOR64(D, lanes[15]), 41)
           lanes[0] = XOR64(D, lanes[0])
           -- Non-linear step, one row of five lanes per statement; the round constant RC[round_idx] goes into lanes[0] only
           lanes[0], lanes[1], lanes[2], lanes[3], lanes[4] = XOR64(lanes[0], AND64(NOT64(lanes[1]), lanes[2]), RC[round_idx]), XOR64(lanes[1], AND64(NOT64(lanes[2]), lanes[3])), XOR64(lanes[2], AND64(NOT64(lanes[3]), lanes[4])), XOR64(lanes[3], AND64(NOT64(lanes[4]), lanes[0])), XOR64(lanes[4], AND64(NOT64(lanes[0]), lanes[1]))
           lanes[5], lanes[6], lanes[7], lanes[8], lanes[9] = XOR64(lanes[8], AND64(NOT64(lanes[9]), lanes[5])), XOR64(lanes[9], AND64(NOT64(lanes[5]), lanes[6])), XOR64(lanes[5], AND64(NOT64(lanes[6]), lanes[7])), XOR64(lanes[6], AND64(NOT64(lanes[7]), lanes[8])), XOR64(lanes[7], AND64(NOT64(lanes[8]), lanes[9]))
           lanes[10], lanes[11], lanes[12], lanes[13], lanes[14] = XOR64(lanes[11], AND64(NOT64(lanes[12]), lanes[13])), XOR64(lanes[12], AND64(NOT64(lanes[13]), lanes[14])), XOR64(lanes[13], AND64(NOT64(lanes[14]), lanes[10])), XOR64(lanes[14], AND64(NOT64(lanes[10]), lanes[11])), XOR64(lanes[10], AND64(NOT64(lanes[11]), lanes[12]))
           lanes[15], lanes[16], lanes[17], lanes[18], lanes[19] = XOR64(lanes[19], AND64(NOT64(lanes[15]), lanes[16])), XOR64(lanes[15], AND64(NOT64(lanes[16]), lanes[17])), XOR64(lanes[16], AND64(NOT64(lanes[17]), lanes[18])), XOR64(lanes[17], AND64(NOT64(lanes[18]), lanes[19])), XOR64(lanes[18], AND64(NOT64(lanes[19]), lanes[15]))
           lanes[20], lanes[21], lanes[22], lanes[23], lanes[24] = XOR64(lanes[22], AND64(NOT64(lanes[23]), lanes[24])), XOR64(lanes[23], AND64(NOT64(lanes[24]), lanes[20])), XOR64(lanes[24], AND64(NOT64(lanes[20]), lanes[21])), XOR64(lanes[20], AND64(NOT64(lanes[21]), lanes[22])), XOR64(lanes[21], AND64(NOT64(lanes[22]), lanes[23]))
        end
     end
  end
  567.  
  568.  
  -- 64-bit mask 0xA5A5A5A5A5A5A5A5, obtained by multiplication: writing the literal
  -- 0xA5A5A5A5A5A5A5A5LL is impossible because it would raise a syntax error on other Lua versions.
  local A5_long = 0xA5A5A5A5 * int64(2^32 + 1)

  -- Returns XOR64(long, long2); when long2 is omitted (nil or false) the mask A5_long is used instead.
  function XORA5(long, long2)
     if long2 then
        return XOR64(long, long2)
     end
     return XOR64(long, A5_long)
  end
  574.  
  575.  
  576.       -- SHA512 implementation for "LuaJIT 2.1 + FFI" branch
  577.  
  578.       function sha512_feed_128(H, _, str, offs, size)
                -- Feeds whole 128-byte blocks of str into the SHA-512 state (LuaJIT 2.1 + FFI branch).
                --   H     state words H[1..8]; read at the start of every block and updated in place at its end
                --   _     second parameter, not used by this branch
                --   str   string holding the message bytes
                --   offs  0-based byte offset of the first block (the first byte read is at index offs + 1)
                --   size  number of bytes to process
                -- Nothing is returned.
  579.          -- offs >= 0, size >= 0, size is multiple of 128
  580.          local W, K = common_W_FFI_int64, sha2_K_lo
  581.          for pos = offs, offs + size - 1, 128 do
                   -- W[0..15]: sixteen message words, each assembled big-endian from 8 bytes
                   -- (a..d become the high 32 bits, e..h the low 32 bits)
                   -- NOTE(review): pos is advanced inside the block; this relies on the outer loop still stepping by 128 from its own counter
  582.             for j = 0, 15 do
  583.                pos = pos + 8
  584.                local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)   -- slow, but doesn't depend on endianness
  585.                W[j] = OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))))
  586.             end
                   -- W[16..79]: every further word is derived from W[j-2], W[j-7], W[j-15] and W[j-16]
  587.             for j = 16, 79 do
  588.                local a, b = W[j-15], W[j-2]
  589.                W[j] = XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + W[j-7] + W[j-16]
  590.             end
                   -- working variables a..h start from the current state
  591.             local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
                   -- 80 rounds, unrolled 8 per loop iteration; K is indexed from 1 (K[j+1]) while W is indexed from 0 (W[j])
  592.             for j = 0, 79, 8 do
  593.                local z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+1] + W[j]
  594.                h, g, f, e = g, f, e, z + d
  595.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  596.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+2] + W[j+1]
  597.                h, g, f, e = g, f, e, z + d
  598.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  599.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+3] + W[j+2]
  600.                h, g, f, e = g, f, e, z + d
  601.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  602.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+4] + W[j+3]
  603.                h, g, f, e = g, f, e, z + d
  604.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  605.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+5] + W[j+4]
  606.                h, g, f, e = g, f, e, z + d
  607.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  608.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+6] + W[j+5]
  609.                h, g, f, e = g, f, e, z + d
  610.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  611.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+7] + W[j+6]
  612.                h, g, f, e = g, f, e, z + d
  613.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  614.                z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+8] + W[j+7]
  615.                h, g, f, e = g, f, e, z + d
  616.                d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
  617.             end
                   -- fold the working variables back into the state
  618.             H[1] = a + H[1]
  619.             H[2] = b + H[2]
  620.             H[3] = c + H[3]
  621.             H[4] = d + H[4]
  622.             H[5] = e + H[5]
  623.             H[6] = f + H[6]
  624.             H[7] = g + H[7]
  625.             H[8] = h + H[8]
  626.          end
  627.       end
  628.  
  629.    else  -- LuaJIT 2.0 doesn't support 64-bit bitwise operations
  630.  
  -- Scratch array of 3 unions for fast splitting of an int64 into its int32 high and low halves:
  -- store a value into U[k].i64, then read U[k].i32.lo and U[k].i32.hi.
  -- The declaration order of the "lo" and "hi" fields follows ffi.abi("le"),
  -- i.e. it depends on whether the host is little-endian.
  local U = ffi.new(
     "union{int64_t i64; struct{int32_t "
     ..(ffi.abi"le" and "lo, hi" or "hi, lo")
     ..";} i32;}[3]"
  )
  633.  
  634.       -- "xorrific" 64-bit functions :-)
  635.       -- int64 input is split into two int32 parts, some bitwise 32-bit operations are performed, finally the result is converted to int64
  636.       -- these functions are needed because bit.* functions in LuaJIT 2.0 don't work with int64_t
  637.  
  local function XORROR64_1(a)
     -- Same result as XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)), computed on 32-bit halves.
     -- Used by sha512_feed_128 for the W[j-15] term of the message schedule.
     U[0].i64 = a
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local res_lo = XOR(
        SHR(lo, 1), SHL(hi, 31),   -- low half of ROR64(a, 1)
        SHR(lo, 8), SHL(hi, 24),   -- low half of ROR64(a, 8)
        SHR(lo, 7), SHL(hi, 25)    -- low half of SHR64(a, 7)
     )
     local res_hi = XOR(
        SHR(hi, 1), SHL(lo, 31),   -- high half of ROR64(a, 1)
        SHR(hi, 8), SHL(lo, 24),   -- high half of ROR64(a, 8)
        SHR(hi, 7)                 -- high half of SHR64(a, 7): a plain shift, nothing comes in from above
     )
     return res_hi * int64(2^32) + uint32(int32(res_lo))
  end
  646.  
  local function XORROR64_2(b)
     -- Same result as XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)), computed on 32-bit halves.
     -- Used by sha512_feed_128 for the W[j-2] term of the message schedule.
     U[0].i64 = b
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local res_lo = XOR(
        SHR(lo, 19), SHL(hi, 13),  -- low half of ROR64(b, 19)
        SHL(lo, 3), SHR(hi, 29),   -- low half of ROL64(b, 3)
        SHR(lo, 6), SHL(hi, 26)    -- low half of SHR64(b, 6)
     )
     local res_hi = XOR(
        SHR(hi, 19), SHL(lo, 13),  -- high half of ROR64(b, 19)
        SHL(hi, 3), SHR(lo, 29),   -- high half of ROL64(b, 3)
        SHR(hi, 6)                 -- high half of SHR64(b, 6): a plain shift, nothing comes in from above
     )
     return res_hi * int64(2^32) + uint32(int32(res_lo))
  end
  655.  
  local function XORROR64_3(e)
     -- Same result as XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)), computed on 32-bit halves.
     -- Used by sha512_feed_128 on the working variable "e" in every round.
     U[0].i64 = e
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local res_lo = XOR(
        SHR(lo, 14), SHL(hi, 18),  -- low half of ROR64(e, 14)
        SHR(lo, 18), SHL(hi, 14),  -- low half of ROR64(e, 18)
        SHL(lo, 23), SHR(hi, 9)    -- low half of ROL64(e, 23)
     )
     local res_hi = XOR(
        SHR(hi, 14), SHL(lo, 18),  -- high half of ROR64(e, 14)
        SHR(hi, 18), SHL(lo, 14),  -- high half of ROR64(e, 18)
        SHL(hi, 23), SHR(lo, 9)    -- high half of ROL64(e, 23)
     )
     return res_hi * int64(2^32) + uint32(int32(res_lo))
  end
  664.  
  local function XORROR64_6(a)
     -- Same result as XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)), computed on 32-bit halves.
     -- Used by sha512_feed_128 on the working variable "a" in every round.
     U[0].i64 = a
     local lo, hi = U[0].i32.lo, U[0].i32.hi
     local res_lo = XOR(
        SHR(lo, 28), SHL(hi, 4),   -- low half of ROR64(a, 28)
        SHL(lo, 30), SHR(hi, 2),   -- low half of ROL64(a, 30)
        SHL(lo, 25), SHR(hi, 7)    -- low half of ROL64(a, 25)
     )
     local res_hi = XOR(
        SHR(hi, 28), SHL(lo, 4),   -- high half of ROR64(a, 28)
        SHL(hi, 30), SHR(lo, 2),   -- high half of ROL64(a, 30)
        SHL(hi, 25), SHR(lo, 7)    -- high half of ROL64(a, 25)
     )
     return res_hi * int64(2^32) + uint32(int32(res_lo))
  end
  673.  
  local function XORROR64_4(e, f, g)
     -- Same result as XOR64(g, AND64(e, XOR64(f, g))), computed separately on each 32-bit half.
     -- Scratch slots: U[0] holds f, U[1] holds g, U[2] holds e.
     U[0].i64, U[1].i64, U[2].i64 = f, g, e
     local f_lo, g_lo, e_lo = U[0].i32.lo, U[1].i32.lo, U[2].i32.lo
     local f_hi, g_hi, e_hi = U[0].i32.hi, U[1].i32.hi, U[2].i32.hi
     local lo32 = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
     local hi32 = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
     return hi32 * int64(2^32) + uint32(int32(lo32))
  end
  686.  
  local function XORROR64_5(a, b, c)
     -- Same result as XOR64(AND64(XOR64(a, b), c), AND64(a, b)), computed separately on each 32-bit half.
     -- Scratch slots: U[0] holds a, U[1] holds b, U[2] holds c.
     U[0].i64, U[1].i64, U[2].i64 = a, b, c
     local x_lo, y_lo, z_lo = U[0].i32.lo, U[1].i32.lo, U[2].i32.lo
     local x_hi, y_hi, z_hi = U[0].i32.hi, U[1].i32.hi, U[2].i32.hi
     local lo32 = XOR(AND(XOR(x_lo, y_lo), z_lo), AND(x_lo, y_lo))
     local hi32 = XOR(AND(XOR(x_hi, y_hi), z_hi), AND(x_hi, y_hi))
     return hi32 * int64(2^32) + uint32(int32(lo32))
  end
  699.  
  local function XORROR64_7(a, b, m)
     -- Same result as ROR64(XOR64(a, b), m) for m = 1..31, computed on 32-bit halves.
     -- SHL(x, -m) is relied upon to act as a left shift by 32 - m
     -- (presumably SHL/SHR are the bit library shifts, which use only the low 5 bits of the count -- confirm).
     U[0].i64, U[1].i64 = a, b
     local x_lo = XOR(U[0].i32.lo, U[1].i32.lo)
     local x_hi = XOR(U[0].i32.hi, U[1].i32.hi)
     local rot_lo = XOR(SHR(x_lo, m), SHL(x_hi, -m))
     local rot_hi = XOR(SHR(x_hi, m), SHL(x_lo, -m))
     return rot_hi * int64(2^32) + uint32(int32(rot_lo))
  end
  711.  
  local function XORROR64_8(a, b)
     -- Same result as ROL64(XOR64(a, b), 1), computed on 32-bit halves.
     U[0].i64, U[1].i64 = a, b
     local x_lo = XOR(U[0].i32.lo, U[1].i32.lo)
     local x_hi = XOR(U[0].i32.hi, U[1].i32.hi)
     -- the top bit of each half moves into the bottom bit of the other half
     local rot_lo = XOR(SHL(x_lo, 1), SHR(x_hi, 31))
     local rot_hi = XOR(SHL(x_hi, 1), SHR(x_lo, 31))
     return rot_hi * int64(2^32) + uint32(int32(rot_lo))
  end
  723.  
  local function XORROR64_9(a, b)
     -- Same result as ROR64(XOR64(a, b), 32): rotating by 32 bits just swaps the two halves.
     U[0].i64, U[1].i64 = a, b
     local new_hi = XOR(U[0].i32.lo, U[1].i32.lo)   -- XOR of the low halves becomes the high half
     local new_lo = XOR(U[0].i32.hi, U[1].i32.hi)   -- XOR of the high halves becomes the low half
     return new_hi * int64(2^32) + uint32(int32(new_lo))
  end
  733.  
  local function XOR64(a, b)
     -- 64-bit XOR of a and b, computed as two independent 32-bit XORs.
     U[0].i64, U[1].i64 = a, b
     local lo32 = XOR(U[0].i32.lo, U[1].i32.lo)
     local hi32 = XOR(U[0].i32.hi, U[1].i32.hi)
     return hi32 * int64(2^32) + uint32(int32(lo32))
  end
  743.  
  local function XORROR64_11(a, b, c)
     -- 64-bit XOR of three values (XOR64(a, b, c)), computed as two independent 32-bit XORs.
     U[0].i64, U[1].i64, U[2].i64 = a, b, c
     local lo32 = XOR(U[0].i32.lo, U[1].i32.lo, U[2].i32.lo)
     local hi32 = XOR(U[0].i32.hi, U[1].i32.hi, U[2].i32.hi)
     return hi32 * int64(2^32) + uint32(int32(lo32))
  end
  755.  
  function XORA5(long, long2)
     -- Same result as XOR64(long, long2 or 0xA5A5A5A5A5A5A5A5), computed on 32-bit halves.
     U[0].i64 = long
     local mask_lo, mask_hi
     if long2 then
        -- explicit second operand: split it through the second scratch slot
        U[1].i64 = long2
        mask_lo, mask_hi = U[1].i32.lo, U[1].i32.hi
     else
        -- default operand: both halves of the mask are 0xA5A5A5A5
        mask_lo, mask_hi = 0xA5A5A5A5, 0xA5A5A5A5
     end
     local res_lo = XOR(U[0].i32.lo, mask_lo)
     local res_hi = XOR(U[0].i32.hi, mask_hi)
     return res_hi * int64(2^32) + uint32(int32(res_lo))
  end
  769.  
  function HEX64(long)
     -- Formats an int64 as HEX(high half) followed by HEX(low half).
     U[0].i64 = long
     local hi_hex = HEX(U[0].i32.hi)
     local lo_hex = HEX(U[0].i32.lo)
     return hi_hex..lo_hex
  end
  774.  
  775.  
  776.       -- SHA512 implementation for "LuaJIT 2.0 + FFI" branch
  777.  
  778.       function sha512_feed_128(H, _, str, offs, size)
                -- Feeds whole 128-byte blocks of str into the SHA-512 state (LuaJIT 2.0 + FFI branch).
                -- The 64-bit bitwise steps go through the local XORROR64_* helpers, which work on 32-bit halves.
                --   H     state words H[1..8]; read at the start of every block and updated in place at its end
                --   _     second parameter, not used by this branch
                --   str   string holding the message bytes
                --   offs  0-based byte offset of the first block (the first byte read is at index offs + 1)
                --   size  number of bytes to process
                -- Nothing is returned.
  779.          -- offs >= 0, size >= 0, size is multiple of 128
  780.          local W, K = common_W_FFI_int64, sha2_K_lo
  781.          for pos = offs, offs + size - 1, 128 do
                   -- W[0..15]: sixteen message words, each assembled big-endian from 8 bytes
                   -- (a..d become the high 32 bits, e..h the low 32 bits)
  782.             for j = 0, 15 do
  783.                pos = pos + 8
  784.                local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)   -- slow, but doesn't depend on endianness
  785.                W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32) + uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))
  786.             end
                   -- W[16..79]: every further word is derived from W[j-2], W[j-7], W[j-15] and W[j-16]
  787.             for j = 16, 79 do
  788.                W[j] = XORROR64_1(W[j-15]) + XORROR64_2(W[j-2]) + W[j-7] + W[j-16]
  789.             end
                   -- working variables a..h start from the current state
  790.             local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
                   -- 80 rounds, unrolled 8 per loop iteration; K is indexed from 1 (K[j+1]) while W is indexed from 0 (W[j])
  791.             for j = 0, 79, 8 do
  792.                local z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+1] + W[j]
  793.                h, g, f, e = g, f, e, z + d
  794.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  795.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+2] + W[j+1]
  796.                h, g, f, e = g, f, e, z + d
  797.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  798.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+3] + W[j+2]
  799.                h, g, f, e = g, f, e, z + d
  800.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  801.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+4] + W[j+3]
  802.                h, g, f, e = g, f, e, z + d
  803.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  804.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+5] + W[j+4]
  805.                h, g, f, e = g, f, e, z + d
  806.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  807.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+6] + W[j+5]
  808.                h, g, f, e = g, f, e, z + d
  809.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  810.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+7] + W[j+6]
  811.                h, g, f, e = g, f, e, z + d
  812.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  813.                z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+8] + W[j+7]
  814.                h, g, f, e = g, f, e, z + d
  815.                d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
  816.             end
                   -- fold the working variables back into the state
  817.             H[1] = a + H[1]
  818.             H[2] = b + H[2]
  819.             H[3] = c + H[3]
  820.             H[4] = d + H[4]
  821.             H[5] = e + H[5]
  822.             H[6] = f + H[6]
  823.             H[7] = g + H[7]
  824.             H[8] = h + H[8]
  825.          end
  826.       end
  827.  
  828.  
  829.       -- BLAKE2b implementation for "LuaJIT 2.0 + FFI" branch
  830.  
  831.       do
                -- v: 16-element int64 work vector shared by G and blake2b_feed_128
                -- W: message words of the current block (indexed 1..16 by the loader below)
  832.          local v = ffi.new("int64_t[?]", 16)
  833.          local W = common_W_blake2b
  834.  
                -- Mixing step: updates v[a], v[b], v[c], v[d] in place, injecting message words W[k1] and W[k2].
                -- The rotations applied after each XOR are, in order: ROR 32 (XORROR64_9), ROR 24 and ROR 16 (XORROR64_7), ROL 1 (XORROR64_8).
  835.          local function G(a, b, c, d, k1, k2)
  836.             local va, vb, vc, vd = v[a], v[b], v[c], v[d]
  837.             va = W[k1] + (va + vb)
  838.             vd = XORROR64_9(vd, va)
  839.             vc = vc + vd
  840.             vb = XORROR64_7(vb, vc, 24)
  841.             va = W[k2] + (va + vb)
  842.             vd = XORROR64_7(vd, va, 16)
  843.             vc = vc + vd
  844.             vb = XORROR64_8(vb, vc)
  845.             v[a], v[b], v[c], v[d] = va, vb, vc, vd
  846.          end
  847.  
                -- Compresses whole 128-byte blocks into the BLAKE2b state (LuaJIT 2.0 + FFI branch).
                --   H                 state words H[1..8]; read once on entry, written back once on exit
                --   _                 second parameter, not used by this branch
                --   str               message bytes; when falsy, loading is skipped and the current contents of W are compressed
                --   offs, size        0-based byte offset of the first block and number of bytes to process
                --   bytes_compressed  running byte counter; grows by (last_block_size or 128) per block
                --   last_block_size   when given, it is added to the counter instead of 128 and v[0xE] is inverted (flag f0)
                --   is_last_node      when truthy, v[0xF] is inverted (flag f1)
                -- Returns the updated bytes_compressed.
  848.          function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  849.             -- offs >= 0, size >= 0, size is multiple of 128
  850.             local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  851.             for pos = offs, offs + size - 1, 128 do
  852.                if str then
                         -- W[1..16]: each word is assembled little-endian from 8 bytes
                         -- (e..h become the high 32 bits, a..d the low 32 bits)
  853.                   for j = 1, 16 do
  854.                      pos = pos + 8
  855.                      local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos)
  856.                      W[j] = XOR64(OR(SHL(h, 24), SHL(g, 16), SHL(f, 8), e) * int64(2^32), uint32(int32(OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))))
  857.                   end
  858.                end
                      -- lower half of v = current state; upper half = constants from sha2_H_lo (v[0xC] is set separately below)
  859.                v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
  860.                v[0x8], v[0x9], v[0xA], v[0xB], v[0xD], v[0xE], v[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  861.                bytes_compressed = bytes_compressed + (last_block_size or 128)
  862.                v[0xC] = XOR64(sha2_H_lo[5], bytes_compressed)  -- t0 = low_8_bytes(bytes_compressed)
  863.                -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
                      -- "-1 - x" below is the bitwise NOT of x in two's-complement arithmetic
  864.                if last_block_size then  -- flag f0
  865.                   v[0xE] = -1 - v[0xE]
  866.                end
  867.                if is_last_node then  -- flag f1
  868.                   v[0xF] = -1 - v[0xF]
  869.                end
                      -- 12 rounds; sigma[j] supplies the 16 message-word indices of round j:
                      -- four G calls on the columns of v (viewed as a 4x4 matrix), then four on its diagonals
  870.                for j = 1, 12 do
  871.                   local row = sigma[j]
  872.                   G(0, 4,  8, 12, row[ 1], row[ 2])
  873.                   G(1, 5,  9, 13, row[ 3], row[ 4])
  874.                   G(2, 6, 10, 14, row[ 5], row[ 6])
  875.                   G(3, 7, 11, 15, row[ 7], row[ 8])
  876.                   G(0, 5, 10, 15, row[ 9], row[10])
  877.                   G(1, 6, 11, 12, row[11], row[12])
  878.                   G(2, 7,  8, 13, row[13], row[14])
  879.                   G(3, 4,  9, 14, row[15], row[16])
  880.                end
                      -- new state word = old state word XOR lower-half word XOR upper-half word of v
  881.                h1 = XORROR64_11(h1, v[0x0], v[0x8])
  882.                h2 = XORROR64_11(h2, v[0x1], v[0x9])
  883.                h3 = XORROR64_11(h3, v[0x2], v[0xA])
  884.                h4 = XORROR64_11(h4, v[0x3], v[0xB])
  885.                h5 = XORROR64_11(h5, v[0x4], v[0xC])
  886.                h6 = XORROR64_11(h6, v[0x5], v[0xD])
  887.                h7 = XORROR64_11(h7, v[0x6], v[0xE])
  888.                h8 = XORROR64_11(h8, v[0x7], v[0xF])
  889.             end
  890.             H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  891.             return bytes_compressed
  892.          end
  893.  
  894.       end
  895.  
  896.    end
  897.  
  898.  
  899.    -- MD5 implementation for "LuaJIT with FFI" branch
  900.  
  901.    function md5_feed_64(H, str, offs, size)
             -- Feeds whole 64-byte blocks of str into the MD5 state ("LuaJIT with FFI" branch).
             --   H     state words H[1..4]; read at the start of every block and updated in place at its end
             --   str   string holding the message bytes
             --   offs  0-based byte offset of the first block (the first byte read is at index offs + 1)
             --   size  number of bytes to process
             -- Nothing is returned.
             -- NOTE(review): NORM is defined outside this chunk; presumably it brings a sum back into the 32-bit range -- confirm.
  902.       -- offs >= 0, size >= 0, size is multiple of 64
  903.       local W, K = common_W_FFI_int32, md5_K
  904.       for pos = offs, offs + size - 1, 64 do
                -- W[0..15]: sixteen message words, each assembled little-endian from 4 bytes (a is the lowest byte)
  905.          for j = 0, 15 do
  906.             pos = pos + 4
  907.             local a, b, c, d = byte(str, pos - 3, pos)   -- slow, but doesn't depend on endianness
  908.             W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
  909.          end
  910.          local a, b, c, d = H[1], H[2], H[3], H[4]
                -- steps 0..15: message words are taken in order; K is indexed from 1 (K[j+1]) while W is indexed from 0
  911.          for j = 0, 15, 4 do
  912.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j  ] + a),  7) + b)
  913.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+1] + a), 12) + b)
  914.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+2] + a), 17) + b)
  915.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+4] + W[j+3] + a), 22) + b)
  916.          end
                -- steps 16..31: AND(..., 15) reduces the message-word index modulo 16
  917.          for j = 16, 31, 4 do
  918.             local g = 5*j
  919.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 1, 15)] + a),  5) + b)
  920.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 6, 15)] + a),  9) + b)
  921.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 5, 15)] + a), 14) + b)
  922.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+4] + W[AND(g    , 15)] + a), 20) + b)
  923.          end
                -- steps 32..47
  924.          for j = 32, 47, 4 do
  925.             local g = 3*j
  926.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 5, 15)] + a),  4) + b)
  927.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 8, 15)] + a), 11) + b)
  928.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 5, 15)] + a), 16) + b)
  929.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+4] + W[AND(g - 2, 15)] + a), 23) + b)
  930.          end
                -- steps 48..63
  931.          for j = 48, 63, 4 do
  932.             local g = 7*j
  933.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g    , 15)] + a),  6) + b)
  934.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15)] + a), 10) + b)
  935.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15)] + a), 15) + b)
  936.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+4] + W[AND(g + 5, 15)] + a), 21) + b)
  937.          end
                -- fold the working variables back into the state
  938.          H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
  939.       end
  940.    end
  941.  
  942.  
  943.    -- SHA-1 implementation for "LuaJIT with FFI" branch
  944.  
  945.    function sha1_feed_64(H, str, offs, size)
             -- Feeds whole 64-byte blocks of str into the SHA-1 state ("LuaJIT with FFI" branch).
             --   H     state words H[1..5]; read at the start of every block and updated in place at its end
             --   str   string holding the message bytes
             --   offs  0-based byte offset of the first block (the first byte read is at index offs + 1)
             --   size  number of bytes to process
             -- Nothing is returned.
             -- NOTE(review): NORM is defined outside this chunk; presumably it brings a sum back into the 32-bit range -- confirm.
  946.       -- offs >= 0, size >= 0, size is multiple of 64
  947.       local W = common_W_FFI_int32
  948.       for pos = offs, offs + size - 1, 64 do
                -- W[0..15]: sixteen message words, each assembled big-endian from 4 bytes (a is the highest byte)
  949.          for j = 0, 15 do
  950.             pos = pos + 4
  951.             local a, b, c, d = byte(str, pos - 3, pos)   -- slow, but doesn't depend on endianness
  952.             W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  953.          end
                -- W[16..79]: XOR of four earlier words, rotated left by 1
  954.          for j = 16, 79 do
  955.             W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
  956.          end
  957.          local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
                -- 80 steps in four stages of 20, unrolled 5 per loop iteration; each stage has its own additive constant
  958.          for j = 0, 19, 5 do
  959.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j]   + 0x5A827999 + e))          -- constant = floor(2^30 * sqrt(2))
  960.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e))
  961.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e))
  962.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e))
  963.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e))
  964.          end
  965.          for j = 20, 39, 5 do
  966.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j]   + 0x6ED9EBA1 + e))                       -- 2^30 * sqrt(3)
  967.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e))
  968.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e))
  969.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e))
  970.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e))
  971.          end
  972.          for j = 40, 59, 5 do
  973.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j]   + 0x8F1BBCDC + e))  -- 2^30 * sqrt(5)
  974.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e))
  975.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e))
  976.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e))
  977.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e))
  978.          end
  979.          for j = 60, 79, 5 do
  980.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j]   + 0xCA62C1D6 + e))                       -- 2^30 * sqrt(10)
  981.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e))
  982.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e))
  983.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e))
  984.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e))
  985.          end
                -- fold the working variables back into the state
  986.          H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5])
  987.       end
  988.    end
  989.  
  990. end
  991.  
  992.  
  993. if branch == "FFI" and not is_LuaJIT_21 or branch == "LJ" then
  994.  
  if branch == "FFI" then
     -- ctype of a variable-length int32_t array, resolved once and reused by every call
     local lanes_ctype = ffi.typeof("int32_t[?]")

     -- Returns a new int32_t array for the lanes.
     function create_array_of_lanes()
        -- 31 elements = 25 + 5 + 1 (due to 1-based indexing)
        return lanes_ctype(31)
     end
  end
  1003.  
  1004.  
  1005.    -- SHA-3 implementation for "LuaJIT 2.0 + FFI" and "LuaJIT without FFI" branches
  1006.  
  1007.    function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
  1008.       -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  1009.       local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
  1010.       local qwords_qty = SHR(block_size_in_bytes, 3)
  1011.       for pos = offs, offs + size - 1, block_size_in_bytes do
  1012.          for j = 1, qwords_qty do
  1013.             local a, b, c, d = byte(str, pos + 1, pos + 4)
  1014.             lanes_lo[j] = XOR(lanes_lo[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
  1015.             pos = pos + 8
  1016.             a, b, c, d = byte(str, pos - 3, pos)
  1017.             lanes_hi[j] = XOR(lanes_hi[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
  1018.          end
  1019.          for round_idx = 1, 24 do
  1020.             for j = 1, 5 do
  1021.                lanes_lo[25 + j] = XOR(lanes_lo[j], lanes_lo[j + 5], lanes_lo[j + 10], lanes_lo[j + 15], lanes_lo[j + 20])
  1022.             end
  1023.             for j = 1, 5 do
  1024.                lanes_hi[25 + j] = XOR(lanes_hi[j], lanes_hi[j + 5], lanes_hi[j + 10], lanes_hi[j + 15], lanes_hi[j + 20])
  1025.             end
  1026.             local D_lo = XOR(lanes_lo[26], SHL(lanes_lo[28], 1), SHR(lanes_hi[28], 31))
  1027.             local D_hi = XOR(lanes_hi[26], SHL(lanes_hi[28], 1), SHR(lanes_lo[28], 31))
  1028.             lanes_lo[2], lanes_hi[2], lanes_lo[7], lanes_hi[7], lanes_lo[12], lanes_hi[12], lanes_lo[17], lanes_hi[17] = XOR(SHR(XOR(D_lo, lanes_lo[7]), 20), SHL(XOR(D_hi, lanes_hi[7]), 12)), XOR(SHR(XOR(D_hi, lanes_hi[7]), 20), SHL(XOR(D_lo, lanes_lo[7]), 12)), XOR(SHR(XOR(D_lo, lanes_lo[17]), 19), SHL(XOR(D_hi, lanes_hi[17]), 13)), XOR(SHR(XOR(D_hi, lanes_hi[17]), 19), SHL(XOR(D_lo, lanes_lo[17]), 13)), XOR(SHL(XOR(D_lo, lanes_lo[2]), 1), SHR(XOR(D_hi, lanes_hi[2]), 31)), XOR(SHL(XOR(D_hi, lanes_hi[2]), 1), SHR(XOR(D_lo, lanes_lo[2]), 31)), XOR(SHL(XOR(D_lo, lanes_lo[12]), 10), SHR(XOR(D_hi, lanes_hi[12]), 22)), XOR(SHL(XOR(D_hi, lanes_hi[12]), 10), SHR(XOR(D_lo, lanes_lo[12]), 22))
  1029.             local L, H = XOR(D_lo, lanes_lo[22]), XOR(D_hi, lanes_hi[22])
  1030.             lanes_lo[22], lanes_hi[22] = XOR(SHL(L, 2), SHR(H, 30)), XOR(SHL(H, 2), SHR(L, 30))
  1031.             D_lo = XOR(lanes_lo[27], SHL(lanes_lo[29], 1), SHR(lanes_hi[29], 31))
  1032.             D_hi = XOR(lanes_hi[27], SHL(lanes_hi[29], 1), SHR(lanes_lo[29], 31))
  1033.             lanes_lo[3], lanes_hi[3], lanes_lo[8], lanes_hi[8], lanes_lo[13], lanes_hi[13], lanes_lo[23], lanes_hi[23] = XOR(SHR(XOR(D_lo, lanes_lo[13]), 21), SHL(XOR(D_hi, lanes_hi[13]), 11)), XOR(SHR(XOR(D_hi, lanes_hi[13]), 21), SHL(XOR(D_lo, lanes_lo[13]), 11)), XOR(SHR(XOR(D_lo, lanes_lo[23]), 3), SHL(XOR(D_hi, lanes_hi[23]), 29)), XOR(SHR(XOR(D_hi, lanes_hi[23]), 3), SHL(XOR(D_lo, lanes_lo[23]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[8]), 6), SHR(XOR(D_hi, lanes_hi[8]), 26)), XOR(SHL(XOR(D_hi, lanes_hi[8]), 6), SHR(XOR(D_lo, lanes_lo[8]), 26)), XOR(SHR(XOR(D_lo, lanes_lo[3]), 2), SHL(XOR(D_hi, lanes_hi[3]), 30)), XOR(SHR(XOR(D_hi, lanes_hi[3]), 2), SHL(XOR(D_lo, lanes_lo[3]), 30))
  1034.             L, H = XOR(D_lo, lanes_lo[18]), XOR(D_hi, lanes_hi[18])
  1035.             lanes_lo[18], lanes_hi[18] = XOR(SHL(L, 15), SHR(H, 17)), XOR(SHL(H, 15), SHR(L, 17))
  1036.             D_lo = XOR(lanes_lo[28], SHL(lanes_lo[30], 1), SHR(lanes_hi[30], 31))
  1037.             D_hi = XOR(lanes_hi[28], SHL(lanes_hi[30], 1), SHR(lanes_lo[30], 31))
  1038.             lanes_lo[4], lanes_hi[4], lanes_lo[9], lanes_hi[9], lanes_lo[19], lanes_hi[19], lanes_lo[24], lanes_hi[24] = XOR(SHL(XOR(D_lo, lanes_lo[19]), 21), SHR(XOR(D_hi, lanes_hi[19]), 11)), XOR(SHL(XOR(D_hi, lanes_hi[19]), 21), SHR(XOR(D_lo, lanes_lo[19]), 11)), XOR(SHL(XOR(D_lo, lanes_lo[4]), 28), SHR(XOR(D_hi, lanes_hi[4]), 4)), XOR(SHL(XOR(D_hi, lanes_hi[4]), 28), SHR(XOR(D_lo, lanes_lo[4]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[24]), 8), SHL(XOR(D_hi, lanes_hi[24]), 24)), XOR(SHR(XOR(D_hi, lanes_hi[24]), 8), SHL(XOR(D_lo, lanes_lo[24]), 24)), XOR(SHR(XOR(D_lo, lanes_lo[9]), 9), SHL(XOR(D_hi, lanes_hi[9]), 23)), XOR(SHR(XOR(D_hi, lanes_hi[9]), 9), SHL(XOR(D_lo, lanes_lo[9]), 23))
  1039.             L, H = XOR(D_lo, lanes_lo[14]), XOR(D_hi, lanes_hi[14])
  1040.             lanes_lo[14], lanes_hi[14] = XOR(SHL(L, 25), SHR(H, 7)), XOR(SHL(H, 25), SHR(L, 7))
  1041.             D_lo = XOR(lanes_lo[29], SHL(lanes_lo[26], 1), SHR(lanes_hi[26], 31))
  1042.             D_hi = XOR(lanes_hi[29], SHL(lanes_hi[26], 1), SHR(lanes_lo[26], 31))
  1043.             lanes_lo[5], lanes_hi[5], lanes_lo[15], lanes_hi[15], lanes_lo[20], lanes_hi[20], lanes_lo[25], lanes_hi[25] = XOR(SHL(XOR(D_lo, lanes_lo[25]), 14), SHR(XOR(D_hi, lanes_hi[25]), 18)), XOR(SHL(XOR(D_hi, lanes_hi[25]), 14), SHR(XOR(D_lo, lanes_lo[25]), 18)), XOR(SHL(XOR(D_lo, lanes_lo[20]), 8), SHR(XOR(D_hi, lanes_hi[20]), 24)), XOR(SHL(XOR(D_hi, lanes_hi[20]), 8), SHR(XOR(D_lo, lanes_lo[20]), 24)), XOR(SHL(XOR(D_lo, lanes_lo[5]), 27), SHR(XOR(D_hi, lanes_hi[5]), 5)), XOR(SHL(XOR(D_hi, lanes_hi[5]), 27), SHR(XOR(D_lo, lanes_lo[5]), 5)), XOR(SHR(XOR(D_lo, lanes_lo[15]), 25), SHL(XOR(D_hi, lanes_hi[15]), 7)), XOR(SHR(XOR(D_hi, lanes_hi[15]), 25), SHL(XOR(D_lo, lanes_lo[15]), 7))
  1044.             L, H = XOR(D_lo, lanes_lo[10]), XOR(D_hi, lanes_hi[10])
  1045.             lanes_lo[10], lanes_hi[10] = XOR(SHL(L, 20), SHR(H, 12)), XOR(SHL(H, 20), SHR(L, 12))
  1046.             D_lo = XOR(lanes_lo[30], SHL(lanes_lo[27], 1), SHR(lanes_hi[27], 31))
  1047.             D_hi = XOR(lanes_hi[30], SHL(lanes_hi[27], 1), SHR(lanes_lo[27], 31))
  1048.             lanes_lo[6], lanes_hi[6], lanes_lo[11], lanes_hi[11], lanes_lo[16], lanes_hi[16], lanes_lo[21], lanes_hi[21] = XOR(SHL(XOR(D_lo, lanes_lo[11]), 3), SHR(XOR(D_hi, lanes_hi[11]), 29)), XOR(SHL(XOR(D_hi, lanes_hi[11]), 3), SHR(XOR(D_lo, lanes_lo[11]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[21]), 18), SHR(XOR(D_hi, lanes_hi[21]), 14)), XOR(SHL(XOR(D_hi, lanes_hi[21]), 18), SHR(XOR(D_lo, lanes_lo[21]), 14)), XOR(SHR(XOR(D_lo, lanes_lo[6]), 28), SHL(XOR(D_hi, lanes_hi[6]), 4)), XOR(SHR(XOR(D_hi, lanes_hi[6]), 28), SHL(XOR(D_lo, lanes_lo[6]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[16]), 23), SHL(XOR(D_hi, lanes_hi[16]), 9)), XOR(SHR(XOR(D_hi, lanes_hi[16]), 23), SHL(XOR(D_lo, lanes_lo[16]), 9))
  1049.             lanes_lo[1], lanes_hi[1] = XOR(D_lo, lanes_lo[1]), XOR(D_hi, lanes_hi[1])
  1050.             lanes_lo[1], lanes_lo[2], lanes_lo[3], lanes_lo[4], lanes_lo[5] = XOR(lanes_lo[1], AND(NOT(lanes_lo[2]), lanes_lo[3]), RC_lo[round_idx]), XOR(lanes_lo[2], AND(NOT(lanes_lo[3]), lanes_lo[4])), XOR(lanes_lo[3], AND(NOT(lanes_lo[4]), lanes_lo[5])), XOR(lanes_lo[4], AND(NOT(lanes_lo[5]), lanes_lo[1])), XOR(lanes_lo[5], AND(NOT(lanes_lo[1]), lanes_lo[2]))
  1051.             lanes_lo[6], lanes_lo[7], lanes_lo[8], lanes_lo[9], lanes_lo[10] = XOR(lanes_lo[9], AND(NOT(lanes_lo[10]), lanes_lo[6])), XOR(lanes_lo[10], AND(NOT(lanes_lo[6]), lanes_lo[7])), XOR(lanes_lo[6], AND(NOT(lanes_lo[7]), lanes_lo[8])), XOR(lanes_lo[7], AND(NOT(lanes_lo[8]), lanes_lo[9])), XOR(lanes_lo[8], AND(NOT(lanes_lo[9]), lanes_lo[10]))
  1052.             lanes_lo[11], lanes_lo[12], lanes_lo[13], lanes_lo[14], lanes_lo[15] = XOR(lanes_lo[12], AND(NOT(lanes_lo[13]), lanes_lo[14])), XOR(lanes_lo[13], AND(NOT(lanes_lo[14]), lanes_lo[15])), XOR(lanes_lo[14], AND(NOT(lanes_lo[15]), lanes_lo[11])), XOR(lanes_lo[15], AND(NOT(lanes_lo[11]), lanes_lo[12])), XOR(lanes_lo[11], AND(NOT(lanes_lo[12]), lanes_lo[13]))
  1053.             lanes_lo[16], lanes_lo[17], lanes_lo[18], lanes_lo[19], lanes_lo[20] = XOR(lanes_lo[20], AND(NOT(lanes_lo[16]), lanes_lo[17])), XOR(lanes_lo[16], AND(NOT(lanes_lo[17]), lanes_lo[18])), XOR(lanes_lo[17], AND(NOT(lanes_lo[18]), lanes_lo[19])), XOR(lanes_lo[18], AND(NOT(lanes_lo[19]), lanes_lo[20])), XOR(lanes_lo[19], AND(NOT(lanes_lo[20]), lanes_lo[16]))
  1054.             lanes_lo[21], lanes_lo[22], lanes_lo[23], lanes_lo[24], lanes_lo[25] = XOR(lanes_lo[23], AND(NOT(lanes_lo[24]), lanes_lo[25])), XOR(lanes_lo[24], AND(NOT(lanes_lo[25]), lanes_lo[21])), XOR(lanes_lo[25], AND(NOT(lanes_lo[21]), lanes_lo[22])), XOR(lanes_lo[21], AND(NOT(lanes_lo[22]), lanes_lo[23])), XOR(lanes_lo[22], AND(NOT(lanes_lo[23]), lanes_lo[24]))
  1055.             lanes_hi[1], lanes_hi[2], lanes_hi[3], lanes_hi[4], lanes_hi[5] = XOR(lanes_hi[1], AND(NOT(lanes_hi[2]), lanes_hi[3]), RC_hi[round_idx]), XOR(lanes_hi[2], AND(NOT(lanes_hi[3]), lanes_hi[4])), XOR(lanes_hi[3], AND(NOT(lanes_hi[4]), lanes_hi[5])), XOR(lanes_hi[4], AND(NOT(lanes_hi[5]), lanes_hi[1])), XOR(lanes_hi[5], AND(NOT(lanes_hi[1]), lanes_hi[2]))
  1056.             lanes_hi[6], lanes_hi[7], lanes_hi[8], lanes_hi[9], lanes_hi[10] = XOR(lanes_hi[9], AND(NOT(lanes_hi[10]), lanes_hi[6])), XOR(lanes_hi[10], AND(NOT(lanes_hi[6]), lanes_hi[7])), XOR(lanes_hi[6], AND(NOT(lanes_hi[7]), lanes_hi[8])), XOR(lanes_hi[7], AND(NOT(lanes_hi[8]), lanes_hi[9])), XOR(lanes_hi[8], AND(NOT(lanes_hi[9]), lanes_hi[10]))
  1057.             lanes_hi[11], lanes_hi[12], lanes_hi[13], lanes_hi[14], lanes_hi[15] = XOR(lanes_hi[12], AND(NOT(lanes_hi[13]), lanes_hi[14])), XOR(lanes_hi[13], AND(NOT(lanes_hi[14]), lanes_hi[15])), XOR(lanes_hi[14], AND(NOT(lanes_hi[15]), lanes_hi[11])), XOR(lanes_hi[15], AND(NOT(lanes_hi[11]), lanes_hi[12])), XOR(lanes_hi[11], AND(NOT(lanes_hi[12]), lanes_hi[13]))
  1058.             lanes_hi[16], lanes_hi[17], lanes_hi[18], lanes_hi[19], lanes_hi[20] = XOR(lanes_hi[20], AND(NOT(lanes_hi[16]), lanes_hi[17])), XOR(lanes_hi[16], AND(NOT(lanes_hi[17]), lanes_hi[18])), XOR(lanes_hi[17], AND(NOT(lanes_hi[18]), lanes_hi[19])), XOR(lanes_hi[18], AND(NOT(lanes_hi[19]), lanes_hi[20])), XOR(lanes_hi[19], AND(NOT(lanes_hi[20]), lanes_hi[16]))
  1059.             lanes_hi[21], lanes_hi[22], lanes_hi[23], lanes_hi[24], lanes_hi[25] = XOR(lanes_hi[23], AND(NOT(lanes_hi[24]), lanes_hi[25])), XOR(lanes_hi[24], AND(NOT(lanes_hi[25]), lanes_hi[21])), XOR(lanes_hi[25], AND(NOT(lanes_hi[21]), lanes_hi[22])), XOR(lanes_hi[21], AND(NOT(lanes_hi[22]), lanes_hi[23])), XOR(lanes_hi[22], AND(NOT(lanes_hi[23]), lanes_hi[24]))
  1060.          end
  1061.       end
  1062.    end
  1063.  
  1064. end
  1065.  
  1066.  
  1067. if branch == "LJ" then
  1068.  
  1069.  
  1070.    -- SHA256 implementation for "LuaJIT without FFI" branch
  1071.  
  1072.    function sha256_feed_64(H, str, offs, size)
              -- Compresses `size` bytes of `str`, starting right after its first `offs` bytes, into the
              -- eight-word state H[1..8].  H is updated in place; nothing is returned.
              --    H    : table with the current chaining words in H[1]..H[8]
              --    str  : string holding the message bytes
              --    offs : number of leading bytes of `str` to skip
              --    size : number of bytes to process
              -- Each 64-byte block is expanded to 64 words in the shared buffer common_W and mixed with
              -- the per-round constants K[1..64] read from sha2_K_hi.
              -- NORM, XOR, AND, OR, ROR, ROL, SHL, SHR and byte are defined elsewhere in the file;
              -- the notes below assume they are 32-bit bitwise helpers and string.byte.
  1073.       -- offs >= 0, size >= 0, size is multiple of 64
  1074.       local W, K = common_W, sha2_K_hi
  1075.       for pos = offs, offs + size - 1, 64 do
                 -- Load the block as 16 big-endian 32-bit words.
  1076.          for j = 1, 16 do
                    -- pos is advanced inside the body; this relies on the numeric "for" keeping its own
                    -- hidden counter, so the outer loop still steps by 64 bytes per block
  1077.             pos = pos + 4
  1078.             local a, b, c, d = byte(str, pos - 3, pos)
  1079.             W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  1080.          end
                 -- Message schedule: W[j] = s0(W[j-15]) + s1(W[j-2]) + W[j-7] + W[j-16]
                 --    s0(x) = rotr 7  xor rotr 18 xor shr 3    (ROL by 14 is the same as rotr 18)
                 --    s1(x) = rotr 17 xor rotr 19 xor shr 10   (ROL by 15 / 13 is rotr 17 / 19)
  1081.          for j = 17, 64 do
  1082.             local a, b = W[j-15], W[j-2]
  1083.             W[j] = NORM( NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) ) + NORM( W[j-7] + W[j-16] ) )
  1084.          end
                 -- Working variables a..h start from the current state.
  1085.          local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
                 -- Each group of three statements below is one round:
                 --    z = (rotr 6 xor rotr 11 xor rotr 25 of e) + (g xor (e and (f xor g))) + K + W + h
                 --    h, g, f, e = g, f, e, d + z
                 --    d, c, b, a = c, b, a, majority(a, b, c) + (rotr 2 xor rotr 13 xor rotr 22 of a) + z
                 -- Eight rounds are written out per loop iteration (indices j .. j+7).
  1086.          for j = 1, 64, 8 do  -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
  1087.             local z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j] + W[j] + h) )
  1088.             h, g, f, e = g, f, e, NORM(d + z)
  1089.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1090.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+1] + W[j+1] + h) )
  1091.             h, g, f, e = g, f, e, NORM(d + z)
  1092.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1093.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+2] + W[j+2] + h) )
  1094.             h, g, f, e = g, f, e, NORM(d + z)
  1095.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1096.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+3] + W[j+3] + h) )
  1097.             h, g, f, e = g, f, e, NORM(d + z)
  1098.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1099.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+4] + W[j+4] + h) )
  1100.             h, g, f, e = g, f, e, NORM(d + z)
  1101.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1102.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+5] + W[j+5] + h) )
  1103.             h, g, f, e = g, f, e, NORM(d + z)
  1104.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1105.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+6] + W[j+6] + h) )
  1106.             h, g, f, e = g, f, e, NORM(d + z)
  1107.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1108.             z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+7] + W[j+7] + h) )
  1109.             h, g, f, e = g, f, e, NORM(d + z)
  1110.             d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
  1111.          end
                 -- Add the working variables back into the state.
  1112.          H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
  1113.          H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
  1114.       end
  1115.    end
  1116.  
  -- Adds four 64-bit values, each given as a (low 32 bits, high 32 bits) pair, and
  -- returns the sum as two results: the low half first, then the high half.
  -- Every low half is reduced with "% 2^32" before summing, so the carry into the
  -- high half is simply the number of whole 2^32 units in the low-half total.
  -- NORM and floor come from the enclosing file; NORM is applied to both halves.
  local function ADD64_4(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi)
     local low_total = a_lo % 2^32 + b_lo % 2^32 + c_lo % 2^32 + d_lo % 2^32
     local high_total = a_hi + b_hi + c_hi + d_hi
     local carry = floor(low_total / 2^32)
     return NORM(low_total), NORM(high_total + carry)
  end
  1124.  
  1125.    if LuaJIT_arch == "x86" then  -- Special trick is required to avoid "PHI shuffling too complex" on x86 platform
  1126.  
  1127.  
  1128.       -- SHA512 implementation for "LuaJIT x86 without FFI" branch
  1129.  
  1130.       function sha512_feed_128(H_lo, H_hi, str, offs, size)
                 -- Variant selected when LuaJIT_arch == "x86".
                 -- Compresses `size` bytes of `str`, starting right after its first `offs` bytes, into a
                 -- state of eight 64-bit words stored as 32-bit halves: H_lo[i] is the low half and
                 -- H_hi[i] the high half of word i.  Both tables are updated in place; nothing is returned.
                 -- The code is the same as in the non-x86 variant except for the `zero` / OR(zero, x)
                 -- construction used when the working variables are shifted.
  1131.          -- offs >= 0, size >= 0, size is multiple of 128
  1132.          -- W1_hi, W1_lo, W2_hi, W2_lo, ...   Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  1133.          local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
  1134.          for pos = offs, offs + size - 1, 128 do
                    -- Load the block as 32 big-endian 32-bit words (16 message words, high half first).
  1135.             for j = 1, 16*2 do
                       -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1136.                pos = pos + 4
  1137.                local a, b, c, d = byte(str, pos - 3, pos)
  1138.                W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  1139.             end
                    -- Message schedule for words 17..80; jj is the index of the low half of word jj/2.
                    --    t = word[k-15] rotr 1  xor rotr 8  xor shr 7
                    --    u = word[k-2]  rotr 19 xor rotr 61 xor shr 6
                    --    word[k] = t + u + word[k-7] + word[k-16]
  1140.             for jj = 17*2, 80*2, 2 do
  1141.                local a_lo, a_hi = W[jj-30], W[jj-31]
  1142.                local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
  1143.                local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
  1144.                local b_lo, b_hi = W[jj-4], W[jj-5]
  1145.                local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
  1146.                local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
  1147.                W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
  1148.             end
                    -- Working variables a..h (as lo/hi halves) start from the current state.
  1149.             local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  1150.             local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
                    -- zero stays 0 for the whole loop (0 + 0); it only feeds the OR(zero, x) copies below
  1151.             local zero = 0
  1152.             for j = 1, 80 do
                       -- t = g xor (e and (f xor g));  u = e rotr 14 xor rotr 18 xor rotr 41
  1153.                local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
  1154.                local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
  1155.                local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
  1156.                local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
                       -- z = u + t + h + K[j] + W[j]; the carry out of the low halves is floor(sum_lo / 2^32)
  1157.                local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
  1158.                local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
  1159.                zero = zero + zero  -- this trick is needed to avoid "PHI shuffling too complex" due to PHIs overlap
                       -- OR(zero, x) has the same bits as x; presumably it turns the plain copies into computed values
  1160.                h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = OR(zero, g_lo), OR(zero, g_hi), OR(zero, f_lo), OR(zero, f_hi), OR(zero, e_lo), OR(zero, e_hi)
                       -- e = d + z
  1161.                local sum_lo = z_lo % 2^32 + d_lo % 2^32
  1162.                e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
  1163.                d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = OR(zero, c_lo), OR(zero, c_hi), OR(zero, b_lo), OR(zero, b_hi), OR(zero, a_lo), OR(zero, a_hi)
                       -- b, c, d now hold the previous a, b, c:
                       -- u = b rotr 28 xor rotr 34 xor rotr 39;  t = majority(b, c, d);  a = z + t + u
  1164.                u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
  1165.                u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
  1166.                t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
  1167.                t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
  1168.                local sum_lo = z_lo % 2^32 + t_lo % 2^32 + u_lo % 2^32
  1169.                a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + t_hi + u_hi + floor(sum_lo / 2^32) )
  1170.             end
                    -- Add the working variables back into the state (ADD64_4 with two zero operands).
  1171.             H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
  1172.             H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
  1173.             H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
  1174.             H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
  1175.             H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
  1176.             H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
  1177.             H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
  1178.             H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
  1179.          end
  1180.       end
  1181.  
  1182.    else  -- all platforms except x86
  1183.  
  1184.  
  1185.       -- SHA512 implementation for "LuaJIT non-x86 without FFI" branch
  1186.  
  1187.       function sha512_feed_128(H_lo, H_hi, str, offs, size)
                 -- Variant selected when LuaJIT_arch is anything other than "x86".
                 -- Compresses `size` bytes of `str`, starting right after its first `offs` bytes, into a
                 -- state of eight 64-bit words stored as 32-bit halves: H_lo[i] is the low half and
                 -- H_hi[i] the high half of word i.  Both tables are updated in place; nothing is returned.
  1188.          -- offs >= 0, size >= 0, size is multiple of 128
  1189.          -- W1_hi, W1_lo, W2_hi, W2_lo, ...   Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  1190.          local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
  1191.          for pos = offs, offs + size - 1, 128 do
                    -- Load the block as 32 big-endian 32-bit words (16 message words, high half first).
  1192.             for j = 1, 16*2 do
                       -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1193.                pos = pos + 4
  1194.                local a, b, c, d = byte(str, pos - 3, pos)
  1195.                W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  1196.             end
                    -- Message schedule for words 17..80; jj is the index of the low half of word jj/2.
                    --    t = word[k-15] rotr 1  xor rotr 8  xor shr 7
                    --    u = word[k-2]  rotr 19 xor rotr 61 xor shr 6
                    --    word[k] = t + u + word[k-7] + word[k-16]
  1197.             for jj = 17*2, 80*2, 2 do
  1198.                local a_lo, a_hi = W[jj-30], W[jj-31]
  1199.                local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
  1200.                local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
  1201.                local b_lo, b_hi = W[jj-4], W[jj-5]
  1202.                local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
  1203.                local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
  1204.                W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
  1205.             end
                    -- Working variables a..h (as lo/hi halves) start from the current state.
  1206.             local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  1207.             local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  1208.             for j = 1, 80 do
                       -- t = g xor (e and (f xor g));  u = e rotr 14 xor rotr 18 xor rotr 41
  1209.                local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
  1210.                local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
  1211.                local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
  1212.                local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
                       -- z = u + t + h + K[j] + W[j]; the carry out of the low halves is floor(sum_lo / 2^32)
  1213.                local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
  1214.                local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
  1215.                h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = g_lo, g_hi, f_lo, f_hi, e_lo, e_hi
                       -- e = d + z
  1216.                local sum_lo = z_lo % 2^32 + d_lo % 2^32
  1217.                e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
  1218.                d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = c_lo, c_hi, b_lo, b_hi, a_lo, a_hi
                       -- b, c, d now hold the previous a, b, c:
                       -- u = b rotr 28 xor rotr 34 xor rotr 39;  t = majority(b, c, d);  a = z + u + t
  1219.                u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
  1220.                u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
  1221.                t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
  1222.                t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
  1223.                local sum_lo = z_lo % 2^32 + u_lo % 2^32 + t_lo % 2^32
  1224.                a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + u_hi + t_hi + floor(sum_lo / 2^32) )
  1225.             end
                    -- Add the working variables back into the state (ADD64_4 with two zero operands).
  1226.             H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
  1227.             H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
  1228.             H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
  1229.             H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
  1230.             H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
  1231.             H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
  1232.             H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
  1233.             H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
  1234.          end
  1235.       end
  1236.  
  1237.    end
  1238.  
  1239.  
  1240.    -- MD5 implementation for "LuaJIT without FFI" branch
  1241.  
  1242.    function md5_feed_64(H, str, offs, size)
              -- Compresses `size` bytes of `str`, starting right after its first `offs` bytes, into the
              -- four-word state H[1..4].  H is updated in place; nothing is returned.
              -- Each 64-byte block is read as 16 little-endian 32-bit words into the shared buffer
              -- common_W; the per-step constants K[1..64] come from md5_K.
  1243.       -- offs >= 0, size >= 0, size is multiple of 64
  1244.       local W, K = common_W, md5_K
  1245.       for pos = offs, offs + size - 1, 64 do
  1246.          for j = 1, 16 do
                    -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1247.             pos = pos + 4
  1248.             local a, b, c, d = byte(str, pos - 3, pos)
                    -- least significant byte first
  1249.             W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
  1250.          end
  1251.          local a, b, c, d = H[1], H[2], H[3], H[4]
                 -- Every statement below is one step: the registers rotate (a, d, c <- d, c, b) and the
                 -- new b is  b + ROL(f(b, c, d) + a + K + W[index], shift).  Four steps per loop pass.
                 -- Steps 1..16: f = d xor (b and (c xor d)); message words are used in order.
  1252.          for j = 1, 16, 4 do
  1253.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j  ] + W[j  ] + a),  7) + b)
  1254.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j+1] + a), 12) + b)
  1255.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+2] + a), 17) + b)
  1256.             a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+3] + a), 22) + b)
  1257.          end
                 -- Steps 17..32: f = c xor (d and (b xor c)); the word index advances by 5 per step.
                 -- AND(x, 15) + 1 reduces the index modulo 16 and converts it to a 1-based slot of W.
  1258.          for j = 17, 32, 4 do
  1259.             local g = 5*j-4
  1260.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j  ] + W[AND(g     , 15) + 1] + a),  5) + b)
  1261.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g +  5, 15) + 1] + a),  9) + b)
  1262.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 10, 15) + 1] + a), 14) + b)
  1263.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g -  1, 15) + 1] + a), 20) + b)
  1264.          end
                 -- Steps 33..48: f = b xor c xor d; the word index advances by 3 per step (g - 7 equals g + 9 modulo 16).
  1265.          for j = 33, 48, 4 do
  1266.             local g = 3*j+2
  1267.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j  ] + W[AND(g    , 15) + 1] + a),  4) + b)
  1268.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 3, 15) + 1] + a), 11) + b)
  1269.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 6, 15) + 1] + a), 16) + b)
  1270.             a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 7, 15) + 1] + a), 23) + b)
  1271.          end
                 -- Steps 49..64: f = c xor (b or not d); the word index advances by 7 per step (g - 2 equals g + 14 modulo 16).
  1272.          for j = 49, 64, 4 do
  1273.             local g = j*7
  1274.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j  ] + W[AND(g - 7, 15) + 1] + a),  6) + b)
  1275.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g    , 15) + 1] + a), 10) + b)
  1276.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15) + 1] + a), 15) + b)
  1277.             a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15) + 1] + a), 21) + b)
  1278.          end
                 -- Add the working registers back into the state.
  1279.          H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
  1280.       end
  1281.    end
  1282.  
  1283.  
  1284.    -- SHA-1 implementation for "LuaJIT without FFI" branch
  1285.  
  1286.    function sha1_feed_64(H, str, offs, size)
              -- Compresses `size` bytes of `str`, starting right after its first `offs` bytes, into the
              -- five-word state H[1..5].  H is updated in place; nothing is returned.
              -- Each 64-byte block is read as 16 big-endian 32-bit words and expanded to 80 words in
              -- the shared buffer common_W.
  1287.       -- offs >= 0, size >= 0, size is multiple of 64
  1288.       local W = common_W
  1289.       for pos = offs, offs + size - 1, 64 do
  1290.          for j = 1, 16 do
                    -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1291.             pos = pos + 4
  1292.             local a, b, c, d = byte(str, pos - 3, pos)
  1293.             W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
  1294.          end
                 -- Expansion: each further word is the xor of four earlier words, rotated left by one bit.
  1295.          for j = 17, 80 do
  1296.             W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
  1297.          end
  1298.          local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
                 -- Every statement below is one round: the registers shift (e, d, b <- d, c, a),
                 -- c becomes ROR(b, 2) and the new a is  ROL(a, 5) + f(b, c, d) + W + constant + e.
                 -- Five rounds per loop pass.
                 -- Rounds 1..20: f = d xor (b and (d xor c))
  1299.          for j = 1, 20, 5 do
  1300.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j]   + 0x5A827999 + e))          -- constant = floor(2^30 * sqrt(2))
  1301.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e))
  1302.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e))
  1303.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e))
  1304.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e))
  1305.          end
                 -- Rounds 21..40: f = b xor c xor d
  1306.          for j = 21, 40, 5 do
  1307.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j]   + 0x6ED9EBA1 + e))                       -- 2^30 * sqrt(3)
  1308.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e))
  1309.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e))
  1310.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e))
  1311.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e))
  1312.          end
                 -- Rounds 41..60: f = (d and (b xor c)) xor (b and c), i.e. the bitwise majority of b, c, d
  1313.          for j = 41, 60, 5 do
  1314.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j]   + 0x8F1BBCDC + e))  -- 2^30 * sqrt(5)
  1315.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e))
  1316.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e))
  1317.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e))
  1318.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e))
  1319.          end
                 -- Rounds 61..80: f = b xor c xor d again, with the last constant
  1320.          for j = 61, 80, 5 do
  1321.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j]   + 0xCA62C1D6 + e))                       -- 2^30 * sqrt(10)
  1322.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e))
  1323.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e))
  1324.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e))
  1325.             e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e))
  1326.          end
                 -- Add the working registers back into the state.
  1327.          H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5])
  1328.       end
  1329.    end
  1330.  
  1331.  
  1332.    -- BLAKE2b implementation for "LuaJIT without FFI" branch
  1333.  
  1334.    do
  -- Working vector of the BLAKE2b compression, kept as two parallel tables:
  -- v_lo[i] is the low 32 bits and v_hi[i] the high 32 bits of 64-bit entry i.
  -- blake2b_feed_128 fills entries 0..15 and G mixes them in place.
  local v_lo, v_hi = {}, {}

  -- Mixing function G of BLAKE2b, operating on entries a, b, c, d of the working
  -- vector and on message words k1 and k2 (1-based word numbers).
  -- Message word k is read from common_W as low half common_W[2*k-1] and
  -- high half common_W[2*k].  64-bit additions are done on the halves, with
  -- floor(sum / 2^32) of the low-half sum carried into the high half.
  -- The four entries are written back to v_lo / v_hi; nothing is returned.
  local function G(a, b, c, d, k1, k2)
     local msg = common_W
     local a_lo, a_hi = v_lo[a], v_hi[a]
     local b_lo, b_hi = v_lo[b], v_hi[b]
     local c_lo, c_hi = v_lo[c], v_hi[c]
     local d_lo, d_hi = v_lo[d], v_hi[d]
     local low_sum

     -- a = a + b + message word k1
     low_sum = msg[2*k1-1] + (a_lo % 2^32 + b_lo % 2^32)
     a_lo = NORM(low_sum)
     a_hi = NORM(msg[2*k1] + (a_hi + b_hi + floor(low_sum / 2^32)))

     -- d = (d xor a) rotated by 32 bits: the two halves trade places
     d_lo, d_hi = XOR(d_hi, a_hi), XOR(d_lo, a_lo)

     -- c = c + d
     low_sum = c_lo % 2^32 + d_lo % 2^32
     c_lo = NORM(low_sum)
     c_hi = NORM(c_hi + d_hi + floor(low_sum / 2^32))

     -- b = (b xor c) rotated right by 24 bits
     b_lo = XOR(b_lo, c_lo)
     b_hi = XOR(b_hi, c_hi)
     b_lo, b_hi = XOR(SHR(b_lo, 24), SHL(b_hi, 8)), XOR(SHR(b_hi, 24), SHL(b_lo, 8))

     -- a = a + b + message word k2
     low_sum = msg[2*k2-1] + (a_lo % 2^32 + b_lo % 2^32)
     a_lo = NORM(low_sum)
     a_hi = NORM(msg[2*k2] + (a_hi + b_hi + floor(low_sum / 2^32)))

     -- d = (d xor a) rotated right by 16 bits
     d_lo = XOR(d_lo, a_lo)
     d_hi = XOR(d_hi, a_hi)
     d_lo, d_hi = XOR(SHR(d_lo, 16), SHL(d_hi, 16)), XOR(SHR(d_hi, 16), SHL(d_lo, 16))

     -- c = c + d
     low_sum = c_lo % 2^32 + d_lo % 2^32
     c_lo = NORM(low_sum)
     c_hi = NORM(c_hi + d_hi + floor(low_sum / 2^32))

     -- b = (b xor c) rotated left by 1 bit (right by 63)
     b_lo = XOR(b_lo, c_lo)
     b_hi = XOR(b_hi, c_hi)
     b_lo, b_hi = XOR(SHL(b_lo, 1), SHR(b_hi, 31)), XOR(SHL(b_hi, 1), SHR(b_lo, 31))

     -- store the mixed entries (a, b, c, d are distinct at every call site)
     v_lo[a], v_hi[a] = a_lo, a_hi
     v_lo[b], v_hi[b] = b_lo, b_hi
     v_lo[c], v_hi[c] = c_lo, c_hi
     v_lo[d], v_hi[d] = d_lo, d_hi
  end
  1363.  
  1364.       function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- Runs the BLAKE2b compression over `size` bytes, one 128-byte block at a time.
                 --    H_lo, H_hi       : low / high 32-bit halves of the eight 64-bit state words; updated in place
                 --    str              : message string; when it is nil/false no bytes are loaded and the
                 --                       current contents of common_W are compressed instead
                 --    offs             : number of leading bytes of `str` to skip
                 --    size             : number of bytes to process
                 --    bytes_compressed : byte counter before this call
                 --    last_block_size  : when given, it is added to the counter instead of 128 and flag f0 is set
                 --    is_last_node     : when true, flag f1 is set
                 -- Returns the updated byte counter.
  1365.          -- offs >= 0, size >= 0, size is multiple of 128
  1366.          local W = common_W
                 -- The state is kept in locals for the duration of the loop and written back at the end.
  1367.          local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  1368.          local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  1369.          for pos = offs, offs + size - 1, 128 do
  1370.             if str then
                       -- Load 32 little-endian 32-bit words (16 message words, low half first).
  1371.                for j = 1, 32 do
                          -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1372.                   pos = pos + 4
  1373.                   local a, b, c, d = byte(str, pos - 3, pos)
                          -- the top byte is added arithmetically, so W[j] stays in the range 0 .. 2^32-1
  1374.                   W[j] = d * 2^24 + OR(SHL(c, 16), SHL(b, 8), a)
  1375.                end
  1376.             end
                    -- Working vector: entries 0..7 are the state, entries 8..15 come from sha2_H_lo / sha2_H_hi.
  1377.             v_lo[0x0], v_lo[0x1], v_lo[0x2], v_lo[0x3], v_lo[0x4], v_lo[0x5], v_lo[0x6], v_lo[0x7] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  1378.             v_lo[0x8], v_lo[0x9], v_lo[0xA], v_lo[0xB], v_lo[0xC], v_lo[0xD], v_lo[0xE], v_lo[0xF] = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  1379.             v_hi[0x0], v_hi[0x1], v_hi[0x2], v_hi[0x3], v_hi[0x4], v_hi[0x5], v_hi[0x6], v_hi[0x7] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  1380.             v_hi[0x8], v_hi[0x9], v_hi[0xA], v_hi[0xB], v_hi[0xC], v_hi[0xD], v_hi[0xE], v_hi[0xF] = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
                    -- Advance the byte counter and xor it into entry 0xC.
  1381.             bytes_compressed = bytes_compressed + (last_block_size or 128)
  1382.             local t0_lo = bytes_compressed % 2^32
  1383.             local t0_hi = floor(bytes_compressed / 2^32)
  1384.             v_lo[0xC] = XOR(v_lo[0xC], t0_lo)  -- t0 = low_8_bytes(bytes_compressed)
  1385.             v_hi[0xC] = XOR(v_hi[0xC], t0_hi)
  1386.             -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
  1387.             if last_block_size then  -- flag f0
  1388.                v_lo[0xE] = NOT(v_lo[0xE])
  1389.                v_hi[0xE] = NOT(v_hi[0xE])
  1390.             end
  1391.             if is_last_node then  -- flag f1
  1392.                v_lo[0xF] = NOT(v_lo[0xF])
  1393.                v_hi[0xF] = NOT(v_hi[0xF])
  1394.             end
                    -- 12 rounds; sigma[j] lists the 16 message-word numbers used in round j.
                    -- The first four G calls mix the columns, the last four the diagonals of the 4x4 vector.
  1395.             for j = 1, 12 do
  1396.                local row = sigma[j]
  1397.                G(0, 4,  8, 12, row[ 1], row[ 2])
  1398.                G(1, 5,  9, 13, row[ 3], row[ 4])
  1399.                G(2, 6, 10, 14, row[ 5], row[ 6])
  1400.                G(3, 7, 11, 15, row[ 7], row[ 8])
  1401.                G(0, 5, 10, 15, row[ 9], row[10])
  1402.                G(1, 6, 11, 12, row[11], row[12])
  1403.                G(2, 7,  8, 13, row[13], row[14])
  1404.                G(3, 4,  9, 14, row[15], row[16])
  1405.             end
                    -- Fold both halves of the working vector into the state: h[i] = h[i] xor v[i] xor v[i+8].
  1406.             h1_lo = XOR(h1_lo, v_lo[0x0], v_lo[0x8])
  1407.             h2_lo = XOR(h2_lo, v_lo[0x1], v_lo[0x9])
  1408.             h3_lo = XOR(h3_lo, v_lo[0x2], v_lo[0xA])
  1409.             h4_lo = XOR(h4_lo, v_lo[0x3], v_lo[0xB])
  1410.             h5_lo = XOR(h5_lo, v_lo[0x4], v_lo[0xC])
  1411.             h6_lo = XOR(h6_lo, v_lo[0x5], v_lo[0xD])
  1412.             h7_lo = XOR(h7_lo, v_lo[0x6], v_lo[0xE])
  1413.             h8_lo = XOR(h8_lo, v_lo[0x7], v_lo[0xF])
  1414.             h1_hi = XOR(h1_hi, v_hi[0x0], v_hi[0x8])
  1415.             h2_hi = XOR(h2_hi, v_hi[0x1], v_hi[0x9])
  1416.             h3_hi = XOR(h3_hi, v_hi[0x2], v_hi[0xA])
  1417.             h4_hi = XOR(h4_hi, v_hi[0x3], v_hi[0xB])
  1418.             h5_hi = XOR(h5_hi, v_hi[0x4], v_hi[0xC])
  1419.             h6_hi = XOR(h6_hi, v_hi[0x5], v_hi[0xD])
  1420.             h7_hi = XOR(h7_hi, v_hi[0x6], v_hi[0xE])
  1421.             h8_hi = XOR(h8_hi, v_hi[0x7], v_hi[0xF])
  1422.          end
                 -- Write the state back; "% 2^32" stores every half as a non-negative number.
  1423.          H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo % 2^32, h2_lo % 2^32, h3_lo % 2^32, h4_lo % 2^32, h5_lo % 2^32, h6_lo % 2^32, h7_lo % 2^32, h8_lo % 2^32
  1424.          H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi % 2^32, h2_hi % 2^32, h3_hi % 2^32, h4_hi % 2^32, h5_hi % 2^32, h6_hi % 2^32, h7_hi % 2^32, h8_hi % 2^32
  1425.          return bytes_compressed
  1426.       end
  1427.  
  1428.    end
  1429. end
  1430.  
  1431.  
  1432. if branch == "FFI" or branch == "LJ" then
  1433.  
  1434.  
  1435.    -- BLAKE2s and BLAKE3 implementations for "LuaJIT with FFI" and "LuaJIT without FFI" branches
  1436.  
  1437.    do
  -- Message-word buffer (W) and working vector (v) shared by G and by the
  -- blake2s_feed_64 / blake3_feed_64 functions that follow in this block.
  local W, v = common_W_blake2s, v_for_blake2s_feed_64

  -- 32-bit mixing function G: mixes entries a, b, c, d of the working vector v
  -- with message words W[k1] and W[k2], using right rotations by 16, 12, 8 and 7.
  -- The four entries are updated in place; nothing is returned.
  local function G(a, b, c, d, k1, k2)
     local wa = v[a]
     local wb = v[b]
     local wc = v[c]
     local wd = v[d]

     -- first half: add message word k1, rotate by 16 and 12
     wa = NORM(W[k1] + (wa + wb))
     wd = ROR(XOR(wd, wa), 16)
     wc = NORM(wc + wd)
     wb = ROR(XOR(wb, wc), 12)

     -- second half: add message word k2, rotate by 8 and 7
     wa = NORM(W[k2] + (wa + wb))
     wd = ROR(XOR(wd, wa), 8)
     wc = NORM(wc + wd)
     wb = ROR(XOR(wb, wc), 7)

     v[a], v[b], v[c], v[d] = wa, wb, wc, wd
  end
  1453.  
  1454.       function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- Runs the BLAKE2s compression over `size` bytes, one 64-byte block at a time.
                 --    H                : table with the eight 32-bit state words H[1..8]; updated in place
                 --    str              : message string; when it is nil/false no bytes are loaded and the
                 --                       current contents of W are compressed instead
                 --    offs             : number of leading bytes of `str` to skip
                 --    size             : number of bytes to process
                 --    bytes_compressed : byte counter before this call
                 --    last_block_size  : when given, it is added to the counter instead of 64 and flag f0 is set
                 --    is_last_node     : when true, flag f1 is set
                 -- Returns the updated byte counter.
  1455.          -- offs >= 0, size >= 0, size is multiple of 64
  1456.          local h1, h2, h3, h4, h5, h6, h7, h8 = NORM(H[1]), NORM(H[2]), NORM(H[3]), NORM(H[4]), NORM(H[5]), NORM(H[6]), NORM(H[7]), NORM(H[8])
  1457.          for pos = offs, offs + size - 1, 64 do
  1458.             if str then
                       -- Load the block as 16 little-endian 32-bit words.
  1459.                for j = 1, 16 do
                          -- advancing pos here relies on the numeric "for" keeping its own hidden counter
  1460.                   pos = pos + 4
  1461.                   local a, b, c, d = byte(str, pos - 3, pos)
  1462.                   W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
  1463.                end
  1464.             end
                    -- Working vector: entries 0..7 are the state; entries 8..0xB, 0xE and 0xF come from
                    -- sha2_H_hi[1..4], [7] and [8]; entries 0xC and 0xD are set below from the byte counter.
  1465.             v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
  1466.             v[0x8], v[0x9], v[0xA], v[0xB], v[0xE], v[0xF] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2]), NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4]), NORM(sha2_H_hi[7]), NORM(sha2_H_hi[8])
  1467.             bytes_compressed = bytes_compressed + (last_block_size or 64)
  1468.             local t0 = bytes_compressed % 2^32
  1469.             local t1 = floor(bytes_compressed / 2^32)
  1470.             v[0xC] = XOR(sha2_H_hi[5], t0)  -- t0 = low_4_bytes(bytes_compressed)
  1471.             v[0xD] = XOR(sha2_H_hi[6], t1)  -- t1 = high_4_bytes(bytes_compressed)
  1472.             if last_block_size then  -- flag f0
  1473.                v[0xE] = NOT(v[0xE])
  1474.             end
  1475.             if is_last_node then  -- flag f1
  1476.                v[0xF] = NOT(v[0xF])
  1477.             end
                    -- 10 rounds; sigma[j] lists the 16 message-word numbers used in round j.
                    -- The first four G calls mix the columns, the last four the diagonals of the 4x4 vector.
  1478.             for j = 1, 10 do
  1479.                local row = sigma[j]
  1480.                G(0, 4,  8, 12, row[ 1], row[ 2])
  1481.                G(1, 5,  9, 13, row[ 3], row[ 4])
  1482.                G(2, 6, 10, 14, row[ 5], row[ 6])
  1483.                G(3, 7, 11, 15, row[ 7], row[ 8])
  1484.                G(0, 5, 10, 15, row[ 9], row[10])
  1485.                G(1, 6, 11, 12, row[11], row[12])
  1486.                G(2, 7,  8, 13, row[13], row[14])
  1487.                G(3, 4,  9, 14, row[15], row[16])
  1488.             end
                    -- Fold both halves of the working vector into the state: h[i] = h[i] xor v[i] xor v[i+8].
  1489.             h1 = XOR(h1, v[0x0], v[0x8])
  1490.             h2 = XOR(h2, v[0x1], v[0x9])
  1491.             h3 = XOR(h3, v[0x2], v[0xA])
  1492.             h4 = XOR(h4, v[0x3], v[0xB])
  1493.             h5 = XOR(h5, v[0x4], v[0xC])
  1494.             h6 = XOR(h6, v[0x5], v[0xD])
  1495.             h7 = XOR(h7, v[0x6], v[0xE])
  1496.             h8 = XOR(h8, v[0x7], v[0xF])
  1497.          end
  1498.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1499.          return bytes_compressed
  1500.       end
  1501.  
  1502.       function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
  1503.          -- offs >= 0, size >= 0, size is multiple of 64
  1504.          block_length = block_length or 64
  1505.          local h1, h2, h3, h4, h5, h6, h7, h8 = NORM(H_in[1]), NORM(H_in[2]), NORM(H_in[3]), NORM(H_in[4]), NORM(H_in[5]), NORM(H_in[6]), NORM(H_in[7]), NORM(H_in[8])
  1506.          H_out = H_out or H_in
  1507.          for pos = offs, offs + size - 1, 64 do
  1508.             if str then
  1509.                for j = 1, 16 do
  1510.                   pos = pos + 4
  1511.                   local a, b, c, d = byte(str, pos - 3, pos)
  1512.                   W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
  1513.                end
  1514.             end
  1515.             v[0x0], v[0x1], v[0x2], v[0x3], v[0x4], v[0x5], v[0x6], v[0x7] = h1, h2, h3, h4, h5, h6, h7, h8
  1516.             v[0x8], v[0x9], v[0xA], v[0xB] = NORM(sha2_H_hi[1]), NORM(sha2_H_hi[2]), NORM(sha2_H_hi[3]), NORM(sha2_H_hi[4])
  1517.             v[0xC] = NORM(chunk_index % 2^32)   -- t0 = low_4_bytes(chunk_index)
  1518.             v[0xD] = floor(chunk_index / 2^32)  -- t1 = high_4_bytes(chunk_index)
  1519.             v[0xE], v[0xF] = block_length, flags
  1520.             for j = 1, 7 do
  1521.                G(0, 4,  8, 12, perm_blake3[j],      perm_blake3[j + 14])
  1522.                G(1, 5,  9, 13, perm_blake3[j + 1],  perm_blake3[j + 2])
  1523.                G(2, 6, 10, 14, perm_blake3[j + 16], perm_blake3[j + 7])
  1524.                G(3, 7, 11, 15, perm_blake3[j + 15], perm_blake3[j + 17])
  1525.                G(0, 5, 10, 15, perm_blake3[j + 21], perm_blake3[j + 5])
  1526.                G(1, 6, 11, 12, perm_blake3[j + 3],  perm_blake3[j + 6])
  1527.                G(2, 7,  8, 13, perm_blake3[j + 4],  perm_blake3[j + 18])
  1528.                G(3, 4,  9, 14, perm_blake3[j + 19], perm_blake3[j + 20])
  1529.             end
  1530.             if wide_output then
  1531.                H_out[ 9] = XOR(h1, v[0x8])
  1532.                H_out[10] = XOR(h2, v[0x9])
  1533.                H_out[11] = XOR(h3, v[0xA])
  1534.                H_out[12] = XOR(h4, v[0xB])
  1535.                H_out[13] = XOR(h5, v[0xC])
  1536.                H_out[14] = XOR(h6, v[0xD])
  1537.                H_out[15] = XOR(h7, v[0xE])
  1538.                H_out[16] = XOR(h8, v[0xF])
  1539.             end
  1540.             h1 = XOR(v[0x0], v[0x8])
  1541.             h2 = XOR(v[0x1], v[0x9])
  1542.             h3 = XOR(v[0x2], v[0xA])
  1543.             h4 = XOR(v[0x3], v[0xB])
  1544.             h5 = XOR(v[0x4], v[0xC])
  1545.             h6 = XOR(v[0x5], v[0xD])
  1546.             h7 = XOR(v[0x6], v[0xE])
  1547.             h8 = XOR(v[0x7], v[0xF])
  1548.          end
  1549.          H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1550.       end
  1551.  
  1552.    end
  1553.  
  1554. end
  1555.  
  1556.  
  1557. if branch == "INT64" then
  1558.  
  1559.  
  1560.    -- implementation for Lua 5.3/5.4
  1561.  
  1562.    hi_factor = 4294967296
  1563.    hi_factor_keccak = 4294967296
  1564.    lanes_index_base = 1
  1565.  
  1566.    HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT64"
  1567.       local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...
  1568.       local string_format, string_unpack = string.format, string.unpack
  1569.  
  1570.       local function HEX64(x)
  1571.          return string_format("%016x", x)
  1572.       end
  1573.  
  1574.       local function XORA5(x, y)
  1575.          return x ~ (y or 0xa5a5a5a5a5a5a5a5)
  1576.       end
  1577.  
  1578.       local function XOR_BYTE(x, y)
  1579.          return x ~ y
  1580.       end
  1581.  
  1582.       local function sha256_feed_64(H, str, offs, size)
  1583.          -- offs >= 0, size >= 0, size is multiple of 64
  1584.          local W, K = common_W, sha2_K_hi
  1585.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  1586.          for pos = offs + 1, offs + size, 64 do
  1587.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1588.                string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1589.             for j = 17, 64 do
  1590.                local a = W[j-15]
  1591.                a = a<<32 | a
  1592.                local b = W[j-2]
  1593.                b = b<<32 | b
  1594.                W[j] = (a>>7 ~ a>>18 ~ a>>35) + (b>>17 ~ b>>19 ~ b>>42) + W[j-7] + W[j-16] & (1<<32)-1
  1595.             end
  1596.             local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  1597.             for j = 1, 64 do
  1598.                e = e<<32 | e & (1<<32)-1
  1599.                local z = (e>>6 ~ e>>11 ~ e>>25) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
  1600.                h = g
  1601.                g = f
  1602.                f = e
  1603.                e = z + d
  1604.                d = c
  1605.                c = b
  1606.                b = a
  1607.                a = a<<32 | a & (1<<32)-1
  1608.                a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a>>13 ~ a>>22)
  1609.             end
  1610.             h1 = a + h1
  1611.             h2 = b + h2
  1612.             h3 = c + h3
  1613.             h4 = d + h4
  1614.             h5 = e + h5
  1615.             h6 = f + h6
  1616.             h7 = g + h7
  1617.             h8 = h + h8
  1618.          end
  1619.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1620.       end
  1621.  
  1622.       local function sha512_feed_128(H, _, str, offs, size)
  1623.          -- offs >= 0, size >= 0, size is multiple of 128
  1624.          local W, K = common_W, sha2_K_lo
  1625.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  1626.          for pos = offs + 1, offs + size, 128 do
  1627.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1628.                string_unpack(">i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
  1629.             for j = 17, 80 do
  1630.                local a = W[j-15]
  1631.                local b = W[j-2]
  1632.                W[j] = (a >> 1 ~ a >> 7 ~ a >> 8 ~ a << 56 ~ a << 63) + (b >> 6 ~ b >> 19 ~ b >> 61 ~ b << 3 ~ b << 45) + W[j-7] + W[j-16]
  1633.             end
  1634.             local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  1635.             for j = 1, 80 do
  1636.                local z = (e >> 14 ~ e >> 18 ~ e >> 41 ~ e << 23 ~ e << 46 ~ e << 50) + (g ~ e & (f ~ g)) + h + K[j] + W[j]
  1637.                h = g
  1638.                g = f
  1639.                f = e
  1640.                e = z + d
  1641.                d = c
  1642.                c = b
  1643.                b = a
  1644.                a = z + ((a ~ c) & d ~ a & c) + (a >> 28 ~ a >> 34 ~ a >> 39 ~ a << 25 ~ a << 30 ~ a << 36)
  1645.             end
  1646.             h1 = a + h1
  1647.             h2 = b + h2
  1648.             h3 = c + h3
  1649.             h4 = d + h4
  1650.             h5 = e + h5
  1651.             h6 = f + h6
  1652.             h7 = g + h7
  1653.             h8 = h + h8
  1654.          end
  1655.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  1656.       end
  1657.  
  1658.       local function md5_feed_64(H, str, offs, size)
  1659.          -- offs >= 0, size >= 0, size is multiple of 64
  1660.          local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  1661.          local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  1662.          for pos = offs + 1, offs + size, 64 do
  1663.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1664.                string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1665.             local a, b, c, d = h1, h2, h3, h4
  1666.             local s = 32-7
  1667.             for j = 1, 16 do
  1668.                local F = (d ~ b & (c ~ d)) + a + K[j] + W[j]
  1669.                a = d
  1670.                d = c
  1671.                c = b
  1672.                b = ((F<<32 | F & (1<<32)-1) >> s) + b
  1673.                s = md5_next_shift[s]
  1674.             end
  1675.             s = 32-5
  1676.             for j = 17, 32 do
  1677.                local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1]
  1678.                a = d
  1679.                d = c
  1680.                c = b
  1681.                b = ((F<<32 | F & (1<<32)-1) >> s) + b
  1682.                s = md5_next_shift[s]
  1683.             end
  1684.             s = 32-4
  1685.             for j = 33, 48 do
  1686.                local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1]
  1687.                a = d
  1688.                d = c
  1689.                c = b
  1690.                b = ((F<<32 | F & (1<<32)-1) >> s) + b
  1691.                s = md5_next_shift[s]
  1692.             end
  1693.             s = 32-6
  1694.             for j = 49, 64 do
  1695.                local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1]
  1696.                a = d
  1697.                d = c
  1698.                c = b
  1699.                b = ((F<<32 | F & (1<<32)-1) >> s) + b
  1700.                s = md5_next_shift[s]
  1701.             end
  1702.             h1 = a + h1
  1703.             h2 = b + h2
  1704.             h3 = c + h3
  1705.             h4 = d + h4
  1706.          end
  1707.          H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  1708.       end
  1709.  
  1710.       local function sha1_feed_64(H, str, offs, size)
  1711.          -- offs >= 0, size >= 0, size is multiple of 64
  1712.          local W = common_W
  1713.          local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
  1714.          for pos = offs + 1, offs + size, 64 do
  1715.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1716.                string_unpack(">I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1717.             for j = 17, 80 do
  1718.                local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
  1719.                W[j] = (a<<32 | a) << 1 >> 32
  1720.             end
  1721.             local a, b, c, d, e = h1, h2, h3, h4, h5
  1722.             for j = 1, 20 do
  1723.                local z = ((a<<32 | a & (1<<32)-1) >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e      -- constant = floor(2^30 * sqrt(2))
  1724.                e = d
  1725.                d = c
  1726.                c = (b<<32 | b & (1<<32)-1) >> 2
  1727.                b = a
  1728.                a = z
  1729.             end
  1730.             for j = 21, 40 do
  1731.                local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e            -- 2^30 * sqrt(3)
  1732.                e = d
  1733.                d = c
  1734.                c = (b<<32 | b & (1<<32)-1) >> 2
  1735.                b = a
  1736.                a = z
  1737.             end
  1738.             for j = 41, 60 do
  1739.                local z = ((a<<32 | a & (1<<32)-1) >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e  -- 2^30 * sqrt(5)
  1740.                e = d
  1741.                d = c
  1742.                c = (b<<32 | b & (1<<32)-1) >> 2
  1743.                b = a
  1744.                a = z
  1745.             end
  1746.             for j = 61, 80 do
  1747.                local z = ((a<<32 | a & (1<<32)-1) >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e            -- 2^30 * sqrt(10)
  1748.                e = d
  1749.                d = c
  1750.                c = (b<<32 | b & (1<<32)-1) >> 2
  1751.                b = a
  1752.                a = z
  1753.             end
  1754.             h1 = a + h1
  1755.             h2 = b + h2
  1756.             h3 = c + h3
  1757.             h4 = d + h4
  1758.             h5 = e + h5
  1759.          end
  1760.          H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
  1761.       end
  1762.  
  1763.       local keccak_format_i8 = build_keccak_format("i8")  -- keccak_feed indexes this by qwords-per-block and hands the entry to string_unpack; presumably one "i8" field per qword (confirm in build_keccak_format)
  1764.  
  1765.       local function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
                 -- Absorbs whole blocks of str into the 25-lane state `lanes` (lanes[1]..lanes[25], one 64-bit
                 -- integer per lane) and runs a 24-round permutation after every block.
                 -- The state is updated in place and nothing is returned.  The second parameter is unused here.
  1766.          -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  1767.          local RC = sha3_RC_lo
                 -- "/" yields a float; a float with an integral value addresses the same table slot as the integer
  1768.          local qwords_qty = block_size_in_bytes / 8
  1769.          local keccak_format = keccak_format_i8[qwords_qty]
  1770.          for pos = offs + 1, offs + size, block_size_in_bytes do
                    -- the table also captures unpack's trailing "next position" result; only entries 1..qwords_qty are used
  1771.             local qwords_from_message = {string_unpack(keccak_format, str, pos)}
                    -- XOR the message block into the first qwords_qty lanes
  1772.             for j = 1, qwords_qty do
  1773.                lanes[j] = lanes[j] ~ qwords_from_message[j]
  1774.             end
                    -- keep the whole state in locals while the rounds run
  1775.             local L01, L02, L03, L04, L05, L06, L07, L08, L09, L10, L11, L12, L13, L14, L15, L16, L17, L18, L19, L20, L21, L22, L23, L24, L25 =
  1776.                lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7], lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13],
  1777.                lanes[14], lanes[15], lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23], lanes[24], lanes[25]
  1778.             for round_idx = 1, 24 do
                       -- column parities: XOR of the five lanes whose index is congruent modulo 5
  1779.                local C1 = L01 ~ L06 ~ L11 ~ L16 ~ L21
  1780.                local C2 = L02 ~ L07 ~ L12 ~ L17 ~ L22
  1781.                local C3 = L03 ~ L08 ~ L13 ~ L18 ~ L23
  1782.                local C4 = L04 ~ L09 ~ L14 ~ L19 ~ L24
  1783.                local C5 = L05 ~ L10 ~ L15 ~ L20 ~ L25
                       -- For each column: D = parity of the previous column XOR (parity of the next column rotated left by 1).
                       -- D is XORed into the five lanes of the column, each result is rotated left (x<<n ~ x>>(64-n);
                       -- ">>" is a logical shift) and stored under a different lane name of the same column.
                       -- Column 2
  1784.                local D = C1 ~ C3<<1 ~ C3>>63
  1785.                local T0 = D ~ L02
  1786.                local T1 = D ~ L07
  1787.                local T2 = D ~ L12
  1788.                local T3 = D ~ L17
  1789.                local T4 = D ~ L22
  1790.                L02 = T1<<44 ~ T1>>20
  1791.                L07 = T3<<45 ~ T3>>19
  1792.                L12 = T0<<1 ~ T0>>63
  1793.                L17 = T2<<10 ~ T2>>54
  1794.                L22 = T4<<2 ~ T4>>62
                       -- Column 3
  1795.                D = C2 ~ C4<<1 ~ C4>>63
  1796.                T0 = D ~ L03
  1797.                T1 = D ~ L08
  1798.                T2 = D ~ L13
  1799.                T3 = D ~ L18
  1800.                T4 = D ~ L23
  1801.                L03 = T2<<43 ~ T2>>21
  1802.                L08 = T4<<61 ~ T4>>3
  1803.                L13 = T1<<6 ~ T1>>58
  1804.                L18 = T3<<15 ~ T3>>49
  1805.                L23 = T0<<62 ~ T0>>2
                       -- Column 4
  1806.                D = C3 ~ C5<<1 ~ C5>>63
  1807.                T0 = D ~ L04
  1808.                T1 = D ~ L09
  1809.                T2 = D ~ L14
  1810.                T3 = D ~ L19
  1811.                T4 = D ~ L24
  1812.                L04 = T3<<21 ~ T3>>43
  1813.                L09 = T0<<28 ~ T0>>36
  1814.                L14 = T2<<25 ~ T2>>39
  1815.                L19 = T4<<56 ~ T4>>8
  1816.                L24 = T1<<55 ~ T1>>9
                       -- Column 5
  1817.                D = C4 ~ C1<<1 ~ C1>>63
  1818.                T0 = D ~ L05
  1819.                T1 = D ~ L10
  1820.                T2 = D ~ L15
  1821.                T3 = D ~ L20
  1822.                T4 = D ~ L25
  1823.                L05 = T4<<14 ~ T4>>50
  1824.                L10 = T1<<20 ~ T1>>44
  1825.                L15 = T3<<8 ~ T3>>56
  1826.                L20 = T0<<27 ~ T0>>37
  1827.                L25 = T2<<39 ~ T2>>25
                       -- Column 1: lane L01 only receives D and is neither rotated nor moved
  1828.                D = C5 ~ C2<<1 ~ C2>>63
  1829.                T1 = D ~ L06
  1830.                T2 = D ~ L11
  1831.                T3 = D ~ L16
  1832.                T4 = D ~ L21
  1833.                L06 = T2<<3 ~ T2>>61
  1834.                L11 = T4<<18 ~ T4>>46
  1835.                L16 = T1<<36 ~ T1>>28
  1836.                L21 = T3<<41 ~ T3>>23
  1837.                L01 = D ~ L01
                       -- Non-linear step, one group of five lanes per statement: result = x ~ (~y & z).  Every right-hand
                       -- side is evaluated before any assignment, so the old lane values are used throughout.
                       -- From the second group on the source lanes are listed in rotated order; NOTE(review): this
                       -- presumably completes the lane permutation begun above -- confirm against the Keccak reference.
  1838.                L01, L02, L03, L04, L05 = L01 ~ ~L02 & L03, L02 ~ ~L03 & L04, L03 ~ ~L04 & L05, L04 ~ ~L05 & L01, L05 ~ ~L01 & L02
  1839.                L06, L07, L08, L09, L10 = L09 ~ ~L10 & L06, L10 ~ ~L06 & L07, L06 ~ ~L07 & L08, L07 ~ ~L08 & L09, L08 ~ ~L09 & L10
  1840.                L11, L12, L13, L14, L15 = L12 ~ ~L13 & L14, L13 ~ ~L14 & L15, L14 ~ ~L15 & L11, L15 ~ ~L11 & L12, L11 ~ ~L12 & L13
  1841.                L16, L17, L18, L19, L20 = L20 ~ ~L16 & L17, L16 ~ ~L17 & L18, L17 ~ ~L18 & L19, L18 ~ ~L19 & L20, L19 ~ ~L20 & L16
  1842.                L21, L22, L23, L24, L25 = L23 ~ ~L24 & L25, L24 ~ ~L25 & L21, L25 ~ ~L21 & L22, L21 ~ ~L22 & L23, L22 ~ ~L23 & L24
                       -- round constant goes into the first lane only
  1843.                L01 = L01 ~ RC[round_idx]
  1844.             end
                    -- write the permuted state back into the caller's table
  1845.             lanes[1]  = L01
  1846.             lanes[2]  = L02
  1847.             lanes[3]  = L03
  1848.             lanes[4]  = L04
  1849.             lanes[5]  = L05
  1850.             lanes[6]  = L06
  1851.             lanes[7]  = L07
  1852.             lanes[8]  = L08
  1853.             lanes[9]  = L09
  1854.             lanes[10] = L10
  1855.             lanes[11] = L11
  1856.             lanes[12] = L12
  1857.             lanes[13] = L13
  1858.             lanes[14] = L14
  1859.             lanes[15] = L15
  1860.             lanes[16] = L16
  1861.             lanes[17] = L17
  1862.             lanes[18] = L18
  1863.             lanes[19] = L19
  1864.             lanes[20] = L20
  1865.             lanes[21] = L21
  1866.             lanes[22] = L22
  1867.             lanes[23] = L23
  1868.             lanes[24] = L24
  1869.             lanes[25] = L25
  1870.          end
  1871.       end
  1872.  
  1873.       local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- BLAKE2s compression using native 64-bit integer operators.
                 --   H                : chaining value; entries 1..8 are read on entry and overwritten on exit
                 --   str              : message string; when it is nil/false the 16 words already stored in common_W are used
                 --   bytes_compressed : running byte counter, advanced for every block; the new value is returned
                 --   last_block_size  : when truthy it is added to the counter instead of 64 and vE is inverted (flag f0)
                 --   is_last_node     : when truthy vF is inverted (flag f1)
                 -- The 32-bit words live in 64-bit integers.  Each rotation masks the low 32 bits before the right
                 -- shift but leaves the left-shifted copy unmasked, so bits above bit 31 carry leftovers; the values
                 -- stored back into H are not masked here.
  1874.          -- offs >= 0, size >= 0, size is multiple of 64
  1875.          local W = common_W
  1876.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  1877.          for pos = offs + 1, offs + size, 64 do
  1878.             if str then
                       -- 16 little-endian 32-bit words (unpack's extra "next position" result is discarded)
  1879.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  1880.                   string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)
  1881.             end
                    -- work vector: v0..v7 = chaining value, v8..vF = constants from sha2_H_hi
  1882.             local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  1883.             local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  1884.             bytes_compressed = bytes_compressed + (last_block_size or 64)
  1885.             vC = vC ~ bytes_compressed        -- t0 = low_4_bytes(bytes_compressed)
  1886.             vD = vD ~ bytes_compressed >> 32  -- t1 = high_4_bytes(bytes_compressed)
  1887.             if last_block_size then  -- flag f0
  1888.                vE = ~vE
  1889.             end
  1890.             if is_last_node then  -- flag f1
  1891.                vF = ~vF
  1892.             end
                    -- 10 rounds; every round is eight unrolled mixing steps of 12 statements each
                    -- (add, xor, rotate right by 16, 12, 8 and 7 bits in turn)
  1893.             for j = 1, 10 do
  1894.                local row = sigma[j]
                       -- mix (v0, v4, v8, vC) with message words row[1], row[2]
  1895.                v0 = v0 + v4 + W[row[1]]
  1896.                vC = vC ~ v0
  1897.                vC = (vC & (1<<32)-1) >> 16 | vC << 16
  1898.                v8 = v8 + vC
  1899.                v4 = v4 ~ v8
  1900.                v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  1901.                v0 = v0 + v4 + W[row[2]]
  1902.                vC = vC ~ v0
  1903.                vC = (vC & (1<<32)-1) >> 8 | vC << 24
  1904.                v8 = v8 + vC
  1905.                v4 = v4 ~ v8
  1906.                v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
                       -- mix (v1, v5, v9, vD) with message words row[3], row[4]
  1907.                v1 = v1 + v5 + W[row[3]]
  1908.                vD = vD ~ v1
  1909.                vD = (vD & (1<<32)-1) >> 16 | vD << 16
  1910.                v9 = v9 + vD
  1911.                v5 = v5 ~ v9
  1912.                v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  1913.                v1 = v1 + v5 + W[row[4]]
  1914.                vD = vD ~ v1
  1915.                vD = (vD & (1<<32)-1) >> 8 | vD << 24
  1916.                v9 = v9 + vD
  1917.                v5 = v5 ~ v9
  1918.                v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
                       -- mix (v2, v6, vA, vE) with message words row[5], row[6]
  1919.                v2 = v2 + v6 + W[row[5]]
  1920.                vE = vE ~ v2
  1921.                vE = (vE & (1<<32)-1) >> 16 | vE << 16
  1922.                vA = vA + vE
  1923.                v6 = v6 ~ vA
  1924.                v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  1925.                v2 = v2 + v6 + W[row[6]]
  1926.                vE = vE ~ v2
  1927.                vE = (vE & (1<<32)-1) >> 8 | vE << 24
  1928.                vA = vA + vE
  1929.                v6 = v6 ~ vA
  1930.                v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
                       -- mix (v3, v7, vB, vF) with message words row[7], row[8]
  1931.                v3 = v3 + v7 + W[row[7]]
  1932.                vF = vF ~ v3
  1933.                vF = (vF & (1<<32)-1) >> 16 | vF << 16
  1934.                vB = vB + vF
  1935.                v7 = v7 ~ vB
  1936.                v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  1937.                v3 = v3 + v7 + W[row[8]]
  1938.                vF = vF ~ v3
  1939.                vF = (vF & (1<<32)-1) >> 8 | vF << 24
  1940.                vB = vB + vF
  1941.                v7 = v7 ~ vB
  1942.                v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
                       -- mix (v0, v5, vA, vF) with message words row[9], row[10]
  1943.                v0 = v0 + v5 + W[row[9]]
  1944.                vF = vF ~ v0
  1945.                vF = (vF & (1<<32)-1) >> 16 | vF << 16
  1946.                vA = vA + vF
  1947.                v5 = v5 ~ vA
  1948.                v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  1949.                v0 = v0 + v5 + W[row[10]]
  1950.                vF = vF ~ v0
  1951.                vF = (vF & (1<<32)-1) >> 8 | vF << 24
  1952.                vA = vA + vF
  1953.                v5 = v5 ~ vA
  1954.                v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
                       -- mix (v1, v6, vB, vC) with message words row[11], row[12]
  1955.                v1 = v1 + v6 + W[row[11]]
  1956.                vC = vC ~ v1
  1957.                vC = (vC & (1<<32)-1) >> 16 | vC << 16
  1958.                vB = vB + vC
  1959.                v6 = v6 ~ vB
  1960.                v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  1961.                v1 = v1 + v6 + W[row[12]]
  1962.                vC = vC ~ v1
  1963.                vC = (vC & (1<<32)-1) >> 8 | vC << 24
  1964.                vB = vB + vC
  1965.                v6 = v6 ~ vB
  1966.                v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
                       -- mix (v2, v7, v8, vD) with message words row[13], row[14]
  1967.                v2 = v2 + v7 + W[row[13]]
  1968.                vD = vD ~ v2
  1969.                vD = (vD & (1<<32)-1) >> 16 | vD << 16
  1970.                v8 = v8 + vD
  1971.                v7 = v7 ~ v8
  1972.                v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  1973.                v2 = v2 + v7 + W[row[14]]
  1974.                vD = vD ~ v2
  1975.                vD = (vD & (1<<32)-1) >> 8 | vD << 24
  1976.                v8 = v8 + vD
  1977.                v7 = v7 ~ v8
  1978.                v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
                       -- mix (v3, v4, v9, vE) with message words row[15], row[16]
  1979.                v3 = v3 + v4 + W[row[15]]
  1980.                vE = vE ~ v3
  1981.                vE = (vE & (1<<32)-1) >> 16 | vE << 16
  1982.                v9 = v9 + vE
  1983.                v4 = v4 ~ v9
  1984.                v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  1985.                v3 = v3 + v4 + W[row[16]]
  1986.                vE = vE ~ v3
  1987.                vE = (vE & (1<<32)-1) >> 8 | vE << 24
  1988.                v9 = v9 + vE
  1989.                v4 = v4 ~ v9
  1990.                v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  1991.             end
                    -- feed-forward: fold both halves of the work vector into the chaining value
  1992.             h1 = h1 ~ v0 ~ v8
  1993.             h2 = h2 ~ v1 ~ v9
  1994.             h3 = h3 ~ v2 ~ vA
  1995.             h4 = h4 ~ v3 ~ vB
  1996.             h5 = h5 ~ v4 ~ vC
  1997.             h6 = h6 ~ v5 ~ vD
  1998.             h7 = h7 ~ v6 ~ vE
  1999.             h8 = h8 ~ v7 ~ vF
  2000.          end
  2001.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2002.          return bytes_compressed
  2003.       end
  2004.  
  2005.       local function blake2b_feed_128(H, _, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- BLAKE2b compression using native 64-bit integer operators.
                 --   H                : chaining value; entries 1..8 are read on entry and overwritten on exit
                 --   _                : unused in this branch
                 --   str              : message string; when it is nil/false the 16 words already stored in common_W are used
                 --   bytes_compressed : running byte counter, advanced for every block; the new value is returned
                 --   last_block_size  : when truthy it is added to the counter instead of 128 and vE is inverted (flag f0)
                 --   is_last_node     : when truthy vF is inverted (flag f1)
  2006.          -- offs >= 0, size >= 0, size is multiple of 128
  2007.          local W = common_W
  2008.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  2009.          for pos = offs + 1, offs + size, 128 do
  2010.             if str then
                       -- 16 little-endian 64-bit words (unpack's extra "next position" result is discarded)
  2011.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2012.                   string_unpack("<i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8i8", str, pos)
  2013.             end
                    -- work vector: v0..v7 = chaining value, v8..vF = constants from sha2_H_lo
  2014.             local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  2015.             local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  2016.             bytes_compressed = bytes_compressed + (last_block_size or 128)
  2017.             vC = vC ~ bytes_compressed  -- t0 = low_8_bytes(bytes_compressed)
  2018.             -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
  2019.             if last_block_size then  -- flag f0
  2020.                vE = ~vE
  2021.             end
  2022.             if is_last_node then  -- flag f1
  2023.                vF = ~vF
  2024.             end
                    -- 12 rounds; every round is eight unrolled mixing steps of 12 statements each
                    -- (add, xor, rotate right by 32, 24, 16 and 63 bits in turn; ">>" is a logical shift)
  2025.             for j = 1, 12 do
  2026.                local row = sigma[j]
                       -- mix (v0, v4, v8, vC) with message words row[1], row[2]
  2027.                v0 = v0 + v4 + W[row[1]]
  2028.                vC = vC ~ v0
  2029.                vC = vC >> 32 | vC << 32
  2030.                v8 = v8 + vC
  2031.                v4 = v4 ~ v8
  2032.                v4 = v4 >> 24 | v4 << 40
  2033.                v0 = v0 + v4 + W[row[2]]
  2034.                vC = vC ~ v0
  2035.                vC = vC >> 16 | vC << 48
  2036.                v8 = v8 + vC
  2037.                v4 = v4 ~ v8
  2038.                v4 = v4 >> 63 | v4 << 1
                       -- mix (v1, v5, v9, vD) with message words row[3], row[4]
  2039.                v1 = v1 + v5 + W[row[3]]
  2040.                vD = vD ~ v1
  2041.                vD = vD >> 32 | vD << 32
  2042.                v9 = v9 + vD
  2043.                v5 = v5 ~ v9
  2044.                v5 = v5 >> 24 | v5 << 40
  2045.                v1 = v1 + v5 + W[row[4]]
  2046.                vD = vD ~ v1
  2047.                vD = vD >> 16 | vD << 48
  2048.                v9 = v9 + vD
  2049.                v5 = v5 ~ v9
  2050.                v5 = v5 >> 63 | v5 << 1
                       -- mix (v2, v6, vA, vE) with message words row[5], row[6]
  2051.                v2 = v2 + v6 + W[row[5]]
  2052.                vE = vE ~ v2
  2053.                vE = vE >> 32 | vE << 32
  2054.                vA = vA + vE
  2055.                v6 = v6 ~ vA
  2056.                v6 = v6 >> 24 | v6 << 40
  2057.                v2 = v2 + v6 + W[row[6]]
  2058.                vE = vE ~ v2
  2059.                vE = vE >> 16 | vE << 48
  2060.                vA = vA + vE
  2061.                v6 = v6 ~ vA
  2062.                v6 = v6 >> 63 | v6 << 1
                       -- mix (v3, v7, vB, vF) with message words row[7], row[8]
  2063.                v3 = v3 + v7 + W[row[7]]
  2064.                vF = vF ~ v3
  2065.                vF = vF >> 32 | vF << 32
  2066.                vB = vB + vF
  2067.                v7 = v7 ~ vB
  2068.                v7 = v7 >> 24 | v7 << 40
  2069.                v3 = v3 + v7 + W[row[8]]
  2070.                vF = vF ~ v3
  2071.                vF = vF >> 16 | vF << 48
  2072.                vB = vB + vF
  2073.                v7 = v7 ~ vB
  2074.                v7 = v7 >> 63 | v7 << 1
                       -- mix (v0, v5, vA, vF) with message words row[9], row[10]
  2075.                v0 = v0 + v5 + W[row[9]]
  2076.                vF = vF ~ v0
  2077.                vF = vF >> 32 | vF << 32
  2078.                vA = vA + vF
  2079.                v5 = v5 ~ vA
  2080.                v5 = v5 >> 24 | v5 << 40
  2081.                v0 = v0 + v5 + W[row[10]]
  2082.                vF = vF ~ v0
  2083.                vF = vF >> 16 | vF << 48
  2084.                vA = vA + vF
  2085.                v5 = v5 ~ vA
  2086.                v5 = v5 >> 63 | v5 << 1
                       -- mix (v1, v6, vB, vC) with message words row[11], row[12]
  2087.                v1 = v1 + v6 + W[row[11]]
  2088.                vC = vC ~ v1
  2089.                vC = vC >> 32 | vC << 32
  2090.                vB = vB + vC
  2091.                v6 = v6 ~ vB
  2092.                v6 = v6 >> 24 | v6 << 40
  2093.                v1 = v1 + v6 + W[row[12]]
  2094.                vC = vC ~ v1
  2095.                vC = vC >> 16 | vC << 48
  2096.                vB = vB + vC
  2097.                v6 = v6 ~ vB
  2098.                v6 = v6 >> 63 | v6 << 1
                       -- mix (v2, v7, v8, vD) with message words row[13], row[14]
  2099.                v2 = v2 + v7 + W[row[13]]
  2100.                vD = vD ~ v2
  2101.                vD = vD >> 32 | vD << 32
  2102.                v8 = v8 + vD
  2103.                v7 = v7 ~ v8
  2104.                v7 = v7 >> 24 | v7 << 40
  2105.                v2 = v2 + v7 + W[row[14]]
  2106.                vD = vD ~ v2
  2107.                vD = vD >> 16 | vD << 48
  2108.                v8 = v8 + vD
  2109.                v7 = v7 ~ v8
  2110.                v7 = v7 >> 63 | v7 << 1
                       -- mix (v3, v4, v9, vE) with message words row[15], row[16]
  2111.                v3 = v3 + v4 + W[row[15]]
  2112.                vE = vE ~ v3
  2113.                vE = vE >> 32 | vE << 32
  2114.                v9 = v9 + vE
  2115.                v4 = v4 ~ v9
  2116.                v4 = v4 >> 24 | v4 << 40
  2117.                v3 = v3 + v4 + W[row[16]]
  2118.                vE = vE ~ v3
  2119.                vE = vE >> 16 | vE << 48
  2120.                v9 = v9 + vE
  2121.                v4 = v4 ~ v9
  2122.                v4 = v4 >> 63 | v4 << 1
  2123.             end
                    -- feed-forward: fold both halves of the work vector into the chaining value
  2124.             h1 = h1 ~ v0 ~ v8
  2125.             h2 = h2 ~ v1 ~ v9
  2126.             h3 = h3 ~ v2 ~ vA
  2127.             h4 = h4 ~ v3 ~ vB
  2128.             h5 = h5 ~ v4 ~ vC
  2129.             h6 = h6 ~ v5 ~ vD
  2130.             h7 = h7 ~ v6 ~ vE
  2131.             h8 = h8 ~ v7 ~ vF
  2132.          end
  2133.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2134.          return bytes_compressed
  2135.       end
  2136.  
  2137.       local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)  -- BLAKE3 compression (64-bit-integer branch): chains each 64-byte block of str from offs+1 to offs+size through the 8-word state H_in and stores the result in H_out
  2138.          -- offs >= 0, size >= 0, size is multiple of 64; flags, chunk_index and block_length are loaded into the last four state words of every block; nothing is returned
  2139.          block_length = block_length or 64  -- default when the caller passes no block_length
  2140.          local W = common_W  -- shared scratch table holding the 16 message words
  2141.          local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
  2142.          H_out = H_out or H_in  -- without H_out the result overwrites H_in
  2143.          for pos = offs + 1, offs + size, 64 do
  2144.             if str then  -- when str is nil/false the words already present in W are compressed as they are
  2145.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2146.                   string_unpack("<I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4I4", str, pos)  -- 16 little-endian unsigned 32-bit words
  2147.             end
  2148.             local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8  -- first half of the working state: current chaining value
  2149.             local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]  -- next four words: first four entries of sha2_H_hi
  2150.             local t0 = chunk_index % 2^32         -- t0 = low_4_bytes(chunk_index)
  2151.             local t1 = (chunk_index - t0) / 2^32  -- t1 = high_4_bytes(chunk_index)
  2152.             local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags  -- 0| converts the float results of % and / to integers
  2153.             for j = 1, 7 do  -- 7 rounds; the message word used at each step is selected through the perm_blake3 table
  2154.                v0 = v0 + v4 + W[perm_blake3[j]]  -- column step on (v0, v4, v8, vC)
  2155.                vC = vC ~ v0
  2156.                vC = (vC & (1<<32)-1) >> 16 | vC << 16  -- rotate the low 32 bits right by 16; the mask is needed because values live in 64-bit integers (bits above 32 are not cleared)
  2157.                v8 = v8 + vC
  2158.                v4 = v4 ~ v8
  2159.                v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20  -- rotate right by 12
  2160.                v0 = v0 + v4 + W[perm_blake3[j + 14]]
  2161.                vC = vC ~ v0
  2162.                vC = (vC & (1<<32)-1) >> 8 | vC << 24  -- rotate right by 8
  2163.                v8 = v8 + vC
  2164.                v4 = v4 ~ v8
  2165.                v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25  -- rotate right by 7
  2166.                v1 = v1 + v5 + W[perm_blake3[j + 1]]  -- column step on (v1, v5, v9, vD)
  2167.                vD = vD ~ v1
  2168.                vD = (vD & (1<<32)-1) >> 16 | vD << 16
  2169.                v9 = v9 + vD
  2170.                v5 = v5 ~ v9
  2171.                v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  2172.                v1 = v1 + v5 + W[perm_blake3[j + 2]]
  2173.                vD = vD ~ v1
  2174.                vD = (vD & (1<<32)-1) >> 8 | vD << 24
  2175.                v9 = v9 + vD
  2176.                v5 = v5 ~ v9
  2177.                v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  2178.                v2 = v2 + v6 + W[perm_blake3[j + 16]]  -- column step on (v2, v6, vA, vE)
  2179.                vE = vE ~ v2
  2180.                vE = (vE & (1<<32)-1) >> 16 | vE << 16
  2181.                vA = vA + vE
  2182.                v6 = v6 ~ vA
  2183.                v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  2184.                v2 = v2 + v6 + W[perm_blake3[j + 7]]
  2185.                vE = vE ~ v2
  2186.                vE = (vE & (1<<32)-1) >> 8 | vE << 24
  2187.                vA = vA + vE
  2188.                v6 = v6 ~ vA
  2189.                v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  2190.                v3 = v3 + v7 + W[perm_blake3[j + 15]]  -- column step on (v3, v7, vB, vF)
  2191.                vF = vF ~ v3
  2192.                vF = (vF & (1<<32)-1) >> 16 | vF << 16
  2193.                vB = vB + vF
  2194.                v7 = v7 ~ vB
  2195.                v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  2196.                v3 = v3 + v7 + W[perm_blake3[j + 17]]
  2197.                vF = vF ~ v3
  2198.                vF = (vF & (1<<32)-1) >> 8 | vF << 24
  2199.                vB = vB + vF
  2200.                v7 = v7 ~ vB
  2201.                v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
  2202.                v0 = v0 + v5 + W[perm_blake3[j + 21]]  -- diagonal step on (v0, v5, vA, vF)
  2203.                vF = vF ~ v0
  2204.                vF = (vF & (1<<32)-1) >> 16 | vF << 16
  2205.                vA = vA + vF
  2206.                v5 = v5 ~ vA
  2207.                v5 = (v5 & (1<<32)-1) >> 12 | v5 << 20
  2208.                v0 = v0 + v5 + W[perm_blake3[j + 5]]
  2209.                vF = vF ~ v0
  2210.                vF = (vF & (1<<32)-1) >> 8 | vF << 24
  2211.                vA = vA + vF
  2212.                v5 = v5 ~ vA
  2213.                v5 = (v5 & (1<<32)-1) >> 7 | v5 << 25
  2214.                v1 = v1 + v6 + W[perm_blake3[j + 3]]  -- diagonal step on (v1, v6, vB, vC)
  2215.                vC = vC ~ v1
  2216.                vC = (vC & (1<<32)-1) >> 16 | vC << 16
  2217.                vB = vB + vC
  2218.                v6 = v6 ~ vB
  2219.                v6 = (v6 & (1<<32)-1) >> 12 | v6 << 20
  2220.                v1 = v1 + v6 + W[perm_blake3[j + 6]]
  2221.                vC = vC ~ v1
  2222.                vC = (vC & (1<<32)-1) >> 8 | vC << 24
  2223.                vB = vB + vC
  2224.                v6 = v6 ~ vB
  2225.                v6 = (v6 & (1<<32)-1) >> 7 | v6 << 25
  2226.                v2 = v2 + v7 + W[perm_blake3[j + 4]]  -- diagonal step on (v2, v7, v8, vD)
  2227.                vD = vD ~ v2
  2228.                vD = (vD & (1<<32)-1) >> 16 | vD << 16
  2229.                v8 = v8 + vD
  2230.                v7 = v7 ~ v8
  2231.                v7 = (v7 & (1<<32)-1) >> 12 | v7 << 20
  2232.                v2 = v2 + v7 + W[perm_blake3[j + 18]]
  2233.                vD = vD ~ v2
  2234.                vD = (vD & (1<<32)-1) >> 8 | vD << 24
  2235.                v8 = v8 + vD
  2236.                v7 = v7 ~ v8
  2237.                v7 = (v7 & (1<<32)-1) >> 7 | v7 << 25
  2238.                v3 = v3 + v4 + W[perm_blake3[j + 19]]  -- diagonal step on (v3, v4, v9, vE)
  2239.                vE = vE ~ v3
  2240.                vE = (vE & (1<<32)-1) >> 16 | vE << 16
  2241.                v9 = v9 + vE
  2242.                v4 = v4 ~ v9
  2243.                v4 = (v4 & (1<<32)-1) >> 12 | v4 << 20
  2244.                v3 = v3 + v4 + W[perm_blake3[j + 20]]
  2245.                vE = vE ~ v3
  2246.                vE = (vE & (1<<32)-1) >> 8 | vE << 24
  2247.                v9 = v9 + vE
  2248.                v4 = v4 ~ v9
  2249.                v4 = (v4 & (1<<32)-1) >> 7 | v4 << 25
  2250.             end
  2251.             if wide_output then  -- optional words 9..16 of the output: chaining value as it was before this block, XORed with v8..vF
  2252.                H_out[ 9] = h1 ~ v8
  2253.                H_out[10] = h2 ~ v9
  2254.                H_out[11] = h3 ~ vA
  2255.                H_out[12] = h4 ~ vB
  2256.                H_out[13] = h5 ~ vC
  2257.                H_out[14] = h6 ~ vD
  2258.                H_out[15] = h7 ~ vE
  2259.                H_out[16] = h8 ~ vF
  2260.             end
  2261.             h1 = v0 ~ v8  -- new chaining value: v0..v7 XORed with v8..vF
  2262.             h2 = v1 ~ v9
  2263.             h3 = v2 ~ vA
  2264.             h4 = v3 ~ vB
  2265.             h5 = v4 ~ vC
  2266.             h6 = v5 ~ vD
  2267.             h7 = v6 ~ vE
  2268.             h8 = v7 ~ vF
  2269.          end
  2270.          H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8  -- store the chaining value reached after the last block
  2271.       end
  2272.  
  2273.       return HEX64, XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
  2274.    ]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
  2275.  
  2276. end
  2277.  
  2278.  
  2279. if branch == "INT32" then
  2280.  
  2281.  
  2282.    -- implementation for Lua 5.3/5.4 having non-standard numbers config "int32"+"double" (built with LUA_INT_TYPE=LUA_INT_INT)
  2283.  
  2284.    K_lo_modulo = 2^32  -- float 4294967296.0; NOTE(review): the consumer of K_lo_modulo is outside this chunk - presumably the modulus applied when the low 32-bit halves of 64-bit constants are prepared; confirm at the point of use
  2285.  
  2286.    function HEX(x) -- returns string of 8 lowercase hexadecimal digits (zero-padded on the left)
  2287.       return string_format("%08x", x)  -- "%08x": lowercase hexadecimal, padded with zeros to a minimum width of 8
  2288.    end
  2289.  
  2290.    XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64 = load[=[-- branch "INT32"
  2291.       local md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3 = ...  -- tables and helpers received as the arguments of this loaded chunk
  2292.       local string_unpack, floor = string.unpack, math.floor  -- library functions cached in locals for the loops below
  2293.  
  2294.       local function XORA5(x, y)  -- returns x XOR y, or x XOR 0xA5A5A5A5 when y is nil/false
  2295.          return x ~ (y and (y + 2^31) % 2^32 - 2^31 or 0xA5A5A5A5)  -- (y + 2^31) % 2^32 - 2^31 folds y into the range -2^31..2^31-1 before it is XORed
  2296.       end
  2297.  
  2298.       local function XOR_BYTE(x, y)  -- returns the bitwise XOR of x and y using the native operator
  2299.          return x ~ y
  2300.       end
  2301.  
  2302.       local function sha256_feed_64(H, str, offs, size)  -- SHA-256 compression: folds each 64-byte block of str from offs+1 to offs+size into the 8-word state H (updated in place, nothing returned)
  2303.          -- offs >= 0, size >= 0, size is multiple of 64
  2304.          local W, K = common_W, sha2_K_hi  -- W: shared message-schedule scratch table; K: per-round constants taken from sha2_K_hi
  2305.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  2306.          for pos = offs + 1, offs + size, 64 do
  2307.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2308.                string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 16 big-endian signed 32-bit words
  2309.             for j = 17, 64 do  -- extend the 16 message words to 64 schedule words
  2310.                local a, b = W[j-15], W[j-2]
  2311.                W[j] = (a>>7 ~ a<<25 ~ a<<14 ~ a>>18 ~ a>>3) + (b<<15 ~ b>>17 ~ b<<13 ~ b>>19 ~ b>>10) + W[j-7] + W[j-16]  -- with 32-bit integers x>>n ~ x<<(32-n) is a rotate right by n; a: ror 7, ror 18, shr 3; b: ror 17, ror 19, shr 10
  2312.             end
  2313.             local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  2314.             for j = 1, 64 do  -- 64 rounds
  2315.                local z = (e>>6 ~ e<<26 ~ e>>11 ~ e<<21 ~ e>>25 ~ e<<7) + (g ~ e & (f ~ g)) + h + K[j] + W[j]  -- (e ror 6 ~ ror 11 ~ ror 25) + (bits of f where e is set, else bits of g) + h + K[j] + W[j]
  2316.                h = g
  2317.                g = f
  2318.                f = e
  2319.                e = z + d
  2320.                d = c
  2321.                c = b
  2322.                b = a
  2323.                a = z + ((a ~ c) & d ~ a & c) + (a>>2 ~ a<<30 ~ a>>13 ~ a<<19 ~ a<<10 ~ a>>22)  -- at this point a, c, d hold the pre-round a, b, c: bitwise majority of the three plus (a ror 2 ~ ror 13 ~ ror 22)
  2324.             end
  2325.             h1 = a + h1  -- add the working variables back into the chaining value
  2326.             h2 = b + h2
  2327.             h3 = c + h3
  2328.             h4 = d + h4
  2329.             h5 = e + h5
  2330.             h6 = f + h6
  2331.             h7 = g + h7
  2332.             h8 = h + h8
  2333.          end
  2334.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  2335.       end
  2336.  
  2337.       local function sha512_feed_128(H_lo, H_hi, str, offs, size)  -- SHA-512-style compression for 32-bit integers: every 64-bit quantity is carried as a (hi, lo) pair of 32-bit words; H_lo/H_hi are updated in place, nothing is returned
  2338.          -- offs >= 0, size >= 0, size is multiple of 128
  2339.          -- W1_hi, W1_lo, W2_hi, W2_lo, ...   Wk_hi = W[2*k-1], Wk_lo = W[2*k]
  2340.          local floor, W, K_lo, K_hi = floor, common_W, sha2_K_lo, sha2_K_hi  -- K_lo/K_hi: low/high halves of the round constants
  2341.          local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  2342.          local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  2343.          for pos = offs + 1, offs + size, 128 do
  2344.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
  2345.                W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
  2346.                string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 32 big-endian signed 32-bit words = 16 (hi, lo) pairs
  2347.             for jj = 17*2, 80*2, 2 do  -- message schedule for words 17..80; jj is the index of the low half, jj-1 of the high half
  2348.                local a_lo, a_hi, b_lo, b_hi = W[jj-30], W[jj-31], W[jj-4], W[jj-5]  -- a = word k-15, b = word k-2
  2349.                local tmp =  -- low-half sum kept as a float: every term is reduced to 0..2^32-1 by % 2^32 so the carry can be extracted with floor below
  2350.                   (a_lo>>1 ~ a_hi<<31 ~ a_lo>>8 ~ a_hi<<24 ~ a_lo>>7 ~ a_hi<<25) % 2^32
  2351.                   + (b_lo>>19 ~ b_hi<<13 ~ b_lo<<3 ~ b_hi>>29 ~ b_lo>>6 ~ b_hi<<26) % 2^32
  2352.                   + W[jj-14] % 2^32 + W[jj-32] % 2^32
  2353.                W[jj-1] =
  2354.                   (a_hi>>1 ~ a_lo<<31 ~ a_hi>>8 ~ a_lo<<24 ~ a_hi>>7)
  2355.                   + (b_hi>>19 ~ b_lo<<13 ~ b_hi<<3 ~ b_lo>>29 ~ b_hi>>6)
  2356.                   + W[jj-15] + W[jj-33] + floor(tmp / 2^32)  -- floor(tmp / 2^32) is the carry out of the low half
  2357.                W[jj] = 0|((tmp + 2^31) % 2^32 - 2^31)  -- fold into -2^31..2^31-1 and convert the float to an integer (0| ...)
  2358.             end
  2359.             local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2360.             local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2361.             for j = 1, 80 do  -- 80 rounds
  2362.                local jj = 2*j
  2363.                local z_lo = (e_lo>>14 ~ e_hi<<18 ~ e_lo>>18 ~ e_hi<<14 ~ e_lo<<23 ~ e_hi>>9) % 2^32 + (g_lo ~ e_lo & (f_lo ~ g_lo)) % 2^32 + h_lo % 2^32 + K_lo[j] + W[jj] % 2^32  -- low-half sum as a float; K_lo[j] is added as stored (presumably already non-negative - confirm where the table is built)
  2364.                local z_hi = (e_hi>>14 ~ e_lo<<18 ~ e_hi>>18 ~ e_lo<<14 ~ e_hi<<23 ~ e_lo>>9) + (g_hi ~ e_hi & (f_hi ~ g_hi)) + h_hi + K_hi[j] + W[jj-1] + floor(z_lo / 2^32)  -- high half plus the carry from the low half
  2365.                z_lo = z_lo % 2^32
  2366.                h_lo = g_lo;  h_hi = g_hi
  2367.                g_lo = f_lo;  g_hi = f_hi
  2368.                f_lo = e_lo;  f_hi = e_hi
  2369.                e_lo = z_lo + d_lo % 2^32  -- e = d + z, low half first (float), then the carry goes into the high half
  2370.                e_hi = z_hi + d_hi + floor(e_lo / 2^32)
  2371.                e_lo = 0|((e_lo + 2^31) % 2^32 - 2^31)
  2372.                d_lo = c_lo;  d_hi = c_hi
  2373.                c_lo = b_lo;  c_hi = b_hi
  2374.                b_lo = a_lo;  b_hi = a_hi
  2375.                z_lo = z_lo + (d_lo & c_lo ~ b_lo & (d_lo ~ c_lo)) % 2^32 + (b_lo>>28 ~ b_hi<<4 ~ b_lo<<30 ~ b_hi>>2 ~ b_lo<<25 ~ b_hi>>7) % 2^32  -- b, c, d now hold the pre-round a, b, c
  2376.                a_hi = z_hi + (d_hi & c_hi ~ b_hi & (d_hi ~ c_hi)) + (b_hi>>28 ~ b_lo<<4 ~ b_hi<<30 ~ b_lo>>2 ~ b_hi<<25 ~ b_lo>>7) + floor(z_lo / 2^32)
  2377.                a_lo = 0|((z_lo + 2^31) % 2^32 - 2^31)
  2378.             end
  2379.             a_lo = h1_lo % 2^32 + a_lo % 2^32  -- add the working variables back into the chaining value with lo-to-hi carry; a_lo is reused as a float scratch from here on
  2380.             h1_hi = h1_hi + a_hi + floor(a_lo / 2^32)
  2381.             h1_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2382.             a_lo = h2_lo % 2^32 + b_lo % 2^32
  2383.             h2_hi = h2_hi + b_hi + floor(a_lo / 2^32)
  2384.             h2_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2385.             a_lo = h3_lo % 2^32 + c_lo % 2^32
  2386.             h3_hi = h3_hi + c_hi + floor(a_lo / 2^32)
  2387.             h3_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2388.             a_lo = h4_lo % 2^32 + d_lo % 2^32
  2389.             h4_hi = h4_hi + d_hi + floor(a_lo / 2^32)
  2390.             h4_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2391.             a_lo = h5_lo % 2^32 + e_lo % 2^32
  2392.             h5_hi = h5_hi + e_hi + floor(a_lo / 2^32)
  2393.             h5_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2394.             a_lo = h6_lo % 2^32 + f_lo % 2^32
  2395.             h6_hi = h6_hi + f_hi + floor(a_lo / 2^32)
  2396.             h6_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2397.             a_lo = h7_lo % 2^32 + g_lo % 2^32
  2398.             h7_hi = h7_hi + g_hi + floor(a_lo / 2^32)
  2399.             h7_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2400.             a_lo = h8_lo % 2^32 + h_lo % 2^32
  2401.             h8_hi = h8_hi + h_hi + floor(a_lo / 2^32)
  2402.             h8_lo = 0|((a_lo + 2^31) % 2^32 - 2^31)
  2403.          end
  2404.          H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2405.          H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2406.       end
  2407.  
  2408.       local function md5_feed_64(H, str, offs, size)  -- MD5 compression: folds each 64-byte block of str from offs+1 to offs+size into the 4-word state H (updated in place, nothing returned)
  2409.          -- offs >= 0, size >= 0, size is multiple of 64
  2410.          local W, K, md5_next_shift = common_W, md5_K, md5_next_shift  -- W: shared scratch for message words; K: per-step constants; md5_next_shift: maps the current s to the s of the next step
  2411.          local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  2412.          for pos = offs + 1, offs + size, 64 do
  2413.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2414.                string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 16 little-endian signed 32-bit words
  2415.             local a, b, c, d = h1, h2, h3, h4
  2416.             local s = 32-7  -- s is 32 minus the left-rotation amount of the current step (first step rotates left by 7)
  2417.             for j = 1, 16 do  -- steps 1..16: message words are used in order
  2418.                local F = (d ~ b & (c ~ d)) + a + K[j] + W[j]
  2419.                a = d
  2420.                d = c
  2421.                c = b
  2422.                b = (F << 32-s | F>>s) + b  -- with 32-bit integers this rotates F left by 32-s bits, then adds b
  2423.                s = md5_next_shift[s]
  2424.             end
  2425.             s = 32-5
  2426.             for j = 17, 32 do  -- steps 17..32: message word index is ((5*j-4) & 15) + 1
  2427.                local F = (c ~ d & (b ~ c)) + a + K[j] + W[(5*j-4 & 15) + 1]
  2428.                a = d
  2429.                d = c
  2430.                c = b
  2431.                b = (F << 32-s | F>>s) + b
  2432.                s = md5_next_shift[s]
  2433.             end
  2434.             s = 32-4
  2435.             for j = 33, 48 do  -- steps 33..48: message word index is ((3*j+2) & 15) + 1
  2436.                local F = (b ~ c ~ d) + a + K[j] + W[(3*j+2 & 15) + 1]
  2437.                a = d
  2438.                d = c
  2439.                c = b
  2440.                b = (F << 32-s | F>>s) + b
  2441.                s = md5_next_shift[s]
  2442.             end
  2443.             s = 32-6
  2444.             for j = 49, 64 do  -- steps 49..64: message word index is ((j*7-7) & 15) + 1
  2445.                local F = (c ~ (b | ~d)) + a + K[j] + W[(j*7-7 & 15) + 1]
  2446.                a = d
  2447.                d = c
  2448.                c = b
  2449.                b = (F << 32-s | F>>s) + b
  2450.                s = md5_next_shift[s]
  2451.             end
  2452.             h1 = a + h1  -- add the working variables back into the chaining value
  2453.             h2 = b + h2
  2454.             h3 = c + h3
  2455.             h4 = d + h4
  2456.          end
  2457.          H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  2458.       end
  2459.  
  2460.       local function sha1_feed_64(H, str, offs, size)  -- SHA-1 compression: folds each 64-byte block of str from offs+1 to offs+size into the 5-word state H (updated in place, nothing returned)
  2461.          -- offs >= 0, size >= 0, size is multiple of 64
  2462.          local W = common_W  -- shared message-schedule scratch table
  2463.          local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
  2464.          for pos = offs + 1, offs + size, 64 do
  2465.             W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2466.                string_unpack(">i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 16 big-endian signed 32-bit words
  2467.             for j = 17, 80 do  -- extend the 16 message words to 80 schedule words
  2468.                local a = W[j-3] ~ W[j-8] ~ W[j-14] ~ W[j-16]
  2469.                W[j] = a << 1 ~ a >> 31  -- with 32-bit integers: rotate left by 1
  2470.             end
  2471.             local a, b, c, d, e = h1, h2, h3, h4, h5
  2472.             for j = 1, 20 do  -- rounds 1..20
  2473.                local z = (a << 5 ~ a >> 27) + (d ~ b & (c ~ d)) + 0x5A827999 + W[j] + e      -- constant = floor(2^30 * sqrt(2))
  2474.                e = d
  2475.                d = c
  2476.                c = b << 30 ~ b >> 2  -- rotate left by 30
  2477.                b = a
  2478.                a = z
  2479.             end
  2480.             for j = 21, 40 do  -- rounds 21..40
  2481.                local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0x6ED9EBA1 + W[j] + e            -- 2^30 * sqrt(3)
  2482.                e = d
  2483.                d = c
  2484.                c = b << 30 ~ b >> 2
  2485.                b = a
  2486.                a = z
  2487.             end
  2488.             for j = 41, 60 do  -- rounds 41..60
  2489.                local z = (a << 5 ~ a >> 27) + ((b ~ c) & d ~ b & c) + 0x8F1BBCDC + W[j] + e  -- 2^30 * sqrt(5)
  2490.                e = d
  2491.                d = c
  2492.                c = b << 30 ~ b >> 2
  2493.                b = a
  2494.                a = z
  2495.             end
  2496.             for j = 61, 80 do  -- rounds 61..80
  2497.                local z = (a << 5 ~ a >> 27) + (b ~ c ~ d) + 0xCA62C1D6 + W[j] + e            -- 2^30 * sqrt(10)
  2498.                e = d
  2499.                d = c
  2500.                c = b << 30 ~ b >> 2
  2501.                b = a
  2502.                a = z
  2503.             end
  2504.             h1 = a + h1  -- add the working variables back into the chaining value
  2505.             h2 = b + h2
  2506.             h3 = c + h3
  2507.             h4 = d + h4
  2508.             h5 = e + h5
  2509.          end
  2510.          H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
  2511.       end
  2512.  
  2513.       local keccak_format_i4i4 = build_keccak_format("i4i4")  -- table indexed by lane count (see keccak_feed) giving a string.unpack format; NOTE(review): assumed to describe two 32-bit values per 64-bit lane - confirm in build_keccak_format
  2514.  
  2515.       local function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)  -- Keccak absorb step for 32-bit integers: XORs each block of str into the 25-lane state (lanes kept as lo/hi 32-bit halves) and runs 24 rounds per block; lanes_lo/lanes_hi are updated in place, nothing is returned
  2516.          -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
  2517.          local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi  -- low/high halves of the per-round constants
  2518.          local qwords_qty = block_size_in_bytes / 8  -- number of 64-bit lanes per block (a float, since / is float division)
  2519.          local keccak_format = keccak_format_i4i4[qwords_qty]  -- unpack format for that many lanes
  2520.          for pos = offs + 1, offs + size, block_size_in_bytes do
  2521.             local dwords_from_message = {string_unpack(keccak_format, str, pos)}
  2522.             for j = 1, qwords_qty do  -- odd entries are XORed into the low halves, even entries into the high halves
  2523.                lanes_lo[j] = lanes_lo[j] ~ dwords_from_message[2*j-1]
  2524.                lanes_hi[j] = lanes_hi[j] ~ dwords_from_message[2*j]
  2525.             end
  2526.             local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,  -- copy the whole 25-lane state into locals for the round loop
  2527.                L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
  2528.                L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
  2529.                lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
  2530.                lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
  2531.                lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
  2532.                lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
  2533.                lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
  2534.             for round_idx = 1, 24 do  -- 24 rounds of the permutation
  2535.                local C1_lo = L01_lo ~ L06_lo ~ L11_lo ~ L16_lo ~ L21_lo  -- C1..C5: XOR of the five lanes whose indices are congruent modulo 5
  2536.                local C1_hi = L01_hi ~ L06_hi ~ L11_hi ~ L16_hi ~ L21_hi
  2537.                local C2_lo = L02_lo ~ L07_lo ~ L12_lo ~ L17_lo ~ L22_lo
  2538.                local C2_hi = L02_hi ~ L07_hi ~ L12_hi ~ L17_hi ~ L22_hi
  2539.                local C3_lo = L03_lo ~ L08_lo ~ L13_lo ~ L18_lo ~ L23_lo
  2540.                local C3_hi = L03_hi ~ L08_hi ~ L13_hi ~ L18_hi ~ L23_hi
  2541.                local C4_lo = L04_lo ~ L09_lo ~ L14_lo ~ L19_lo ~ L24_lo
  2542.                local C4_hi = L04_hi ~ L09_hi ~ L14_hi ~ L19_hi ~ L24_hi
  2543.                local C5_lo = L05_lo ~ L10_lo ~ L15_lo ~ L20_lo ~ L25_lo
  2544.                local C5_hi = L05_hi ~ L10_hi ~ L15_hi ~ L20_hi ~ L25_hi
  2545.                local D_lo = C1_lo ~ C3_lo<<1 ~ C3_hi>>31  -- D = C1 XOR (C3 rotated left by 1 as a 64-bit value held in two halves)
  2546.                local D_hi = C1_hi ~ C3_hi<<1 ~ C3_lo>>31
  2547.                local T0_lo = D_lo ~ L02_lo  -- T0..T4: lanes 2, 7, 12, 17, 22 with D mixed in
  2548.                local T0_hi = D_hi ~ L02_hi
  2549.                local T1_lo = D_lo ~ L07_lo
  2550.                local T1_hi = D_hi ~ L07_hi
  2551.                local T2_lo = D_lo ~ L12_lo
  2552.                local T2_hi = D_hi ~ L12_hi
  2553.                local T3_lo = D_lo ~ L17_lo
  2554.                local T3_hi = D_hi ~ L17_hi
  2555.                local T4_lo = D_lo ~ L22_lo
  2556.                local T4_hi = D_hi ~ L22_hi
  2557.                L02_lo = T1_lo>>20 ~ T1_hi<<12  -- each T is rotated by a fixed amount (64-bit rotation across the two halves) and stored into a different lane of the same group
  2558.                L02_hi = T1_hi>>20 ~ T1_lo<<12
  2559.                L07_lo = T3_lo>>19 ~ T3_hi<<13
  2560.                L07_hi = T3_hi>>19 ~ T3_lo<<13
  2561.                L12_lo = T0_lo<<1 ~ T0_hi>>31
  2562.                L12_hi = T0_hi<<1 ~ T0_lo>>31
  2563.                L17_lo = T2_lo<<10 ~ T2_hi>>22
  2564.                L17_hi = T2_hi<<10 ~ T2_lo>>22
  2565.                L22_lo = T4_lo<<2 ~ T4_hi>>30
  2566.                L22_hi = T4_hi<<2 ~ T4_lo>>30
  2567.                D_lo = C2_lo ~ C4_lo<<1 ~ C4_hi>>31  -- same treatment for lanes 3, 8, 13, 18, 23
  2568.                D_hi = C2_hi ~ C4_hi<<1 ~ C4_lo>>31
  2569.                T0_lo = D_lo ~ L03_lo
  2570.                T0_hi = D_hi ~ L03_hi
  2571.                T1_lo = D_lo ~ L08_lo
  2572.                T1_hi = D_hi ~ L08_hi
  2573.                T2_lo = D_lo ~ L13_lo
  2574.                T2_hi = D_hi ~ L13_hi
  2575.                T3_lo = D_lo ~ L18_lo
  2576.                T3_hi = D_hi ~ L18_hi
  2577.                T4_lo = D_lo ~ L23_lo
  2578.                T4_hi = D_hi ~ L23_hi
  2579.                L03_lo = T2_lo>>21 ~ T2_hi<<11
  2580.                L03_hi = T2_hi>>21 ~ T2_lo<<11
  2581.                L08_lo = T4_lo>>3 ~ T4_hi<<29
  2582.                L08_hi = T4_hi>>3 ~ T4_lo<<29
  2583.                L13_lo = T1_lo<<6 ~ T1_hi>>26
  2584.                L13_hi = T1_hi<<6 ~ T1_lo>>26
  2585.                L18_lo = T3_lo<<15 ~ T3_hi>>17
  2586.                L18_hi = T3_hi<<15 ~ T3_lo>>17
  2587.                L23_lo = T0_lo>>2 ~ T0_hi<<30
  2588.                L23_hi = T0_hi>>2 ~ T0_lo<<30
  2589.                D_lo = C3_lo ~ C5_lo<<1 ~ C5_hi>>31  -- same treatment for lanes 4, 9, 14, 19, 24
  2590.                D_hi = C3_hi ~ C5_hi<<1 ~ C5_lo>>31
  2591.                T0_lo = D_lo ~ L04_lo
  2592.                T0_hi = D_hi ~ L04_hi
  2593.                T1_lo = D_lo ~ L09_lo
  2594.                T1_hi = D_hi ~ L09_hi
  2595.                T2_lo = D_lo ~ L14_lo
  2596.                T2_hi = D_hi ~ L14_hi
  2597.                T3_lo = D_lo ~ L19_lo
  2598.                T3_hi = D_hi ~ L19_hi
  2599.                T4_lo = D_lo ~ L24_lo
  2600.                T4_hi = D_hi ~ L24_hi
  2601.                L04_lo = T3_lo<<21 ~ T3_hi>>11
  2602.                L04_hi = T3_hi<<21 ~ T3_lo>>11
  2603.                L09_lo = T0_lo<<28 ~ T0_hi>>4
  2604.                L09_hi = T0_hi<<28 ~ T0_lo>>4
  2605.                L14_lo = T2_lo<<25 ~ T2_hi>>7
  2606.                L14_hi = T2_hi<<25 ~ T2_lo>>7
  2607.                L19_lo = T4_lo>>8 ~ T4_hi<<24
  2608.                L19_hi = T4_hi>>8 ~ T4_lo<<24
  2609.                L24_lo = T1_lo>>9 ~ T1_hi<<23
  2610.                L24_hi = T1_hi>>9 ~ T1_lo<<23
  2611.                D_lo = C4_lo ~ C1_lo<<1 ~ C1_hi>>31  -- same treatment for lanes 5, 10, 15, 20, 25
  2612.                D_hi = C4_hi ~ C1_hi<<1 ~ C1_lo>>31
  2613.                T0_lo = D_lo ~ L05_lo
  2614.                T0_hi = D_hi ~ L05_hi
  2615.                T1_lo = D_lo ~ L10_lo
  2616.                T1_hi = D_hi ~ L10_hi
  2617.                T2_lo = D_lo ~ L15_lo
  2618.                T2_hi = D_hi ~ L15_hi
  2619.                T3_lo = D_lo ~ L20_lo
  2620.                T3_hi = D_hi ~ L20_hi
  2621.                T4_lo = D_lo ~ L25_lo
  2622.                T4_hi = D_hi ~ L25_hi
  2623.                L05_lo = T4_lo<<14 ~ T4_hi>>18
  2624.                L05_hi = T4_hi<<14 ~ T4_lo>>18
  2625.                L10_lo = T1_lo<<20 ~ T1_hi>>12
  2626.                L10_hi = T1_hi<<20 ~ T1_lo>>12
  2627.                L15_lo = T3_lo<<8 ~ T3_hi>>24
  2628.                L15_hi = T3_hi<<8 ~ T3_lo>>24
  2629.                L20_lo = T0_lo<<27 ~ T0_hi>>5
  2630.                L20_hi = T0_hi<<27 ~ T0_lo>>5
  2631.                L25_lo = T2_lo>>25 ~ T2_hi<<7
  2632.                L25_hi = T2_hi>>25 ~ T2_lo<<7
  2633.                D_lo = C5_lo ~ C2_lo<<1 ~ C2_hi>>31  -- lanes 1, 6, 11, 16, 21; lane 1 receives D without rotation (see the L01 assignments below)
  2634.                D_hi = C5_hi ~ C2_hi<<1 ~ C2_lo>>31
  2635.                T1_lo = D_lo ~ L06_lo
  2636.                T1_hi = D_hi ~ L06_hi
  2637.                T2_lo = D_lo ~ L11_lo
  2638.                T2_hi = D_hi ~ L11_hi
  2639.                T3_lo = D_lo ~ L16_lo
  2640.                T3_hi = D_hi ~ L16_hi
  2641.                T4_lo = D_lo ~ L21_lo
  2642.                T4_hi = D_hi ~ L21_hi
  2643.                L06_lo = T2_lo<<3 ~ T2_hi>>29
  2644.                L06_hi = T2_hi<<3 ~ T2_lo>>29
  2645.                L11_lo = T4_lo<<18 ~ T4_hi>>14
  2646.                L11_hi = T4_hi<<18 ~ T4_lo>>14
  2647.                L16_lo = T1_lo>>28 ~ T1_hi<<4
  2648.                L16_hi = T1_hi>>28 ~ T1_lo<<4
  2649.                L21_lo = T3_lo>>23 ~ T3_hi<<9
  2650.                L21_hi = T3_hi>>23 ~ T3_lo<<9
  2651.                L01_lo = D_lo ~ L01_lo
  2652.                L01_hi = D_hi ~ L01_hi
  2653.                L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = L01_lo ~ ~L02_lo & L03_lo, L02_lo ~ ~L03_lo & L04_lo, L03_lo ~ ~L04_lo & L05_lo, L04_lo ~ ~L05_lo & L01_lo, L05_lo ~ ~L01_lo & L02_lo  -- nonlinear step per group of five lanes: each result is x ~ (~y & z); multiple assignment evaluates every right-hand side before storing
  2654.                L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = L01_hi ~ ~L02_hi & L03_hi, L02_hi ~ ~L03_hi & L04_hi, L03_hi ~ ~L04_hi & L05_hi, L04_hi ~ ~L05_hi & L01_hi, L05_hi ~ ~L01_hi & L02_hi
  2655.                L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = L09_lo ~ ~L10_lo & L06_lo, L10_lo ~ ~L06_lo & L07_lo, L06_lo ~ ~L07_lo & L08_lo, L07_lo ~ ~L08_lo & L09_lo, L08_lo ~ ~L09_lo & L10_lo
  2656.                L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = L09_hi ~ ~L10_hi & L06_hi, L10_hi ~ ~L06_hi & L07_hi, L06_hi ~ ~L07_hi & L08_hi, L07_hi ~ ~L08_hi & L09_hi, L08_hi ~ ~L09_hi & L10_hi
  2657.                L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = L12_lo ~ ~L13_lo & L14_lo, L13_lo ~ ~L14_lo & L15_lo, L14_lo ~ ~L15_lo & L11_lo, L15_lo ~ ~L11_lo & L12_lo, L11_lo ~ ~L12_lo & L13_lo
  2658.                L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = L12_hi ~ ~L13_hi & L14_hi, L13_hi ~ ~L14_hi & L15_hi, L14_hi ~ ~L15_hi & L11_hi, L15_hi ~ ~L11_hi & L12_hi, L11_hi ~ ~L12_hi & L13_hi
  2659.                L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = L20_lo ~ ~L16_lo & L17_lo, L16_lo ~ ~L17_lo & L18_lo, L17_lo ~ ~L18_lo & L19_lo, L18_lo ~ ~L19_lo & L20_lo, L19_lo ~ ~L20_lo & L16_lo
  2660.                L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = L20_hi ~ ~L16_hi & L17_hi, L16_hi ~ ~L17_hi & L18_hi, L17_hi ~ ~L18_hi & L19_hi, L18_hi ~ ~L19_hi & L20_hi, L19_hi ~ ~L20_hi & L16_hi
  2661.                L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = L23_lo ~ ~L24_lo & L25_lo, L24_lo ~ ~L25_lo & L21_lo, L25_lo ~ ~L21_lo & L22_lo, L21_lo ~ ~L22_lo & L23_lo, L22_lo ~ ~L23_lo & L24_lo
  2662.                L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = L23_hi ~ ~L24_hi & L25_hi, L24_hi ~ ~L25_hi & L21_hi, L25_hi ~ ~L21_hi & L22_hi, L21_hi ~ ~L22_hi & L23_hi, L22_hi ~ ~L23_hi & L24_hi
  2663.                L01_lo = L01_lo ~ RC_lo[round_idx]  -- the round constant is XORed into lane 1 only
  2664.                L01_hi = L01_hi ~ RC_hi[round_idx]
  2665.             end
  2666.             lanes_lo[1]  = L01_lo;  lanes_hi[1]  = L01_hi  -- write the permuted state back into the caller's tables
  2667.             lanes_lo[2]  = L02_lo;  lanes_hi[2]  = L02_hi
  2668.             lanes_lo[3]  = L03_lo;  lanes_hi[3]  = L03_hi
  2669.             lanes_lo[4]  = L04_lo;  lanes_hi[4]  = L04_hi
  2670.             lanes_lo[5]  = L05_lo;  lanes_hi[5]  = L05_hi
  2671.             lanes_lo[6]  = L06_lo;  lanes_hi[6]  = L06_hi
  2672.             lanes_lo[7]  = L07_lo;  lanes_hi[7]  = L07_hi
  2673.             lanes_lo[8]  = L08_lo;  lanes_hi[8]  = L08_hi
  2674.             lanes_lo[9]  = L09_lo;  lanes_hi[9]  = L09_hi
  2675.             lanes_lo[10] = L10_lo;  lanes_hi[10] = L10_hi
  2676.             lanes_lo[11] = L11_lo;  lanes_hi[11] = L11_hi
  2677.             lanes_lo[12] = L12_lo;  lanes_hi[12] = L12_hi
  2678.             lanes_lo[13] = L13_lo;  lanes_hi[13] = L13_hi
  2679.             lanes_lo[14] = L14_lo;  lanes_hi[14] = L14_hi
  2680.             lanes_lo[15] = L15_lo;  lanes_hi[15] = L15_hi
  2681.             lanes_lo[16] = L16_lo;  lanes_hi[16] = L16_hi
  2682.             lanes_lo[17] = L17_lo;  lanes_hi[17] = L17_hi
  2683.             lanes_lo[18] = L18_lo;  lanes_hi[18] = L18_hi
  2684.             lanes_lo[19] = L19_lo;  lanes_hi[19] = L19_hi
  2685.             lanes_lo[20] = L20_lo;  lanes_hi[20] = L20_hi
  2686.             lanes_lo[21] = L21_lo;  lanes_hi[21] = L21_hi
  2687.             lanes_lo[22] = L22_lo;  lanes_hi[22] = L22_hi
  2688.             lanes_lo[23] = L23_lo;  lanes_hi[23] = L23_hi
  2689.             lanes_lo[24] = L24_lo;  lanes_hi[24] = L24_hi
  2690.             lanes_lo[25] = L25_lo;  lanes_hi[25] = L25_hi
  2691.          end
  2692.       end
  2693.  
  2694.       local function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- Compresses the 64-byte blocks located in str at byte positions offs+1 .. offs+size into the
                 -- eight-word state table H (updated in place) and returns the updated bytes_compressed counter.
                 -- When str is nil/false nothing is unpacked: the 16 words already stored in common_W are used.
                 -- A truthy last_block_size is the byte count added to the counter for the block (instead of 64)
                 -- and also triggers the f0 inversion below; a truthy is_last_node triggers the f1 inversion.
  2695.          -- offs >= 0, size >= 0, size is multiple of 64
  2696.          local W = common_W  -- shared scratch table holding the 16 message words of the current block
  2697.          local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  2698.          for pos = offs + 1, offs + size, 64 do
  2699.             if str then
  2700.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  2701.                   string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 16 little-endian signed 32-bit words
  2702.             end
  2703.             local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8  -- first half of the work vector = current state
  2704.             local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]  -- second half comes from sha2_H_hi (defined outside this block; presumably the SHA-256 initial values that BLAKE2s reuses as IV - confirm)
  2705.             bytes_compressed = bytes_compressed + (last_block_size or 64)  -- the counter includes the block being compressed
  2706.             local t0 = bytes_compressed % 2^32
  2707.             local t1 = (bytes_compressed - t0) / 2^32
  2708.             t0 = (t0 + 2^31) % 2^32 - 2^31  -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
  2709.             vC = vC ~ t0  -- t0 = low_4_bytes(bytes_compressed)
  2710.             vD = vD ~ t1  -- t1 = high_4_bytes(bytes_compressed)
  2711.             if last_block_size then  -- flag f0
  2712.                vE = ~vE
  2713.             end
  2714.             if is_last_node then  -- flag f1
  2715.                vF = ~vF
  2716.             end
  2717.             for j = 1, 10 do  -- 10 rounds, each made of 4 column mixes followed by 4 diagonal mixes
  2718.                local row = sigma[j]  -- order in which this round consumes the message words
  2719.                v0 = v0 + v4 + W[row[1]]  -- column mix of (v0, v4, v8, vC)
  2720.                vC = vC ~ v0
  2721.                vC = vC >> 16 | vC << 16  -- rotate right by 16; the shift pairs act as rotations only with 32-bit integers (NOTE(review): the int32 comment above suggests that is the case here - confirm)
  2722.                v8 = v8 + vC
  2723.                v4 = v4 ~ v8
  2724.                v4 = v4 >> 12 | v4 << 20  -- rotate right by 12
  2725.                v0 = v0 + v4 + W[row[2]]
  2726.                vC = vC ~ v0
  2727.                vC = vC >> 8 | vC << 24  -- rotate right by 8
  2728.                v8 = v8 + vC
  2729.                v4 = v4 ~ v8
  2730.                v4 = v4 >> 7 | v4 << 25  -- rotate right by 7
  2731.                v1 = v1 + v5 + W[row[3]]  -- column mix of (v1, v5, v9, vD)
  2732.                vD = vD ~ v1
  2733.                vD = vD >> 16 | vD << 16
  2734.                v9 = v9 + vD
  2735.                v5 = v5 ~ v9
  2736.                v5 = v5 >> 12 | v5 << 20
  2737.                v1 = v1 + v5 + W[row[4]]
  2738.                vD = vD ~ v1
  2739.                vD = vD >> 8 | vD << 24
  2740.                v9 = v9 + vD
  2741.                v5 = v5 ~ v9
  2742.                v5 = v5 >> 7 | v5 << 25
  2743.                v2 = v2 + v6 + W[row[5]]  -- column mix of (v2, v6, vA, vE)
  2744.                vE = vE ~ v2
  2745.                vE = vE >> 16 | vE << 16
  2746.                vA = vA + vE
  2747.                v6 = v6 ~ vA
  2748.                v6 = v6 >> 12 | v6 << 20
  2749.                v2 = v2 + v6 + W[row[6]]
  2750.                vE = vE ~ v2
  2751.                vE = vE >> 8 | vE << 24
  2752.                vA = vA + vE
  2753.                v6 = v6 ~ vA
  2754.                v6 = v6 >> 7 | v6 << 25
  2755.                v3 = v3 + v7 + W[row[7]]  -- column mix of (v3, v7, vB, vF)
  2756.                vF = vF ~ v3
  2757.                vF = vF >> 16 | vF << 16
  2758.                vB = vB + vF
  2759.                v7 = v7 ~ vB
  2760.                v7 = v7 >> 12 | v7 << 20
  2761.                v3 = v3 + v7 + W[row[8]]
  2762.                vF = vF ~ v3
  2763.                vF = vF >> 8 | vF << 24
  2764.                vB = vB + vF
  2765.                v7 = v7 ~ vB
  2766.                v7 = v7 >> 7 | v7 << 25
  2767.                v0 = v0 + v5 + W[row[9]]  -- diagonal mix of (v0, v5, vA, vF)
  2768.                vF = vF ~ v0
  2769.                vF = vF >> 16 | vF << 16
  2770.                vA = vA + vF
  2771.                v5 = v5 ~ vA
  2772.                v5 = v5 >> 12 | v5 << 20
  2773.                v0 = v0 + v5 + W[row[10]]
  2774.                vF = vF ~ v0
  2775.                vF = vF >> 8 | vF << 24
  2776.                vA = vA + vF
  2777.                v5 = v5 ~ vA
  2778.                v5 = v5 >> 7 | v5 << 25
  2779.                v1 = v1 + v6 + W[row[11]]  -- diagonal mix of (v1, v6, vB, vC)
  2780.                vC = vC ~ v1
  2781.                vC = vC >> 16 | vC << 16
  2782.                vB = vB + vC
  2783.                v6 = v6 ~ vB
  2784.                v6 = v6 >> 12 | v6 << 20
  2785.                v1 = v1 + v6 + W[row[12]]
  2786.                vC = vC ~ v1
  2787.                vC = vC >> 8 | vC << 24
  2788.                vB = vB + vC
  2789.                v6 = v6 ~ vB
  2790.                v6 = v6 >> 7 | v6 << 25
  2791.                v2 = v2 + v7 + W[row[13]]  -- diagonal mix of (v2, v7, v8, vD)
  2792.                vD = vD ~ v2
  2793.                vD = vD >> 16 | vD << 16
  2794.                v8 = v8 + vD
  2795.                v7 = v7 ~ v8
  2796.                v7 = v7 >> 12 | v7 << 20
  2797.                v2 = v2 + v7 + W[row[14]]
  2798.                vD = vD ~ v2
  2799.                vD = vD >> 8 | vD << 24
  2800.                v8 = v8 + vD
  2801.                v7 = v7 ~ v8
  2802.                v7 = v7 >> 7 | v7 << 25
  2803.                v3 = v3 + v4 + W[row[15]]  -- diagonal mix of (v3, v4, v9, vE)
  2804.                vE = vE ~ v3
  2805.                vE = vE >> 16 | vE << 16
  2806.                v9 = v9 + vE
  2807.                v4 = v4 ~ v9
  2808.                v4 = v4 >> 12 | v4 << 20
  2809.                v3 = v3 + v4 + W[row[16]]
  2810.                vE = vE ~ v3
  2811.                vE = vE >> 8 | vE << 24
  2812.                v9 = v9 + vE
  2813.                v4 = v4 ~ v9
  2814.                v4 = v4 >> 7 | v4 << 25
  2815.             end
  2816.             h1 = h1 ~ v0 ~ v8  -- fold both halves of the work vector back into the state
  2817.             h2 = h2 ~ v1 ~ v9
  2818.             h3 = h3 ~ v2 ~ vA
  2819.             h4 = h4 ~ v3 ~ vB
  2820.             h5 = h5 ~ v4 ~ vC
  2821.             h6 = h6 ~ v5 ~ vD
  2822.             h7 = h7 ~ v6 ~ vE
  2823.             h8 = h8 ~ v7 ~ vF
  2824.          end
  2825.          H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8  -- write the state back only once, after all blocks
  2826.          return bytes_compressed
  2827.       end
  2828.  
  2829.       local function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
                 -- Compresses the 128-byte blocks located in str at byte positions offs+1 .. offs+size into the
                 -- state, whose eight 64-bit words are kept as 32-bit halves in H_lo[1..8] (low) and H_hi[1..8] (high).
                 -- Both tables are updated in place; the updated bytes_compressed counter is returned.
                 -- When str is nil/false nothing is unpacked: the 32 half-words already stored in common_W are used.
                 -- A truthy last_block_size is the byte count added to the counter for the block (instead of 128)
                 -- and also triggers the f0 inversion below; a truthy is_last_node triggers the f1 inversion.
  2830.          -- offs >= 0, size >= 0, size is multiple of 128
  2831.          local W = common_W  -- message word n is stored as W[2*n-1] (low half) and W[2*n] (high half)
  2832.          local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  2833.          local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  2834.          for pos = offs + 1, offs + size, 128 do
  2835.             if str then
  2836.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16],
  2837.                W[17], W[18], W[19], W[20], W[21], W[22], W[23], W[24], W[25], W[26], W[27], W[28], W[29], W[30], W[31], W[32] =
  2838.                   string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 32 little-endian signed 32-bit half-words
  2839.             end
  2840.             local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  2841.             local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  2842.             local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]  -- second half of the work vector comes from sha2_H_lo / sha2_H_hi (defined outside this block; presumably the SHA-512 initial values that BLAKE2b reuses as IV - confirm)
  2843.             local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  2844.             bytes_compressed = bytes_compressed + (last_block_size or 128)  -- the counter includes the block being compressed
  2845.             local t0_lo = bytes_compressed % 2^32
  2846.             local t0_hi = (bytes_compressed - t0_lo) / 2^32
  2847.             t0_lo = (t0_lo + 2^31) % 2^32 - 2^31  -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while XORing
  2848.             vC_lo = vC_lo ~ t0_lo  -- t0 = low_8_bytes(bytes_compressed)
  2849.             vC_hi = vC_hi ~ t0_hi
  2850.             -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
  2851.             if last_block_size then  -- flag f0
  2852.                vE_lo = ~vE_lo
  2853.                vE_hi = ~vE_hi
  2854.             end
  2855.             if is_last_node then  -- flag f1
  2856.                vF_lo = ~vF_lo
  2857.                vF_hi = ~vF_hi
  2858.             end
  2859.             for j = 1, 12 do  -- 12 rounds, each made of 4 column mixes followed by 4 diagonal mixes
  2860.                local row = sigma[j]  -- order in which this round consumes the message words
  2861.                local k = row[1] * 2  -- column mix of (v0, v4, v8, vC); k indexes the high half of the chosen message word
  2862.                v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32  -- 64-bit add done in halves: low halves are reduced to 0..2^32-1 and summed, so the sum may exceed 2^32
  2863.                v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]  -- high halves plus the carry out of the low sum
  2864.                v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)  -- wrap the low half into the signed 32-bit range; 0| turns the float result back into an integer
  2865.                vC_lo, vC_hi = vC_hi ~ v0_hi, vC_lo ~ v0_lo  -- XOR combined with a rotation by 32 bits (the halves swap places)
  2866.                v8_lo = v8_lo % 2^32 + vC_lo % 2^32
  2867.                v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
  2868.                v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2869.                v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
  2870.                v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8  -- 64-bit rotate right by 24 across the two halves
  2871.                k = row[2] * 2
  2872.                v0_lo = v0_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  2873.                v0_hi = v0_hi + v4_hi + floor(v0_lo / 2^32) + W[k]
  2874.                v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2875.                vC_lo, vC_hi = vC_lo ~ v0_lo, vC_hi ~ v0_hi
  2876.                vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16  -- 64-bit rotate right by 16
  2877.                v8_lo = v8_lo % 2^32 + vC_lo % 2^32
  2878.                v8_hi = v8_hi + vC_hi + floor(v8_lo / 2^32)
  2879.                v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2880.                v4_lo, v4_hi = v4_lo ~ v8_lo, v4_hi ~ v8_hi
  2881.                v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31  -- 64-bit rotate right by 63, written as a rotate left by 1
  2882.                k = row[3] * 2  -- column mix of (v1, v5, v9, vD)
  2883.                v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2884.                v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
  2885.                v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2886.                vD_lo, vD_hi = vD_hi ~ v1_hi, vD_lo ~ v1_lo
  2887.                v9_lo = v9_lo % 2^32 + vD_lo % 2^32
  2888.                v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
  2889.                v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2890.                v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
  2891.                v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
  2892.                k = row[4] * 2
  2893.                v1_lo = v1_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2894.                v1_hi = v1_hi + v5_hi + floor(v1_lo / 2^32) + W[k]
  2895.                v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2896.                vD_lo, vD_hi = vD_lo ~ v1_lo, vD_hi ~ v1_hi
  2897.                vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
  2898.                v9_lo = v9_lo % 2^32 + vD_lo % 2^32
  2899.                v9_hi = v9_hi + vD_hi + floor(v9_lo / 2^32)
  2900.                v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  2901.                v5_lo, v5_hi = v5_lo ~ v9_lo, v5_hi ~ v9_hi
  2902.                v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
  2903.                k = row[5] * 2  -- column mix of (v2, v6, vA, vE)
  2904.                v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2905.                v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
  2906.                v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2907.                vE_lo, vE_hi = vE_hi ~ v2_hi, vE_lo ~ v2_lo
  2908.                vA_lo = vA_lo % 2^32 + vE_lo % 2^32
  2909.                vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
  2910.                vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2911.                v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
  2912.                v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
  2913.                k = row[6] * 2
  2914.                v2_lo = v2_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2915.                v2_hi = v2_hi + v6_hi + floor(v2_lo / 2^32) + W[k]
  2916.                v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2917.                vE_lo, vE_hi = vE_lo ~ v2_lo, vE_hi ~ v2_hi
  2918.                vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
  2919.                vA_lo = vA_lo % 2^32 + vE_lo % 2^32
  2920.                vA_hi = vA_hi + vE_hi + floor(vA_lo / 2^32)
  2921.                vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2922.                v6_lo, v6_hi = v6_lo ~ vA_lo, v6_hi ~ vA_hi
  2923.                v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
  2924.                k = row[7] * 2  -- column mix of (v3, v7, vB, vF)
  2925.                v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2926.                v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
  2927.                v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2928.                vF_lo, vF_hi = vF_hi ~ v3_hi, vF_lo ~ v3_lo
  2929.                vB_lo = vB_lo % 2^32 + vF_lo % 2^32
  2930.                vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
  2931.                vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2932.                v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
  2933.                v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
  2934.                k = row[8] * 2
  2935.                v3_lo = v3_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2936.                v3_hi = v3_hi + v7_hi + floor(v3_lo / 2^32) + W[k]
  2937.                v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  2938.                vF_lo, vF_hi = vF_lo ~ v3_lo, vF_hi ~ v3_hi
  2939.                vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
  2940.                vB_lo = vB_lo % 2^32 + vF_lo % 2^32
  2941.                vB_hi = vB_hi + vF_hi + floor(vB_lo / 2^32)
  2942.                vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2943.                v7_lo, v7_hi = v7_lo ~ vB_lo, v7_hi ~ vB_hi
  2944.                v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
  2945.                k = row[9] * 2  -- diagonal mix of (v0, v5, vA, vF)
  2946.                v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2947.                v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
  2948.                v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2949.                vF_lo, vF_hi = vF_hi ~ v0_hi, vF_lo ~ v0_lo
  2950.                vA_lo = vA_lo % 2^32 + vF_lo % 2^32
  2951.                vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
  2952.                vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2953.                v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
  2954.                v5_lo, v5_hi = v5_lo >> 24 | v5_hi << 8, v5_hi >> 24 | v5_lo << 8
  2955.                k = row[10] * 2
  2956.                v0_lo = v0_lo % 2^32 + v5_lo % 2^32 + W[k-1] % 2^32
  2957.                v0_hi = v0_hi + v5_hi + floor(v0_lo / 2^32) + W[k]
  2958.                v0_lo = 0|((v0_lo + 2^31) % 2^32 - 2^31)
  2959.                vF_lo, vF_hi = vF_lo ~ v0_lo, vF_hi ~ v0_hi
  2960.                vF_lo, vF_hi = vF_lo >> 16 | vF_hi << 16, vF_hi >> 16 | vF_lo << 16
  2961.                vA_lo = vA_lo % 2^32 + vF_lo % 2^32
  2962.                vA_hi = vA_hi + vF_hi + floor(vA_lo / 2^32)
  2963.                vA_lo = 0|((vA_lo + 2^31) % 2^32 - 2^31)
  2964.                v5_lo, v5_hi = v5_lo ~ vA_lo, v5_hi ~ vA_hi
  2965.                v5_lo, v5_hi = v5_lo << 1 | v5_hi >> 31, v5_hi << 1 | v5_lo >> 31
  2966.                k = row[11] * 2  -- diagonal mix of (v1, v6, vB, vC)
  2967.                v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2968.                v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
  2969.                v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2970.                vC_lo, vC_hi = vC_hi ~ v1_hi, vC_lo ~ v1_lo
  2971.                vB_lo = vB_lo % 2^32 + vC_lo % 2^32
  2972.                vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
  2973.                vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2974.                v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
  2975.                v6_lo, v6_hi = v6_lo >> 24 | v6_hi << 8, v6_hi >> 24 | v6_lo << 8
  2976.                k = row[12] * 2
  2977.                v1_lo = v1_lo % 2^32 + v6_lo % 2^32 + W[k-1] % 2^32
  2978.                v1_hi = v1_hi + v6_hi + floor(v1_lo / 2^32) + W[k]
  2979.                v1_lo = 0|((v1_lo + 2^31) % 2^32 - 2^31)
  2980.                vC_lo, vC_hi = vC_lo ~ v1_lo, vC_hi ~ v1_hi
  2981.                vC_lo, vC_hi = vC_lo >> 16 | vC_hi << 16, vC_hi >> 16 | vC_lo << 16
  2982.                vB_lo = vB_lo % 2^32 + vC_lo % 2^32
  2983.                vB_hi = vB_hi + vC_hi + floor(vB_lo / 2^32)
  2984.                vB_lo = 0|((vB_lo + 2^31) % 2^32 - 2^31)
  2985.                v6_lo, v6_hi = v6_lo ~ vB_lo, v6_hi ~ vB_hi
  2986.                v6_lo, v6_hi = v6_lo << 1 | v6_hi >> 31, v6_hi << 1 | v6_lo >> 31
  2987.                k = row[13] * 2  -- diagonal mix of (v2, v7, v8, vD)
  2988.                v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2989.                v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
  2990.                v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  2991.                vD_lo, vD_hi = vD_hi ~ v2_hi, vD_lo ~ v2_lo
  2992.                v8_lo = v8_lo % 2^32 + vD_lo % 2^32
  2993.                v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
  2994.                v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  2995.                v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
  2996.                v7_lo, v7_hi = v7_lo >> 24 | v7_hi << 8, v7_hi >> 24 | v7_lo << 8
  2997.                k = row[14] * 2
  2998.                v2_lo = v2_lo % 2^32 + v7_lo % 2^32 + W[k-1] % 2^32
  2999.                v2_hi = v2_hi + v7_hi + floor(v2_lo / 2^32) + W[k]
  3000.                v2_lo = 0|((v2_lo + 2^31) % 2^32 - 2^31)
  3001.                vD_lo, vD_hi = vD_lo ~ v2_lo, vD_hi ~ v2_hi
  3002.                vD_lo, vD_hi = vD_lo >> 16 | vD_hi << 16, vD_hi >> 16 | vD_lo << 16
  3003.                v8_lo = v8_lo % 2^32 + vD_lo % 2^32
  3004.                v8_hi = v8_hi + vD_hi + floor(v8_lo / 2^32)
  3005.                v8_lo = 0|((v8_lo + 2^31) % 2^32 - 2^31)
  3006.                v7_lo, v7_hi = v7_lo ~ v8_lo, v7_hi ~ v8_hi
  3007.                v7_lo, v7_hi = v7_lo << 1 | v7_hi >> 31, v7_hi << 1 | v7_lo >> 31
  3008.                k = row[15] * 2  -- diagonal mix of (v3, v4, v9, vE)
  3009.                v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  3010.                v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
  3011.                v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  3012.                vE_lo, vE_hi = vE_hi ~ v3_hi, vE_lo ~ v3_lo
  3013.                v9_lo = v9_lo % 2^32 + vE_lo % 2^32
  3014.                v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
  3015.                v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  3016.                v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
  3017.                v4_lo, v4_hi = v4_lo >> 24 | v4_hi << 8, v4_hi >> 24 | v4_lo << 8
  3018.                k = row[16] * 2
  3019.                v3_lo = v3_lo % 2^32 + v4_lo % 2^32 + W[k-1] % 2^32
  3020.                v3_hi = v3_hi + v4_hi + floor(v3_lo / 2^32) + W[k]
  3021.                v3_lo = 0|((v3_lo + 2^31) % 2^32 - 2^31)
  3022.                vE_lo, vE_hi = vE_lo ~ v3_lo, vE_hi ~ v3_hi
  3023.                vE_lo, vE_hi = vE_lo >> 16 | vE_hi << 16, vE_hi >> 16 | vE_lo << 16
  3024.                v9_lo = v9_lo % 2^32 + vE_lo % 2^32
  3025.                v9_hi = v9_hi + vE_hi + floor(v9_lo / 2^32)
  3026.                v9_lo = 0|((v9_lo + 2^31) % 2^32 - 2^31)
  3027.                v4_lo, v4_hi = v4_lo ~ v9_lo, v4_hi ~ v9_hi
  3028.                v4_lo, v4_hi = v4_lo << 1 | v4_hi >> 31, v4_hi << 1 | v4_lo >> 31
  3029.             end
  3030.             h1_lo = h1_lo ~ v0_lo ~ v8_lo  -- fold both halves of the work vector back into the state (low halves first, then high halves)
  3031.             h2_lo = h2_lo ~ v1_lo ~ v9_lo
  3032.             h3_lo = h3_lo ~ v2_lo ~ vA_lo
  3033.             h4_lo = h4_lo ~ v3_lo ~ vB_lo
  3034.             h5_lo = h5_lo ~ v4_lo ~ vC_lo
  3035.             h6_lo = h6_lo ~ v5_lo ~ vD_lo
  3036.             h7_lo = h7_lo ~ v6_lo ~ vE_lo
  3037.             h8_lo = h8_lo ~ v7_lo ~ vF_lo
  3038.             h1_hi = h1_hi ~ v0_hi ~ v8_hi
  3039.             h2_hi = h2_hi ~ v1_hi ~ v9_hi
  3040.             h3_hi = h3_hi ~ v2_hi ~ vA_hi
  3041.             h4_hi = h4_hi ~ v3_hi ~ vB_hi
  3042.             h5_hi = h5_hi ~ v4_hi ~ vC_hi
  3043.             h6_hi = h6_hi ~ v5_hi ~ vD_hi
  3044.             h7_hi = h7_hi ~ v6_hi ~ vE_hi
  3045.             h8_hi = h8_hi ~ v7_hi ~ vF_hi
  3046.          end
  3047.          H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo  -- write the state back only once, after all blocks
  3048.          H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3049.          return bytes_compressed
  3050.       end
  3051.  
  3052.       local function blake3_feed_64(str, offs, size, flags, chunk_index, H_in, H_out, wide_output, block_length)
                 -- Compresses the 64-byte blocks located in str at byte positions offs+1 .. offs+size, starting from
                 -- the eight-word chaining value H_in[1..8], and stores the resulting eight words in H_out[1..8].
                 -- H_out defaults to H_in (in-place update) and block_length defaults to 64.
                 -- When str is nil/false nothing is unpacked: the 16 words already stored in common_W are used.
                 -- The same flags, chunk_index and block_length are applied to every block handled by one call.
                 -- When wide_output is truthy, H_out[9..16] additionally receive eight more output words.
                 -- Nothing is returned.
  3053.          -- offs >= 0, size >= 0, size is multiple of 64
  3054.          block_length = block_length or 64
  3055.          local W = common_W  -- shared scratch table holding the 16 message words of the current block
  3056.          local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
  3057.          H_out = H_out or H_in
  3058.          for pos = offs + 1, offs + size, 64 do
  3059.             if str then
  3060.                W[1], W[2], W[3], W[4], W[5], W[6], W[7], W[8], W[9], W[10], W[11], W[12], W[13], W[14], W[15], W[16] =
  3061.                   string_unpack("<i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4i4", str, pos)  -- 16 little-endian signed 32-bit words
  3062.             end
  3063.             local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8  -- first half of the work vector = chaining value
  3064.             local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]  -- only the first four words of sha2_H_hi (defined outside this block) are used here
  3065.             local t0 = chunk_index % 2^32         -- t0 = low_4_bytes(chunk_index)
  3066.             local t1 = (chunk_index - t0) / 2^32  -- t1 = high_4_bytes(chunk_index)
  3067.             t0 = (t0 + 2^31) % 2^32 - 2^31  -- convert to int32 range (-2^31)..(2^31-1) to avoid "number has no integer representation" error while ORing
  3068.             local vC, vD, vE, vF = 0|t0, 0|t1, block_length, flags  -- last four work words: counter low, counter high, block length, flags (0| converts the float halves to integers)
  3069.             for j = 1, 7 do  -- 7 rounds, each made of 4 column mixes followed by 4 diagonal mixes
                         -- perm_blake3 is defined outside this block; it is indexed with j plus a fixed offset per
                         -- mixing step, so every round reads the message words in a different order.
  3070.                v0 = v0 + v4 + W[perm_blake3[j]]  -- column mix of (v0, v4, v8, vC)
  3071.                vC = vC ~ v0
  3072.                vC = vC >> 16 | vC << 16  -- rotate right by 16; the shift pairs act as rotations only with 32-bit integers (NOTE(review): the int32 comment above suggests that is the case here - confirm)
  3073.                v8 = v8 + vC
  3074.                v4 = v4 ~ v8
  3075.                v4 = v4 >> 12 | v4 << 20  -- rotate right by 12
  3076.                v0 = v0 + v4 + W[perm_blake3[j + 14]]
  3077.                vC = vC ~ v0
  3078.                vC = vC >> 8 | vC << 24  -- rotate right by 8
  3079.                v8 = v8 + vC
  3080.                v4 = v4 ~ v8
  3081.                v4 = v4 >> 7 | v4 << 25  -- rotate right by 7
  3082.                v1 = v1 + v5 + W[perm_blake3[j + 1]]  -- column mix of (v1, v5, v9, vD)
  3083.                vD = vD ~ v1
  3084.                vD = vD >> 16 | vD << 16
  3085.                v9 = v9 + vD
  3086.                v5 = v5 ~ v9
  3087.                v5 = v5 >> 12 | v5 << 20
  3088.                v1 = v1 + v5 + W[perm_blake3[j + 2]]
  3089.                vD = vD ~ v1
  3090.                vD = vD >> 8 | vD << 24
  3091.                v9 = v9 + vD
  3092.                v5 = v5 ~ v9
  3093.                v5 = v5 >> 7 | v5 << 25
  3094.                v2 = v2 + v6 + W[perm_blake3[j + 16]]  -- column mix of (v2, v6, vA, vE)
  3095.                vE = vE ~ v2
  3096.                vE = vE >> 16 | vE << 16
  3097.                vA = vA + vE
  3098.                v6 = v6 ~ vA
  3099.                v6 = v6 >> 12 | v6 << 20
  3100.                v2 = v2 + v6 + W[perm_blake3[j + 7]]
  3101.                vE = vE ~ v2
  3102.                vE = vE >> 8 | vE << 24
  3103.                vA = vA + vE
  3104.                v6 = v6 ~ vA
  3105.                v6 = v6 >> 7 | v6 << 25
  3106.                v3 = v3 + v7 + W[perm_blake3[j + 15]]  -- column mix of (v3, v7, vB, vF)
  3107.                vF = vF ~ v3
  3108.                vF = vF >> 16 | vF << 16
  3109.                vB = vB + vF
  3110.                v7 = v7 ~ vB
  3111.                v7 = v7 >> 12 | v7 << 20
  3112.                v3 = v3 + v7 + W[perm_blake3[j + 17]]
  3113.                vF = vF ~ v3
  3114.                vF = vF >> 8 | vF << 24
  3115.                vB = vB + vF
  3116.                v7 = v7 ~ vB
  3117.                v7 = v7 >> 7 | v7 << 25
  3118.                v0 = v0 + v5 + W[perm_blake3[j + 21]]  -- diagonal mix of (v0, v5, vA, vF)
  3119.                vF = vF ~ v0
  3120.                vF = vF >> 16 | vF << 16
  3121.                vA = vA + vF
  3122.                v5 = v5 ~ vA
  3123.                v5 = v5 >> 12 | v5 << 20
  3124.                v0 = v0 + v5 + W[perm_blake3[j + 5]]
  3125.                vF = vF ~ v0
  3126.                vF = vF >> 8 | vF << 24
  3127.                vA = vA + vF
  3128.                v5 = v5 ~ vA
  3129.                v5 = v5 >> 7 | v5 << 25
  3130.                v1 = v1 + v6 + W[perm_blake3[j + 3]]  -- diagonal mix of (v1, v6, vB, vC)
  3131.                vC = vC ~ v1
  3132.                vC = vC >> 16 | vC << 16
  3133.                vB = vB + vC
  3134.                v6 = v6 ~ vB
  3135.                v6 = v6 >> 12 | v6 << 20
  3136.                v1 = v1 + v6 + W[perm_blake3[j + 6]]
  3137.                vC = vC ~ v1
  3138.                vC = vC >> 8 | vC << 24
  3139.                vB = vB + vC
  3140.                v6 = v6 ~ vB
  3141.                v6 = v6 >> 7 | v6 << 25
  3142.                v2 = v2 + v7 + W[perm_blake3[j + 4]]  -- diagonal mix of (v2, v7, v8, vD)
  3143.                vD = vD ~ v2
  3144.                vD = vD >> 16 | vD << 16
  3145.                v8 = v8 + vD
  3146.                v7 = v7 ~ v8
  3147.                v7 = v7 >> 12 | v7 << 20
  3148.                v2 = v2 + v7 + W[perm_blake3[j + 18]]
  3149.                vD = vD ~ v2
  3150.                vD = vD >> 8 | vD << 24
  3151.                v8 = v8 + vD
  3152.                v7 = v7 ~ v8
  3153.                v7 = v7 >> 7 | v7 << 25
  3154.                v3 = v3 + v4 + W[perm_blake3[j + 19]]  -- diagonal mix of (v3, v4, v9, vE)
  3155.                vE = vE ~ v3
  3156.                vE = vE >> 16 | vE << 16
  3157.                v9 = v9 + vE
  3158.                v4 = v4 ~ v9
  3159.                v4 = v4 >> 12 | v4 << 20
  3160.                v3 = v3 + v4 + W[perm_blake3[j + 20]]
  3161.                vE = vE ~ v3
  3162.                vE = vE >> 8 | vE << 24
  3163.                v9 = v9 + vE
  3164.                v4 = v4 ~ v9
  3165.                v4 = v4 >> 7 | v4 << 25
  3166.             end
  3167.             if wide_output then  -- must run before h1..h8 are overwritten: it uses the chaining value that entered this block
  3168.                H_out[ 9] = h1 ~ v8
  3169.                H_out[10] = h2 ~ v9
  3170.                H_out[11] = h3 ~ vA
  3171.                H_out[12] = h4 ~ vB
  3172.                H_out[13] = h5 ~ vC
  3173.                H_out[14] = h6 ~ vD
  3174.                H_out[15] = h7 ~ vE
  3175.                H_out[16] = h8 ~ vF
  3176.             end
  3177.             h1 = v0 ~ v8  -- new chaining value: first half of the work vector XOR second half
  3178.             h2 = v1 ~ v9
  3179.             h3 = v2 ~ vA
  3180.             h4 = v3 ~ vB
  3181.             h5 = v4 ~ vC
  3182.             h6 = v5 ~ vD
  3183.             h7 = v6 ~ vE
  3184.             h8 = v7 ~ vF
  3185.          end
  3186.          H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8  -- stored once, after all blocks
  3187.       end
  3188.  
  3189.       return XORA5, XOR_BYTE, sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed, blake2s_feed_64, blake2b_feed_128, blake3_feed_64
  3190.    ]=](md5_next_shift, md5_K, sha2_K_lo, sha2_K_hi, build_keccak_format, sha3_RC_lo, sha3_RC_hi, sigma, common_W, sha2_H_lo, sha2_H_hi, perm_blake3)
  3191.  
  3192. end
  3193.  
  3194. XOR = XOR or XORA5  -- keep XOR if it already holds a value; otherwise fall back to XORA5
  3195.  
  3196. if branch == "LIB32" or branch == "EMUL" then
  3197.  
  3198.  
  3199.    -- implementation for Lua 5.1/5.2 (with or without bitwise library available)
  3200.  
  3201.    function sha256_feed_64(H, str, offs, size)
              -- Compresses the 64-byte blocks located in str at byte positions offs+1 .. offs+size into the
              -- eight-word state table H (updated in place).  Nothing is returned.
              -- Only plain arithmetic plus the AND/XOR helpers (defined elsewhere in the file) are used.
              -- Rotation trick used below: for an integer 0 <= a < 2^32 and x = a / 2^n,
              --    x % 1 * (2^32 - 1) + x  ==  a rotated right by n bits
              -- (the fractional part of x carries the n low bits, which get moved to the top of the word).
  3202.       -- offs >= 0, size >= 0, size is multiple of 64
  3203.       local W, K = common_W, sha2_K_hi  -- W: message schedule scratch table, K: per-round constants (both defined outside this block)
  3204.       local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  3205.       for pos = offs, offs + size - 1, 64 do
  3206.          for j = 1, 16 do
  3207.             pos = pos + 4  -- pos now points at the last byte of the next 4-byte word
  3208.             local a, b, c, d = byte(str, pos - 3, pos)
  3209.             W[j] = ((a * 256 + b) * 256 + c) * 256 + d  -- big-endian 32-bit word
  3210.          end
  3211.          for j = 17, 64 do  -- extend the 16 message words to 64 schedule words
  3212.             local a, b = W[j-15], W[j-2]
  3213.             local a7, a18, b17, b19 = a / 2^7, a / 2^18, b / 2^17, b / 2^19
  3214.             W[j] = (XOR(a7 % 1 * (2^32 - 1) + a7, a18 % 1 * (2^32 - 1) + a18, (a - a % 2^3) / 2^3) + W[j-16] + W[j-7]  -- rotr(a,7) xor rotr(a,18) xor (a >> 3)
  3215.                + XOR(b17 % 1 * (2^32 - 1) + b17, b19 % 1 * (2^32 - 1) + b19, (b - b % 2^10) / 2^10)) % 2^32  -- rotr(b,17) xor rotr(b,19) xor (b >> 10)
  3216.          end
  3217.          local a, b, c, d, e, f, g, h = h1, h2, h3, h4, h5, h6, h7, h8
  3218.          for j = 1, 64 do
  3219.             e = e % 2^32  -- e was left unreduced by the previous round (e = z + d)
  3220.             local e6, e11, e7 = e / 2^6, e / 2^11, e * 2^7
  3221.             local e7_lo = e7 % 2^32
  3222.             local z = AND(e, f) + AND(-1-e, g) + h + K[j] + W[j]  -- the two AND terms have no bits in common, so + acts like xor; -1-e is the two's-complement NOT of e (AND is assumed to accept it)
  3223.                + XOR(e6 % 1 * (2^32 - 1) + e6, e11 % 1 * (2^32 - 1) + e11, e7_lo + (e7 - e7_lo) / 2^32)  -- rotr(e,6) xor rotr(e,11) xor rotl(e,7), the last one being rotr(e,25)
  3224.             h = g
  3225.             g = f
  3226.             f = e
  3227.             e = z + d
  3228.             d = c
  3229.             c = b
  3230.             b = a % 2^32  -- a is reduced here, when it becomes b
  3231.             local b2, b13, b10 = b / 2^2, b / 2^13, b * 2^10
  3232.             local b10_lo = b10 % 2^32
  3233.             a = z + AND(d, c) + AND(b, XOR(d, c)) +  -- majority of (b, c, d) built from two AND terms that have no bits in common
  3234.                XOR(b2 % 1 * (2^32 - 1) + b2, b13 % 1 * (2^32 - 1) + b13, b10_lo + (b10 - b10_lo) / 2^32)  -- rotr(b,2) xor rotr(b,13) xor rotl(b,10), the last one being rotr(b,22)
  3235.          end
  3236.          h1, h2, h3, h4 = (a + h1) % 2^32, (b + h2) % 2^32, (c + h3) % 2^32, (d + h4) % 2^32  -- add the working variables back into the state, modulo 2^32
  3237.          h5, h6, h7, h8 = (e + h5) % 2^32, (f + h6) % 2^32, (g + h7) % 2^32, (h + h8) % 2^32
  3238.       end
  3239.       H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8  -- write the state back only once, after all blocks
  3240.    end
  3241.  
  3242.  
  3243.    function sha512_feed_128(H_lo, H_hi, str, offs, size)
              -- Compresses `size` bytes of `str` (the bytes following 0-based offset `offs`) in 128-byte blocks.
              -- Every 64-bit quantity is handled as two 32-bit halves held in ordinary Lua numbers.
              -- H_lo[1..8] / H_hi[1..8] hold the low / high halves of the eight state words; they are
              -- read on entry and overwritten on exit.  Nothing is returned.
  3244.       -- offs >= 0, size >= 0, size is multiple of 128
  3245.       -- W1_hi, W1_lo, W2_hi, W2_lo, ...   Wk_hi = W[2*k-1], Wk_lo = W[2*k]
              -- W is the shared scratch array common_W; K_lo[j] / K_hi[j] are the halves of the constant added in round j
              -- (all three tables are defined elsewhere in the file).
  3246.       local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
  3247.       local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  3248.       local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  3249.       for pos = offs, offs + size - 1, 128 do
                 -- Load 32 big-endian 32-bit words (16 64-bit words, high half first).
                 -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter,
                 -- so the outer stepping by 128 is not disturbed.
  3250.          for j = 1, 16*2 do
  3251.             pos = pos + 4
  3252.             local a, b, c, d = byte(str, pos - 3, pos)
  3253.             W[j] = ((a * 256 + b) * 256 + c) * 256 + d
  3254.          end
                 -- Expand the schedule to 80 64-bit words; for word k = jj/2 the halves are W[jj-1] (hi) and W[jj] (lo).
  3255.          for jj = 17*2, 80*2, 2 do
                    -- a = word k-15, b = word k-2
  3256.             local a_hi, a_lo, b_hi, b_lo = W[jj-31], W[jj-30], W[jj-5], W[jj-4]
                    -- Remainders that split each half at the rotation / shift boundaries used below.
  3257.             local b_hi_6, b_hi_19, b_hi_29, b_lo_19, b_lo_29, a_hi_1, a_hi_7, a_hi_8, a_lo_1, a_lo_8 =
  3258.                b_hi % 2^6, b_hi % 2^19, b_hi % 2^29, b_lo % 2^19, b_lo % 2^29, a_hi % 2^1, a_hi % 2^7, a_hi % 2^8, a_lo % 2^1, a_lo % 2^8
                    -- Low half of: (a ror 1  xor  a ror 8  xor  a shr 7) + (b ror 19  xor  b ror 61  xor  b shr 6) + word k-7 + word k-16.
                    -- tmp1 is not reduced, so anything above 2^32 is the carry into the high half.
  3259.             local tmp1 = XOR((a_lo - a_lo_1) / 2^1 + a_hi_1 * 2^31, (a_lo - a_lo_8) / 2^8 + a_hi_8 * 2^24, (a_lo - a_lo % 2^7) / 2^7 + a_hi_7 * 2^25) % 2^32
  3260.                + XOR((b_lo - b_lo_19) / 2^19 + b_hi_19 * 2^13, b_lo_29 * 2^3 + (b_hi - b_hi_29) / 2^29, (b_lo - b_lo % 2^6) / 2^6 + b_hi_6 * 2^26) % 2^32
  3261.                + W[jj-14] + W[jj-32]
  3262.             local tmp2 = tmp1 % 2^32
                    -- High half of the same sum, plus the carry (tmp1 - tmp2) / 2^32 from the low half.
  3263.             W[jj-1] = (XOR((a_hi - a_hi_1) / 2^1 + a_lo_1 * 2^31, (a_hi - a_hi_8) / 2^8 + a_lo_8 * 2^24, (a_hi - a_hi_7) / 2^7)
  3264.                + XOR((b_hi - b_hi_19) / 2^19 + b_lo_19 * 2^13, b_hi_29 * 2^3 + (b_lo - b_lo_29) / 2^29, (b_hi - b_hi_6) / 2^6)
  3265.                + W[jj-15] + W[jj-33] + (tmp1 - tmp2) / 2^32) % 2^32
  3266.             W[jj] = tmp2
  3267.          end
                 -- Working variables a..h start as a copy of the current state.
  3268.          local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3269.          local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3270.          for j = 1, 80 do
  3271.             local jj = 2*j
  3272.             local e_lo_9, e_lo_14, e_lo_18, e_hi_9, e_hi_14, e_hi_18 = e_lo % 2^9, e_lo % 2^14, e_lo % 2^18, e_hi % 2^9, e_hi % 2^14, e_hi % 2^18
                    -- Low half of z = ((e AND f) + (NOT e AND g)) + h + K[j] + W[j] + (e ror 14  xor  e ror 18  xor  e ror 41).
                    -- The two AND terms are bitwise disjoint, so their sum equals their OR.
                    -- -1-x stands in for bitwise NOT of x (presumably AND reduces its operands modulo 2^32 -- AND is defined elsewhere).
  3273.             local tmp1 = (AND(e_lo, f_lo) + AND(-1-e_lo, g_lo)) % 2^32 + h_lo + K_lo[j] + W[jj]
  3274.                + XOR((e_lo - e_lo_14) / 2^14 + e_hi_14 * 2^18, (e_lo - e_lo_18) / 2^18 + e_hi_18 * 2^14, e_lo_9 * 2^23 + (e_hi - e_hi_9) / 2^9) % 2^32
  3275.             local z_lo = tmp1 % 2^32
                    -- High half of z including the carry from the low half; left unreduced here,
                    -- the % 2^32 is applied where e_hi and a_hi are formed.
  3276.             local z_hi = AND(e_hi, f_hi) + AND(-1-e_hi, g_hi) + h_hi + K_hi[j] + W[jj-1] + (tmp1 - z_lo) / 2^32
  3277.                + XOR((e_hi - e_hi_14) / 2^14 + e_lo_14 * 2^18, (e_hi - e_hi_18) / 2^18 + e_lo_18 * 2^14, e_hi_9 * 2^23 + (e_lo - e_lo_9) / 2^9)
  3278.             h_lo = g_lo;  h_hi = g_hi
  3279.             g_lo = f_lo;  g_hi = f_hi
  3280.             f_lo = e_lo;  f_hi = e_hi
                    -- e = d + z (64-bit add: low half first, then carry into the high half)
  3281.             tmp1 = z_lo + d_lo
  3282.             e_lo = tmp1 % 2^32
  3283.             e_hi = (z_hi + d_hi + (tmp1 - e_lo) / 2^32) % 2^32
  3284.             d_lo = c_lo;  d_hi = c_hi
  3285.             c_lo = b_lo;  c_hi = b_hi
  3286.             b_lo = a_lo;  b_hi = a_hi
                    -- a, b, c have just been shifted into b, c, d, so b / c / d below are the previous a / b / c.
  3287.             local b_lo_2, b_lo_7, b_lo_28, b_hi_2, b_hi_7, b_hi_28 = b_lo % 2^2, b_lo % 2^7, b_lo % 2^28, b_hi % 2^2, b_hi % 2^7, b_hi % 2^28
                    -- a = z + ((d AND c) + (b AND (d xor c))) + (b ror 28  xor  b ror 34  xor  b ror 39); low half first.
  3288.             tmp1 = z_lo + (AND(d_lo, c_lo) + AND(b_lo, XOR(d_lo, c_lo))) % 2^32
  3289.                + XOR((b_lo - b_lo_28) / 2^28 + b_hi_28 * 2^4, b_lo_2 * 2^30 + (b_hi - b_hi_2) / 2^2, b_lo_7 * 2^25 + (b_hi - b_hi_7) / 2^7) % 2^32
  3290.             a_lo = tmp1 % 2^32
  3291.             a_hi = (z_hi + AND(d_hi, c_hi) + AND(b_hi, XOR(d_hi, c_hi)) + (tmp1 - a_lo) / 2^32
  3292.                + XOR((b_hi - b_hi_28) / 2^28 + b_lo_28 * 2^4, b_hi_2 * 2^30 + (b_lo - b_lo_2) / 2^2, b_hi_7 * 2^25 + (b_lo - b_lo_7) / 2^7)) % 2^32
  3293.          end
                 -- Add the working variables into the state: low halves are summed first and the overflow
                 -- is carried into the high half.  a_lo is reused as a scratch variable for each low-half sum.
  3294.          a_lo = h1_lo + a_lo
  3295.          h1_lo = a_lo % 2^32
  3296.          h1_hi = (h1_hi + a_hi + (a_lo - h1_lo) / 2^32) % 2^32
  3297.          a_lo = h2_lo + b_lo
  3298.          h2_lo = a_lo % 2^32
  3299.          h2_hi = (h2_hi + b_hi + (a_lo - h2_lo) / 2^32) % 2^32
  3300.          a_lo = h3_lo + c_lo
  3301.          h3_lo = a_lo % 2^32
  3302.          h3_hi = (h3_hi + c_hi + (a_lo - h3_lo) / 2^32) % 2^32
  3303.          a_lo = h4_lo + d_lo
  3304.          h4_lo = a_lo % 2^32
  3305.          h4_hi = (h4_hi + d_hi + (a_lo - h4_lo) / 2^32) % 2^32
  3306.          a_lo = h5_lo + e_lo
  3307.          h5_lo = a_lo % 2^32
  3308.          h5_hi = (h5_hi + e_hi + (a_lo - h5_lo) / 2^32) % 2^32
  3309.          a_lo = h6_lo + f_lo
  3310.          h6_lo = a_lo % 2^32
  3311.          h6_hi = (h6_hi + f_hi + (a_lo - h6_lo) / 2^32) % 2^32
  3312.          a_lo = h7_lo + g_lo
  3313.          h7_lo = a_lo % 2^32
  3314.          h7_hi = (h7_hi + g_hi + (a_lo - h7_lo) / 2^32) % 2^32
  3315.          a_lo = h8_lo + h_lo
  3316.          h8_lo = a_lo % 2^32
  3317.          h8_hi = (h8_hi + h_hi + (a_lo - h8_lo) / 2^32) % 2^32
  3318.       end
              -- Store the updated state back into the caller's tables.
  3319.       H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8] = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3320.       H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8] = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3321.    end
  3322.  
  3323.  
  3324.    if branch == "LIB32" then
  3325.  
  3326.       function md5_feed_64(H, str, offs, size)
                 -- MD5 compression, variant selected when branch == "LIB32".
                 -- Processes `size` bytes of `str` (the bytes following 0-based offset `offs`) in 64-byte blocks
                 -- and updates the four state words H[1..4] in place.  Nothing is returned.
                 -- Bit operations go through AND / OR / XOR / ROR, which are defined elsewhere in the file.
  3327.          -- offs >= 0, size >= 0, size is multiple of 64
  3328.          local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  3329.          local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  3330.          for pos = offs, offs + size - 1, 64 do
                    -- Load 16 little-endian 32-bit words (d is the most significant byte).
                    -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter.
  3331.             for j = 1, 16 do
  3332.                pos = pos + 4
  3333.                local a, b, c, d = byte(str, pos - 3, pos)
  3334.                W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3335.             end
  3336.             local a, b, c, d = h1, h2, h3, h4
                    -- s is the rotate amount passed to ROR for the next step; md5_next_shift[s] yields the amount for the step after.
                    -- 25 = 32 - 7, so presumably ROR by s plays the role of a 32-bit rotate-left by 32 - s -- confirm against ROR's definition.
  3337.             local s = 25
                    -- Steps 1..16: mixing function (b AND c) + (NOT b AND d); the two terms are bitwise disjoint, so the sum equals their OR.
                    -- -1-x stands in for bitwise NOT of x.  The new b is not reduced modulo 2^32 here.
  3338.             for j = 1, 16 do
  3339.                local F = ROR(AND(b, c) + AND(-1-b, d) + a + K[j] + W[j], s) + b
  3340.                s = md5_next_shift[s]
  3341.                a = d
  3342.                d = c
  3343.                c = b
  3344.                b = F
  3345.             end
  3346.             s = 27
                    -- Steps 17..32: mixing function (d AND b) + (NOT d AND c); the message index advances by 5 words (mod 16) per step.
  3347.             for j = 17, 32 do
  3348.                local F = ROR(AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1], s) + b
  3349.                s = md5_next_shift[s]
  3350.                a = d
  3351.                d = c
  3352.                c = b
  3353.                b = F
  3354.             end
  3355.             s = 28
                    -- Steps 33..48: mixing function b xor c xor d; the message index advances by 3 words (mod 16) per step.
  3356.             for j = 33, 48 do
  3357.                local F = ROR(XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1], s) + b
  3358.                s = md5_next_shift[s]
  3359.                a = d
  3360.                d = c
  3361.                c = b
  3362.                b = F
  3363.             end
  3364.             s = 26
                    -- Steps 49..64: mixing function c xor (b OR NOT d); the message index advances by 7 words (mod 16) per step.
  3365.             for j = 49, 64 do
  3366.                local F = ROR(XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1], s) + b
  3367.                s = md5_next_shift[s]
  3368.                a = d
  3369.                d = c
  3370.                c = b
  3371.                b = F
  3372.             end
                    -- Add the block result into the running state, reducing to 32 bits.
  3373.             h1 = (a + h1) % 2^32
  3374.             h2 = (b + h2) % 2^32
  3375.             h3 = (c + h3) % 2^32
  3376.             h4 = (d + h4) % 2^32
  3377.          end
  3378.          H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  3379.       end
  3380.  
  3381.    elseif branch == "EMUL" then
  3382.  
  3383.       function md5_feed_64(H, str, offs, size)
                 -- MD5 compression, variant selected when branch == "EMUL".
                 -- Processes `size` bytes of `str` (the bytes following 0-based offset `offs`) in 64-byte blocks
                 -- and updates the four state words H[1..4] in place.  Nothing is returned.
                 -- Rotations are done with plain arithmetic; AND / OR / XOR are helpers defined elsewhere in the file.
  3384.          -- offs >= 0, size >= 0, size is multiple of 64
  3385.          local W, K, md5_next_shift = common_W, md5_K, md5_next_shift
  3386.          local h1, h2, h3, h4 = H[1], H[2], H[3], H[4]
  3387.          for pos = offs, offs + size - 1, 64 do
                    -- Load 16 little-endian 32-bit words (d is the most significant byte).
                    -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter.
  3388.             for j = 1, 16 do
  3389.                pos = pos + 4
  3390.                local a, b, c, d = byte(str, pos - 3, pos)
  3391.                W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3392.             end
  3393.             local a, b, c, d = h1, h2, h3, h4
                    -- s is the rotate-right amount for the next step; md5_next_shift[s] yields the amount for the step after.
                    -- Rotate-right by s (25 = 32 - 7) is the same 32-bit operation as rotate-left by 32 - s.
  3394.             local s = 25
                    -- Steps 1..16: mixing function (b AND c) + (NOT b AND d); -1-x stands in for bitwise NOT of x.
                    -- Rotation trick used in all four loops:
                    --    z = (32-bit sum) / 2^s    -- integer part = sum shifted right by s
                    --    y = z % 1                 -- fraction = the s bits that were shifted out
                    --    y * 2^32 + (z - y)        -- puts those bits back on top => rotate right by s
                    -- The new b (rotated value + old b) is not reduced modulo 2^32 here.
  3395.             for j = 1, 16 do
  3396.                local z = (AND(b, c) + AND(-1-b, d) + a + K[j] + W[j]) % 2^32 / 2^s
  3397.                local y = z % 1
  3398.                s = md5_next_shift[s]
  3399.                a = d
  3400.                d = c
  3401.                c = b
  3402.                b = y * 2^32 + (z - y) + b
  3403.             end
  3404.             s = 27
                    -- Steps 17..32: mixing function (d AND b) + (NOT d AND c); the message index advances by 5 words (mod 16) per step.
  3405.             for j = 17, 32 do
  3406.                local z = (AND(d, b) + AND(-1-d, c) + a + K[j] + W[(5*j-4) % 16 + 1]) % 2^32 / 2^s
  3407.                local y = z % 1
  3408.                s = md5_next_shift[s]
  3409.                a = d
  3410.                d = c
  3411.                c = b
  3412.                b = y * 2^32 + (z - y) + b
  3413.             end
  3414.             s = 28
                    -- Steps 33..48: mixing function b xor c xor d; the message index advances by 3 words (mod 16) per step.
  3415.             for j = 33, 48 do
  3416.                local z = (XOR(XOR(b, c), d) + a + K[j] + W[(3*j+2) % 16 + 1]) % 2^32 / 2^s
  3417.                local y = z % 1
  3418.                s = md5_next_shift[s]
  3419.                a = d
  3420.                d = c
  3421.                c = b
  3422.                b = y * 2^32 + (z - y) + b
  3423.             end
  3424.             s = 26
                    -- Steps 49..64: mixing function c xor (b OR NOT d); the message index advances by 7 words (mod 16) per step.
  3425.             for j = 49, 64 do
  3426.                local z = (XOR(c, OR(b, -1-d)) + a + K[j] + W[(j*7-7) % 16 + 1]) % 2^32 / 2^s
  3427.                local y = z % 1
  3428.                s = md5_next_shift[s]
  3429.                a = d
  3430.                d = c
  3431.                c = b
  3432.                b = y * 2^32 + (z - y) + b
  3433.             end
                    -- Add the block result into the running state, reducing to 32 bits.
  3434.             h1 = (a + h1) % 2^32
  3435.             h2 = (b + h2) % 2^32
  3436.             h3 = (c + h3) % 2^32
  3437.             h4 = (d + h4) % 2^32
  3438.          end
  3439.          H[1], H[2], H[3], H[4] = h1, h2, h3, h4
  3440.       end
  3441.  
  3442.    end
  3443.  
  3444.  
  3445.    function sha1_feed_64(H, str, offs, size)
              -- SHA-1 compression: processes `size` bytes of `str` (the bytes following 0-based offset `offs`)
              -- in 64-byte blocks and updates the five state words H[1..5] in place.  Nothing is returned.
              -- AND / XOR are helpers defined elsewhere in the file; rotations are done with plain arithmetic.
  3446.       -- offs >= 0, size >= 0, size is multiple of 64
  3447.       local W = common_W
  3448.       local h1, h2, h3, h4, h5 = H[1], H[2], H[3], H[4], H[5]
  3449.       for pos = offs, offs + size - 1, 64 do
                 -- Load 16 big-endian 32-bit words (a is the most significant byte).
                 -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter.
  3450.          for j = 1, 16 do
  3451.             pos = pos + 4
  3452.             local a, b, c, d = byte(str, pos - 3, pos)
  3453.             W[j] = ((a * 256 + b) * 256 + c) * 256 + d
  3454.          end
                 -- Expand to 80 words: W[j] = rotate-left-by-1 of (W[j-3] xor W[j-8] xor W[j-14] xor W[j-16]).
                 -- Doubling gives a 33-bit value a; b is its low 32 bits and (a - b) / 2^32 is the bit that wrapped around.
  3455.          for j = 17, 80 do
  3456.             local a = XOR(W[j-3], W[j-8], W[j-14], W[j-16]) % 2^32 * 2
  3457.             local b = a % 2^32
  3458.             W[j] = b + (a - b) / 2^32
  3459.          end
  3460.          local a, b, c, d, e = h1, h2, h3, h4, h5
                 -- All four 20-step loops share the same skeleton:
                 --    z + (a5 - z) / 2^32               -- rotate-left-by-5 of a (low 32 bits of a * 2^5 plus the 5 bits that overflowed)
                 --    c = b / 2^2; c % 1 * (2^32 - 1) + c   -- rotate-right-by-2 of b: the fraction (2 shifted-out bits) is moved to the top
                 --    a = z % 2^32                      -- the new a is the full sum reduced to 32 bits
                 -- Only the mixing function of b, c, d and the additive constant differ between the loops.
                 -- Steps 1..20: (b AND c) + (NOT b AND d); -1-x stands in for bitwise NOT of x.
  3461.          for j = 1, 20 do
  3462.             local a5 = a * 2^5
  3463.             local z = a5 % 2^32
  3464.             z = z + (a5 - z) / 2^32 + AND(b, c) + AND(-1-b, d) + 0x5A827999 + W[j] + e        -- constant = floor(2^30 * sqrt(2))
  3465.             e = d
  3466.             d = c
  3467.             c = b / 2^2
  3468.             c = c % 1 * (2^32 - 1) + c
  3469.             b = a
  3470.             a = z % 2^32
  3471.          end
                 -- Steps 21..40: b xor c xor d.
  3472.          for j = 21, 40 do
  3473.             local a5 = a * 2^5
  3474.             local z = a5 % 2^32
  3475.             z = z + (a5 - z) / 2^32 + XOR(b, c, d) + 0x6ED9EBA1 + W[j] + e                    -- 2^30 * sqrt(3)
  3476.             e = d
  3477.             d = c
  3478.             c = b / 2^2
  3479.             c = c % 1 * (2^32 - 1) + c
  3480.             b = a
  3481.             a = z % 2^32
  3482.          end
                 -- Steps 41..60: (d AND c) + (b AND (d xor c)), i.e. the bitwise majority of b, c, d.
  3483.          for j = 41, 60 do
  3484.             local a5 = a * 2^5
  3485.             local z = a5 % 2^32
  3486.             z = z + (a5 - z) / 2^32 + AND(d, c) + AND(b, XOR(d, c)) + 0x8F1BBCDC + W[j] + e   -- 2^30 * sqrt(5)
  3487.             e = d
  3488.             d = c
  3489.             c = b / 2^2
  3490.             c = c % 1 * (2^32 - 1) + c
  3491.             b = a
  3492.             a = z % 2^32
  3493.          end
                 -- Steps 61..80: b xor c xor d again, with a different constant.
  3494.          for j = 61, 80 do
  3495.             local a5 = a * 2^5
  3496.             local z = a5 % 2^32
  3497.             z = z + (a5 - z) / 2^32 + XOR(b, c, d) + 0xCA62C1D6 + W[j] + e                    -- 2^30 * sqrt(10)
  3498.             e = d
  3499.             d = c
  3500.             c = b / 2^2
  3501.             c = c % 1 * (2^32 - 1) + c
  3502.             b = a
  3503.             a = z % 2^32
  3504.          end
                 -- Add the block result into the running state, reducing to 32 bits.
  3505.          h1 = (a + h1) % 2^32
  3506.          h2 = (b + h2) % 2^32
  3507.          h3 = (c + h3) % 2^32
  3508.          h4 = (d + h4) % 2^32
  3509.          h5 = (e + h5) % 2^32
  3510.       end
  3511.       H[1], H[2], H[3], H[4], H[5] = h1, h2, h3, h4, h5
  3512.    end
  3513.  
  3514.  
  3515.    function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
              -- Absorbs `size` bytes of `str` (the bytes following 0-based offset `offs`) into a 25-lane state,
              -- one block of `block_size_in_bytes` bytes at a time, running a 24-round permutation after each block.
              -- Each 64-bit lane k is held as two 32-bit halves, lanes_lo[k] and lanes_hi[k] (k = 1..25);
              -- both tables are updated in place.  Nothing is returned.
              -- AND / XOR are helpers defined elsewhere in the file; -1-x stands in for bitwise NOT of x.
  3516.       -- This is an example of a Lua function having 79 local variables :-)
  3517.       -- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
              -- RC_lo[i] / RC_hi[i] are the halves of the constant mixed into lane 1 at the end of round i (tables defined elsewhere).
  3518.       local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
              -- Number of 64-bit lanes covered by one input block.
  3519.       local qwords_qty = block_size_in_bytes / 8
  3520.       for pos = offs, offs + size - 1, block_size_in_bytes do
                 -- XOR the block into the first qwords_qty lanes.  Each lane takes 8 bytes, little-endian:
                 -- the first four bytes form the low half, the next four the high half.
                 -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter.
  3521.          for j = 1, qwords_qty do
  3522.             local a, b, c, d = byte(str, pos + 1, pos + 4)
  3523.             lanes_lo[j] = XOR(lanes_lo[j], ((d * 256 + c) * 256 + b) * 256 + a)
  3524.             pos = pos + 8
  3525.             a, b, c, d = byte(str, pos - 3, pos)
  3526.             lanes_hi[j] = XOR(lanes_hi[j], ((d * 256 + c) * 256 + b) * 256 + a)
  3527.          end
                 -- Cache all 25 lanes (50 halves) in locals for the duration of the permutation: Lnn_lo / Lnn_hi <-> lanes_lo[nn] / lanes_hi[nn].
  3528.          local L01_lo, L01_hi, L02_lo, L02_hi, L03_lo, L03_hi, L04_lo, L04_hi, L05_lo, L05_hi, L06_lo, L06_hi, L07_lo, L07_hi, L08_lo, L08_hi,
  3529.             L09_lo, L09_hi, L10_lo, L10_hi, L11_lo, L11_hi, L12_lo, L12_hi, L13_lo, L13_hi, L14_lo, L14_hi, L15_lo, L15_hi, L16_lo, L16_hi,
  3530.             L17_lo, L17_hi, L18_lo, L18_hi, L19_lo, L19_hi, L20_lo, L20_hi, L21_lo, L21_hi, L22_lo, L22_hi, L23_lo, L23_hi, L24_lo, L24_hi, L25_lo, L25_hi =
  3531.             lanes_lo[1], lanes_hi[1], lanes_lo[2], lanes_hi[2], lanes_lo[3], lanes_hi[3], lanes_lo[4], lanes_hi[4], lanes_lo[5], lanes_hi[5],
  3532.             lanes_lo[6], lanes_hi[6], lanes_lo[7], lanes_hi[7], lanes_lo[8], lanes_hi[8], lanes_lo[9], lanes_hi[9], lanes_lo[10], lanes_hi[10],
  3533.             lanes_lo[11], lanes_hi[11], lanes_lo[12], lanes_hi[12], lanes_lo[13], lanes_hi[13], lanes_lo[14], lanes_hi[14], lanes_lo[15], lanes_hi[15],
  3534.             lanes_lo[16], lanes_hi[16], lanes_lo[17], lanes_hi[17], lanes_lo[18], lanes_hi[18], lanes_lo[19], lanes_hi[19], lanes_lo[20], lanes_hi[20],
  3535.             lanes_lo[21], lanes_hi[21], lanes_lo[22], lanes_hi[22], lanes_lo[23], lanes_hi[23], lanes_lo[24], lanes_hi[24], lanes_lo[25], lanes_hi[25]
  3536.          for round_idx = 1, 24 do
                    -- Column parities: Ck = XOR of lanes k, k+5, k+10, k+15, k+20.
  3537.             local C1_lo = XOR(L01_lo, L06_lo, L11_lo, L16_lo, L21_lo)
  3538.             local C1_hi = XOR(L01_hi, L06_hi, L11_hi, L16_hi, L21_hi)
  3539.             local C2_lo = XOR(L02_lo, L07_lo, L12_lo, L17_lo, L22_lo)
  3540.             local C2_hi = XOR(L02_hi, L07_hi, L12_hi, L17_hi, L22_hi)
  3541.             local C3_lo = XOR(L03_lo, L08_lo, L13_lo, L18_lo, L23_lo)
  3542.             local C3_hi = XOR(L03_hi, L08_hi, L13_hi, L18_hi, L23_hi)
  3543.             local C4_lo = XOR(L04_lo, L09_lo, L14_lo, L19_lo, L24_lo)
  3544.             local C4_hi = XOR(L04_hi, L09_hi, L14_hi, L19_hi, L24_hi)
  3545.             local C5_lo = XOR(L05_lo, L10_lo, L15_lo, L20_lo, L25_lo)
  3546.             local C5_hi = XOR(L05_hi, L10_hi, L15_hi, L20_hi, L25_hi)
                    -- Column 2 (lanes 2, 7, 12, 17, 22): D = C1 xor (C3 rotated left by 1 as a 64-bit value).
                    -- T0..T4 = lane xor D; each T is then rotated by a lane-specific amount and written to a
                    -- (generally different) lane, so mixing, rotation and relocation are fused into single assignments.
                    -- A 64-bit rotation is built from the two halves: one half scaled up by 2^n plus the top bits of the other half.
                    -- Some results are left above 2^32 (e.g. T1_hi * 2^12); presumably XOR / AND discard the excess bits -- confirm
                    -- against their definitions.  Products with larger multipliers are reduced with % 2^32, presumably to keep
                    -- intermediate values exactly representable as doubles.
  3547.             local D_lo = XOR(C1_lo, C3_lo * 2 + (C3_hi % 2^32 - C3_hi % 2^31) / 2^31)
  3548.             local D_hi = XOR(C1_hi, C3_hi * 2 + (C3_lo % 2^32 - C3_lo % 2^31) / 2^31)
  3549.             local T0_lo = XOR(D_lo, L02_lo)
  3550.             local T0_hi = XOR(D_hi, L02_hi)
  3551.             local T1_lo = XOR(D_lo, L07_lo)
  3552.             local T1_hi = XOR(D_hi, L07_hi)
  3553.             local T2_lo = XOR(D_lo, L12_lo)
  3554.             local T2_hi = XOR(D_hi, L12_hi)
  3555.             local T3_lo = XOR(D_lo, L17_lo)
  3556.             local T3_hi = XOR(D_hi, L17_hi)
  3557.             local T4_lo = XOR(D_lo, L22_lo)
  3558.             local T4_hi = XOR(D_hi, L22_hi)
  3559.             L02_lo = (T1_lo % 2^32 - T1_lo % 2^20) / 2^20 + T1_hi * 2^12
  3560.             L02_hi = (T1_hi % 2^32 - T1_hi % 2^20) / 2^20 + T1_lo * 2^12
  3561.             L07_lo = (T3_lo % 2^32 - T3_lo % 2^19) / 2^19 + T3_hi * 2^13
  3562.             L07_hi = (T3_hi % 2^32 - T3_hi % 2^19) / 2^19 + T3_lo * 2^13
  3563.             L12_lo = T0_lo * 2 + (T0_hi % 2^32 - T0_hi % 2^31) / 2^31
  3564.             L12_hi = T0_hi * 2 + (T0_lo % 2^32 - T0_lo % 2^31) / 2^31
  3565.             L17_lo = T2_lo * 2^10 + (T2_hi % 2^32 - T2_hi % 2^22) / 2^22
  3566.             L17_hi = T2_hi * 2^10 + (T2_lo % 2^32 - T2_lo % 2^22) / 2^22
  3567.             L22_lo = T4_lo * 2^2 + (T4_hi % 2^32 - T4_hi % 2^30) / 2^30
  3568.             L22_hi = T4_hi * 2^2 + (T4_lo % 2^32 - T4_lo % 2^30) / 2^30
                    -- Column 3 (lanes 3, 8, 13, 18, 23): D = C2 xor (C4 rotated left by 1).
  3569.             D_lo = XOR(C2_lo, C4_lo * 2 + (C4_hi % 2^32 - C4_hi % 2^31) / 2^31)
  3570.             D_hi = XOR(C2_hi, C4_hi * 2 + (C4_lo % 2^32 - C4_lo % 2^31) / 2^31)
  3571.             T0_lo = XOR(D_lo, L03_lo)
  3572.             T0_hi = XOR(D_hi, L03_hi)
  3573.             T1_lo = XOR(D_lo, L08_lo)
  3574.             T1_hi = XOR(D_hi, L08_hi)
  3575.             T2_lo = XOR(D_lo, L13_lo)
  3576.             T2_hi = XOR(D_hi, L13_hi)
  3577.             T3_lo = XOR(D_lo, L18_lo)
  3578.             T3_hi = XOR(D_hi, L18_hi)
  3579.             T4_lo = XOR(D_lo, L23_lo)
  3580.             T4_hi = XOR(D_hi, L23_hi)
  3581.             L03_lo = (T2_lo % 2^32 - T2_lo % 2^21) / 2^21 + T2_hi * 2^11
  3582.             L03_hi = (T2_hi % 2^32 - T2_hi % 2^21) / 2^21 + T2_lo * 2^11
  3583.             L08_lo = (T4_lo % 2^32 - T4_lo % 2^3) / 2^3 + T4_hi * 2^29 % 2^32
  3584.             L08_hi = (T4_hi % 2^32 - T4_hi % 2^3) / 2^3 + T4_lo * 2^29 % 2^32
  3585.             L13_lo = T1_lo * 2^6 + (T1_hi % 2^32 - T1_hi % 2^26) / 2^26
  3586.             L13_hi = T1_hi * 2^6 + (T1_lo % 2^32 - T1_lo % 2^26) / 2^26
  3587.             L18_lo = T3_lo * 2^15 + (T3_hi % 2^32 - T3_hi % 2^17) / 2^17
  3588.             L18_hi = T3_hi * 2^15 + (T3_lo % 2^32 - T3_lo % 2^17) / 2^17
  3589.             L23_lo = (T0_lo % 2^32 - T0_lo % 2^2) / 2^2 + T0_hi * 2^30 % 2^32
  3590.             L23_hi = (T0_hi % 2^32 - T0_hi % 2^2) / 2^2 + T0_lo * 2^30 % 2^32
                    -- Column 4 (lanes 4, 9, 14, 19, 24): D = C3 xor (C5 rotated left by 1).
  3591.             D_lo = XOR(C3_lo, C5_lo * 2 + (C5_hi % 2^32 - C5_hi % 2^31) / 2^31)
  3592.             D_hi = XOR(C3_hi, C5_hi * 2 + (C5_lo % 2^32 - C5_lo % 2^31) / 2^31)
  3593.             T0_lo = XOR(D_lo, L04_lo)
  3594.             T0_hi = XOR(D_hi, L04_hi)
  3595.             T1_lo = XOR(D_lo, L09_lo)
  3596.             T1_hi = XOR(D_hi, L09_hi)
  3597.             T2_lo = XOR(D_lo, L14_lo)
  3598.             T2_hi = XOR(D_hi, L14_hi)
  3599.             T3_lo = XOR(D_lo, L19_lo)
  3600.             T3_hi = XOR(D_hi, L19_hi)
  3601.             T4_lo = XOR(D_lo, L24_lo)
  3602.             T4_hi = XOR(D_hi, L24_hi)
  3603.             L04_lo = T3_lo * 2^21 % 2^32 + (T3_hi % 2^32 - T3_hi % 2^11) / 2^11
  3604.             L04_hi = T3_hi * 2^21 % 2^32 + (T3_lo % 2^32 - T3_lo % 2^11) / 2^11
  3605.             L09_lo = T0_lo * 2^28 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^4) / 2^4
  3606.             L09_hi = T0_hi * 2^28 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^4) / 2^4
  3607.             L14_lo = T2_lo * 2^25 % 2^32 + (T2_hi % 2^32 - T2_hi % 2^7) / 2^7
  3608.             L14_hi = T2_hi * 2^25 % 2^32 + (T2_lo % 2^32 - T2_lo % 2^7) / 2^7
  3609.             L19_lo = (T4_lo % 2^32 - T4_lo % 2^8) / 2^8 + T4_hi * 2^24 % 2^32
  3610.             L19_hi = (T4_hi % 2^32 - T4_hi % 2^8) / 2^8 + T4_lo * 2^24 % 2^32
  3611.             L24_lo = (T1_lo % 2^32 - T1_lo % 2^9) / 2^9 + T1_hi * 2^23 % 2^32
  3612.             L24_hi = (T1_hi % 2^32 - T1_hi % 2^9) / 2^9 + T1_lo * 2^23 % 2^32
                    -- Column 5 (lanes 5, 10, 15, 20, 25): D = C4 xor (C1 rotated left by 1).
  3613.             D_lo = XOR(C4_lo, C1_lo * 2 + (C1_hi % 2^32 - C1_hi % 2^31) / 2^31)
  3614.             D_hi = XOR(C4_hi, C1_hi * 2 + (C1_lo % 2^32 - C1_lo % 2^31) / 2^31)
  3615.             T0_lo = XOR(D_lo, L05_lo)
  3616.             T0_hi = XOR(D_hi, L05_hi)
  3617.             T1_lo = XOR(D_lo, L10_lo)
  3618.             T1_hi = XOR(D_hi, L10_hi)
  3619.             T2_lo = XOR(D_lo, L15_lo)
  3620.             T2_hi = XOR(D_hi, L15_hi)
  3621.             T3_lo = XOR(D_lo, L20_lo)
  3622.             T3_hi = XOR(D_hi, L20_hi)
  3623.             T4_lo = XOR(D_lo, L25_lo)
  3624.             T4_hi = XOR(D_hi, L25_hi)
  3625.             L05_lo = T4_lo * 2^14 + (T4_hi % 2^32 - T4_hi % 2^18) / 2^18
  3626.             L05_hi = T4_hi * 2^14 + (T4_lo % 2^32 - T4_lo % 2^18) / 2^18
  3627.             L10_lo = T1_lo * 2^20 % 2^32 + (T1_hi % 2^32 - T1_hi % 2^12) / 2^12
  3628.             L10_hi = T1_hi * 2^20 % 2^32 + (T1_lo % 2^32 - T1_lo % 2^12) / 2^12
  3629.             L15_lo = T3_lo * 2^8 + (T3_hi % 2^32 - T3_hi % 2^24) / 2^24
  3630.             L15_hi = T3_hi * 2^8 + (T3_lo % 2^32 - T3_lo % 2^24) / 2^24
  3631.             L20_lo = T0_lo * 2^27 % 2^32 + (T0_hi % 2^32 - T0_hi % 2^5) / 2^5
  3632.             L20_hi = T0_hi * 2^27 % 2^32 + (T0_lo % 2^32 - T0_lo % 2^5) / 2^5
  3633.             L25_lo = (T2_lo % 2^32 - T2_lo % 2^25) / 2^25 + T2_hi * 2^7
  3634.             L25_hi = (T2_hi % 2^32 - T2_hi % 2^25) / 2^25 + T2_lo * 2^7
                    -- Column 1 (lanes 1, 6, 11, 16, 21): D = C5 xor (C2 rotated left by 1).
                    -- Lane 1 is only XORed with D (no rotation, no move); the other four are rotated and relocated as above.
  3635.             D_lo = XOR(C5_lo, C2_lo * 2 + (C2_hi % 2^32 - C2_hi % 2^31) / 2^31)
  3636.             D_hi = XOR(C5_hi, C2_hi * 2 + (C2_lo % 2^32 - C2_lo % 2^31) / 2^31)
  3637.             T1_lo = XOR(D_lo, L06_lo)
  3638.             T1_hi = XOR(D_hi, L06_hi)
  3639.             T2_lo = XOR(D_lo, L11_lo)
  3640.             T2_hi = XOR(D_hi, L11_hi)
  3641.             T3_lo = XOR(D_lo, L16_lo)
  3642.             T3_hi = XOR(D_hi, L16_hi)
  3643.             T4_lo = XOR(D_lo, L21_lo)
  3644.             T4_hi = XOR(D_hi, L21_hi)
  3645.             L06_lo = T2_lo * 2^3 + (T2_hi % 2^32 - T2_hi % 2^29) / 2^29
  3646.             L06_hi = T2_hi * 2^3 + (T2_lo % 2^32 - T2_lo % 2^29) / 2^29
  3647.             L11_lo = T4_lo * 2^18 + (T4_hi % 2^32 - T4_hi % 2^14) / 2^14
  3648.             L11_hi = T4_hi * 2^18 + (T4_lo % 2^32 - T4_lo % 2^14) / 2^14
  3649.             L16_lo = (T1_lo % 2^32 - T1_lo % 2^28) / 2^28 + T1_hi * 2^4
  3650.             L16_hi = (T1_hi % 2^32 - T1_hi % 2^28) / 2^28 + T1_lo * 2^4
  3651.             L21_lo = (T3_lo % 2^32 - T3_lo % 2^23) / 2^23 + T3_hi * 2^9
  3652.             L21_hi = (T3_hi % 2^32 - T3_hi % 2^23) / 2^23 + T3_lo * 2^9
  3653.             L01_lo = XOR(D_lo, L01_lo)
  3654.             L01_hi = XOR(D_hi, L01_hi)
                    -- Non-linear step, one group of five lanes per pair of statements: out = x xor (NOT y AND z).
                    -- All right-hand sides of a multiple assignment are evaluated before any target is written,
                    -- so every expression sees the pre-update lane values.
                    -- For groups 2..5 the operands are read from cyclically shifted positions within the group.
  3655.             L01_lo, L02_lo, L03_lo, L04_lo, L05_lo = XOR(L01_lo, AND(-1-L02_lo, L03_lo)), XOR(L02_lo, AND(-1-L03_lo, L04_lo)), XOR(L03_lo, AND(-1-L04_lo, L05_lo)), XOR(L04_lo, AND(-1-L05_lo, L01_lo)), XOR(L05_lo, AND(-1-L01_lo, L02_lo))
  3656.             L01_hi, L02_hi, L03_hi, L04_hi, L05_hi = XOR(L01_hi, AND(-1-L02_hi, L03_hi)), XOR(L02_hi, AND(-1-L03_hi, L04_hi)), XOR(L03_hi, AND(-1-L04_hi, L05_hi)), XOR(L04_hi, AND(-1-L05_hi, L01_hi)), XOR(L05_hi, AND(-1-L01_hi, L02_hi))
  3657.             L06_lo, L07_lo, L08_lo, L09_lo, L10_lo = XOR(L09_lo, AND(-1-L10_lo, L06_lo)), XOR(L10_lo, AND(-1-L06_lo, L07_lo)), XOR(L06_lo, AND(-1-L07_lo, L08_lo)), XOR(L07_lo, AND(-1-L08_lo, L09_lo)), XOR(L08_lo, AND(-1-L09_lo, L10_lo))
  3658.             L06_hi, L07_hi, L08_hi, L09_hi, L10_hi = XOR(L09_hi, AND(-1-L10_hi, L06_hi)), XOR(L10_hi, AND(-1-L06_hi, L07_hi)), XOR(L06_hi, AND(-1-L07_hi, L08_hi)), XOR(L07_hi, AND(-1-L08_hi, L09_hi)), XOR(L08_hi, AND(-1-L09_hi, L10_hi))
  3659.             L11_lo, L12_lo, L13_lo, L14_lo, L15_lo = XOR(L12_lo, AND(-1-L13_lo, L14_lo)), XOR(L13_lo, AND(-1-L14_lo, L15_lo)), XOR(L14_lo, AND(-1-L15_lo, L11_lo)), XOR(L15_lo, AND(-1-L11_lo, L12_lo)), XOR(L11_lo, AND(-1-L12_lo, L13_lo))
  3660.             L11_hi, L12_hi, L13_hi, L14_hi, L15_hi = XOR(L12_hi, AND(-1-L13_hi, L14_hi)), XOR(L13_hi, AND(-1-L14_hi, L15_hi)), XOR(L14_hi, AND(-1-L15_hi, L11_hi)), XOR(L15_hi, AND(-1-L11_hi, L12_hi)), XOR(L11_hi, AND(-1-L12_hi, L13_hi))
  3661.             L16_lo, L17_lo, L18_lo, L19_lo, L20_lo = XOR(L20_lo, AND(-1-L16_lo, L17_lo)), XOR(L16_lo, AND(-1-L17_lo, L18_lo)), XOR(L17_lo, AND(-1-L18_lo, L19_lo)), XOR(L18_lo, AND(-1-L19_lo, L20_lo)), XOR(L19_lo, AND(-1-L20_lo, L16_lo))
  3662.             L16_hi, L17_hi, L18_hi, L19_hi, L20_hi = XOR(L20_hi, AND(-1-L16_hi, L17_hi)), XOR(L16_hi, AND(-1-L17_hi, L18_hi)), XOR(L17_hi, AND(-1-L18_hi, L19_hi)), XOR(L18_hi, AND(-1-L19_hi, L20_hi)), XOR(L19_hi, AND(-1-L20_hi, L16_hi))
  3663.             L21_lo, L22_lo, L23_lo, L24_lo, L25_lo = XOR(L23_lo, AND(-1-L24_lo, L25_lo)), XOR(L24_lo, AND(-1-L25_lo, L21_lo)), XOR(L25_lo, AND(-1-L21_lo, L22_lo)), XOR(L21_lo, AND(-1-L22_lo, L23_lo)), XOR(L22_lo, AND(-1-L23_lo, L24_lo))
  3664.             L21_hi, L22_hi, L23_hi, L24_hi, L25_hi = XOR(L23_hi, AND(-1-L24_hi, L25_hi)), XOR(L24_hi, AND(-1-L25_hi, L21_hi)), XOR(L25_hi, AND(-1-L21_hi, L22_hi)), XOR(L21_hi, AND(-1-L22_hi, L23_hi)), XOR(L22_hi, AND(-1-L23_hi, L24_hi))
                    -- Mix the round constant into lane 1.
  3665.             L01_lo = XOR(L01_lo, RC_lo[round_idx])
  3666.             L01_hi = L01_hi + RC_hi[round_idx]      -- RC_hi[] is either 0 or 0x80000000, so we could use fast addition instead of slow XOR
  3667.          end
                 -- Write the permuted lanes back to the state tables.
  3668.          lanes_lo[1]  = L01_lo;  lanes_hi[1]  = L01_hi
  3669.          lanes_lo[2]  = L02_lo;  lanes_hi[2]  = L02_hi
  3670.          lanes_lo[3]  = L03_lo;  lanes_hi[3]  = L03_hi
  3671.          lanes_lo[4]  = L04_lo;  lanes_hi[4]  = L04_hi
  3672.          lanes_lo[5]  = L05_lo;  lanes_hi[5]  = L05_hi
  3673.          lanes_lo[6]  = L06_lo;  lanes_hi[6]  = L06_hi
  3674.          lanes_lo[7]  = L07_lo;  lanes_hi[7]  = L07_hi
  3675.          lanes_lo[8]  = L08_lo;  lanes_hi[8]  = L08_hi
  3676.          lanes_lo[9]  = L09_lo;  lanes_hi[9]  = L09_hi
  3677.          lanes_lo[10] = L10_lo;  lanes_hi[10] = L10_hi
  3678.          lanes_lo[11] = L11_lo;  lanes_hi[11] = L11_hi
  3679.          lanes_lo[12] = L12_lo;  lanes_hi[12] = L12_hi
  3680.          lanes_lo[13] = L13_lo;  lanes_hi[13] = L13_hi
  3681.          lanes_lo[14] = L14_lo;  lanes_hi[14] = L14_hi
  3682.          lanes_lo[15] = L15_lo;  lanes_hi[15] = L15_hi
  3683.          lanes_lo[16] = L16_lo;  lanes_hi[16] = L16_hi
  3684.          lanes_lo[17] = L17_lo;  lanes_hi[17] = L17_hi
  3685.          lanes_lo[18] = L18_lo;  lanes_hi[18] = L18_hi
  3686.          lanes_lo[19] = L19_lo;  lanes_hi[19] = L19_hi
  3687.          lanes_lo[20] = L20_lo;  lanes_hi[20] = L20_hi
  3688.          lanes_lo[21] = L21_lo;  lanes_hi[21] = L21_hi
  3689.          lanes_lo[22] = L22_lo;  lanes_hi[22] = L22_hi
  3690.          lanes_lo[23] = L23_lo;  lanes_hi[23] = L23_hi
  3691.          lanes_lo[24] = L24_lo;  lanes_hi[24] = L24_hi
  3692.          lanes_lo[25] = L25_lo;  lanes_hi[25] = L25_hi
  3693.       end
  3694.    end
  3695.  
  3696.  
  3697.    function blake2s_feed_64(H, str, offs, size, bytes_compressed, last_block_size, is_last_node)
              -- BLAKE2s compression over `size` bytes in 64-byte blocks; updates the eight state words H[1..8] in place.
              --    str               input string; when nil/false no bytes are loaded and the current contents of the
              --                      shared array common_W[1..16] are compressed instead
              --    offs              0-based offset of the first block inside str
              --    bytes_compressed  running byte counter; increased by (last_block_size or 64) for every block
              --    last_block_size   when truthy, used as the counter increment and also inverts vE (flag f0)
              --    is_last_node      when truthy, inverts vF (flag f1)
              -- Returns the updated bytes_compressed.
              -- XOR is a helper defined elsewhere in the file; -1-x stands in for bitwise NOT of x.
  3698.       -- offs >= 0, size >= 0, size is multiple of 64
  3699.       local W = common_W
  3700.       local h1, h2, h3, h4, h5, h6, h7, h8 = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
  3701.       for pos = offs, offs + size - 1, 64 do
  3702.          if str then
                    -- Load 16 little-endian 32-bit words (d is the most significant byte).
                    -- `pos` is advanced inside the body; this relies on the numeric `for` keeping its own internal counter.
  3703.             for j = 1, 16 do
  3704.                pos = pos + 4
  3705.                local a, b, c, d = byte(str, pos - 3, pos)
  3706.                W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3707.             end
  3708.          end
                 -- Work vector: v0..v7 copy the state; v8..vF start from sha2_H_hi[1..8]
                 -- (a table defined elsewhere in the file; presumably it doubles as the BLAKE2s IV -- confirm).
  3709.          local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  3710.          local v8, v9, vA, vB, vC, vD, vE, vF = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
                 -- Advance the byte counter and fold its two 32-bit halves into vC / vD.
  3711.          bytes_compressed = bytes_compressed + (last_block_size or 64)
  3712.          local t0 = bytes_compressed % 2^32
  3713.          local t1 = (bytes_compressed - t0) / 2^32
  3714.          vC = XOR(vC, t0)  -- t0 = low_4_bytes(bytes_compressed)
  3715.          vD = XOR(vD, t1)  -- t1 = high_4_bytes(bytes_compressed)
  3716.          if last_block_size then  -- flag f0
  3717.             vE = -1 - vE
  3718.          end
  3719.          if is_last_node then  -- flag f1
  3720.             vF = -1 - vF
  3721.          end
                 -- 10 rounds.  sigma[j] (table defined elsewhere) is indexed 1..16 and supplies the W index for each of the 16 additions.
                 -- Each round applies the same six-line mixing pattern to four columns
                 --    (v0,v4,v8,vC) (v1,v5,v9,vD) (v2,v6,vA,vE) (v3,v7,vB,vF)
                 -- and then to four diagonals
                 --    (v0,v5,vA,vF) (v1,v6,vB,vC) (v2,v7,v8,vD) (v3,v4,v9,vE).
                 -- Rotate-right by k is done arithmetically: x = value % 2^32 / 2^k, then x % 1 * (2^32 - 1) + x
                 -- moves the fraction (the k shifted-out bits) to the top; the amounts used are 16, 12, 8 and 7.
                 -- The running sums (v0, v8, ...) are not reduced here; the % 2^32 is applied to each XOR result before it is rotated.
  3722.          for j = 1, 10 do
  3723.             local row = sigma[j]
  3724.             v0 = v0 + v4 + W[row[1]]
  3725.             vC = XOR(vC, v0) % 2^32 / 2^16
  3726.             vC = vC % 1 * (2^32 - 1) + vC
  3727.             v8 = v8 + vC
  3728.             v4 = XOR(v4, v8) % 2^32 / 2^12
  3729.             v4 = v4 % 1 * (2^32 - 1) + v4
  3730.             v0 = v0 + v4 + W[row[2]]
  3731.             vC = XOR(vC, v0) % 2^32 / 2^8
  3732.             vC = vC % 1 * (2^32 - 1) + vC
  3733.             v8 = v8 + vC
  3734.             v4 = XOR(v4, v8) % 2^32 / 2^7
  3735.             v4 = v4 % 1 * (2^32 - 1) + v4
  3736.             v1 = v1 + v5 + W[row[3]]
  3737.             vD = XOR(vD, v1) % 2^32 / 2^16
  3738.             vD = vD % 1 * (2^32 - 1) + vD
  3739.             v9 = v9 + vD
  3740.             v5 = XOR(v5, v9) % 2^32 / 2^12
  3741.             v5 = v5 % 1 * (2^32 - 1) + v5
  3742.             v1 = v1 + v5 + W[row[4]]
  3743.             vD = XOR(vD, v1) % 2^32 / 2^8
  3744.             vD = vD % 1 * (2^32 - 1) + vD
  3745.             v9 = v9 + vD
  3746.             v5 = XOR(v5, v9) % 2^32 / 2^7
  3747.             v5 = v5 % 1 * (2^32 - 1) + v5
  3748.             v2 = v2 + v6 + W[row[5]]
  3749.             vE = XOR(vE, v2) % 2^32 / 2^16
  3750.             vE = vE % 1 * (2^32 - 1) + vE
  3751.             vA = vA + vE
  3752.             v6 = XOR(v6, vA) % 2^32 / 2^12
  3753.             v6 = v6 % 1 * (2^32 - 1) + v6
  3754.             v2 = v2 + v6 + W[row[6]]
  3755.             vE = XOR(vE, v2) % 2^32 / 2^8
  3756.             vE = vE % 1 * (2^32 - 1) + vE
  3757.             vA = vA + vE
  3758.             v6 = XOR(v6, vA) % 2^32 / 2^7
  3759.             v6 = v6 % 1 * (2^32 - 1) + v6
  3760.             v3 = v3 + v7 + W[row[7]]
  3761.             vF = XOR(vF, v3) % 2^32 / 2^16
  3762.             vF = vF % 1 * (2^32 - 1) + vF
  3763.             vB = vB + vF
  3764.             v7 = XOR(v7, vB) % 2^32 / 2^12
  3765.             v7 = v7 % 1 * (2^32 - 1) + v7
  3766.             v3 = v3 + v7 + W[row[8]]
  3767.             vF = XOR(vF, v3) % 2^32 / 2^8
  3768.             vF = vF % 1 * (2^32 - 1) + vF
  3769.             vB = vB + vF
  3770.             v7 = XOR(v7, vB) % 2^32 / 2^7
  3771.             v7 = v7 % 1 * (2^32 - 1) + v7
  3772.             v0 = v0 + v5 + W[row[9]]
  3773.             vF = XOR(vF, v0) % 2^32 / 2^16
  3774.             vF = vF % 1 * (2^32 - 1) + vF
  3775.             vA = vA + vF
  3776.             v5 = XOR(v5, vA) % 2^32 / 2^12
  3777.             v5 = v5 % 1 * (2^32 - 1) + v5
  3778.             v0 = v0 + v5 + W[row[10]]
  3779.             vF = XOR(vF, v0) % 2^32 / 2^8
  3780.             vF = vF % 1 * (2^32 - 1) + vF
  3781.             vA = vA + vF
  3782.             v5 = XOR(v5, vA) % 2^32 / 2^7
  3783.             v5 = v5 % 1 * (2^32 - 1) + v5
  3784.             v1 = v1 + v6 + W[row[11]]
  3785.             vC = XOR(vC, v1) % 2^32 / 2^16
  3786.             vC = vC % 1 * (2^32 - 1) + vC
  3787.             vB = vB + vC
  3788.             v6 = XOR(v6, vB) % 2^32 / 2^12
  3789.             v6 = v6 % 1 * (2^32 - 1) + v6
  3790.             v1 = v1 + v6 + W[row[12]]
  3791.             vC = XOR(vC, v1) % 2^32 / 2^8
  3792.             vC = vC % 1 * (2^32 - 1) + vC
  3793.             vB = vB + vC
  3794.             v6 = XOR(v6, vB) % 2^32 / 2^7
  3795.             v6 = v6 % 1 * (2^32 - 1) + v6
  3796.             v2 = v2 + v7 + W[row[13]]
  3797.             vD = XOR(vD, v2) % 2^32 / 2^16
  3798.             vD = vD % 1 * (2^32 - 1) + vD
  3799.             v8 = v8 + vD
  3800.             v7 = XOR(v7, v8) % 2^32 / 2^12
  3801.             v7 = v7 % 1 * (2^32 - 1) + v7
  3802.             v2 = v2 + v7 + W[row[14]]
  3803.             vD = XOR(vD, v2) % 2^32 / 2^8
  3804.             vD = vD % 1 * (2^32 - 1) + vD
  3805.             v8 = v8 + vD
  3806.             v7 = XOR(v7, v8) % 2^32 / 2^7
  3807.             v7 = v7 % 1 * (2^32 - 1) + v7
  3808.             v3 = v3 + v4 + W[row[15]]
  3809.             vE = XOR(vE, v3) % 2^32 / 2^16
  3810.             vE = vE % 1 * (2^32 - 1) + vE
  3811.             v9 = v9 + vE
  3812.             v4 = XOR(v4, v9) % 2^32 / 2^12
  3813.             v4 = v4 % 1 * (2^32 - 1) + v4
  3814.             v3 = v3 + v4 + W[row[16]]
  3815.             vE = XOR(vE, v3) % 2^32 / 2^8
  3816.             vE = vE % 1 * (2^32 - 1) + vE
  3817.             v9 = v9 + vE
  3818.             v4 = XOR(v4, v9) % 2^32 / 2^7
  3819.             v4 = v4 % 1 * (2^32 - 1) + v4
  3820.          end
                 -- Fold both halves of the work vector back into the state: h[i] = h[i] xor v[i] xor v[i+8].
  3821.          h1 = XOR(h1, v0, v8)
  3822.          h2 = XOR(h2, v1, v9)
  3823.          h3 = XOR(h3, v2, vA)
  3824.          h4 = XOR(h4, v3, vB)
  3825.          h5 = XOR(h5, v4, vC)
  3826.          h6 = XOR(h6, v5, vD)
  3827.          h7 = XOR(h7, v6, vE)
  3828.          h8 = XOR(h8, v7, vF)
  3829.       end
  3830.       H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8] = h1, h2, h3, h4, h5, h6, h7, h8
  3831.       return bytes_compressed
  3832.    end
  3833.  
  3834.  
  3835.    function blake2b_feed_128(H_lo, H_hi, str, offs, size, bytes_compressed, last_block_size, is_last_node)
  3836.       -- offs >= 0, size >= 0, size is multiple of 128
  3837.       local W = common_W
  3838.       local h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
  3839.       local h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
  3840.       for pos = offs, offs + size - 1, 128 do
  3841.          if str then
  3842.             for j = 1, 32 do
  3843.                pos = pos + 4
  3844.                local a, b, c, d = byte(str, pos - 3, pos)
  3845.                W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3846.             end
  3847.          end
  3848.          local v0_lo, v1_lo, v2_lo, v3_lo, v4_lo, v5_lo, v6_lo, v7_lo = h1_lo, h2_lo, h3_lo, h4_lo, h5_lo, h6_lo, h7_lo, h8_lo
  3849.          local v0_hi, v1_hi, v2_hi, v3_hi, v4_hi, v5_hi, v6_hi, v7_hi = h1_hi, h2_hi, h3_hi, h4_hi, h5_hi, h6_hi, h7_hi, h8_hi
  3850.          local v8_lo, v9_lo, vA_lo, vB_lo, vC_lo, vD_lo, vE_lo, vF_lo = sha2_H_lo[1], sha2_H_lo[2], sha2_H_lo[3], sha2_H_lo[4], sha2_H_lo[5], sha2_H_lo[6], sha2_H_lo[7], sha2_H_lo[8]
  3851.          local v8_hi, v9_hi, vA_hi, vB_hi, vC_hi, vD_hi, vE_hi, vF_hi = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4], sha2_H_hi[5], sha2_H_hi[6], sha2_H_hi[7], sha2_H_hi[8]
  3852.          bytes_compressed = bytes_compressed + (last_block_size or 128)
  3853.          local t0_lo = bytes_compressed % 2^32
  3854.          local t0_hi = (bytes_compressed - t0_lo) / 2^32
  3855.          vC_lo = XOR(vC_lo, t0_lo)  -- t0 = low_8_bytes(bytes_compressed)
  3856.          vC_hi = XOR(vC_hi, t0_hi)
  3857.          -- t1 = high_8_bytes(bytes_compressed) = 0,  message length is always below 2^53 bytes
  3858.          if last_block_size then  -- flag f0
  3859.             vE_lo = -1 - vE_lo
  3860.             vE_hi = -1 - vE_hi
  3861.          end
  3862.          if is_last_node then  -- flag f1
  3863. -- offs >= 0, size >= 0, size is multiple of 64
  3864.       block_length = block_length or 64
  3865.       local W = common_W
  3866.       local h1, h2, h3, h4, h5, h6, h7, h8 = H_in[1], H_in[2], H_in[3], H_in[4], H_in[5], H_in[6], H_in[7], H_in[8]
  3867.       H_out = H_out or H_in
  3868.       for pos = offs, offs + size - 1, 64 do
  3869.          if str then
  3870.             for j = 1, 16 do
  3871.                pos = pos + 4
  3872.                local a, b, c, d = byte(str, pos - 3, pos)
  3873.                W[j] = ((d * 256 + c) * 256 + b) * 256 + a
  3874.             end
  3875.          end
  3876.          local v0, v1, v2, v3, v4, v5, v6, v7 = h1, h2, h3, h4, h5, h6, h7, h8
  3877.          local v8, v9, vA, vB = sha2_H_hi[1], sha2_H_hi[2], sha2_H_hi[3], sha2_H_hi[4]
  3878.          local vC = chunk_index % 2^32         -- t0 = low_4_bytes(chunk_index)
  3879.          local vD = (chunk_index - vC) / 2^32  -- t1 = high_4_bytes(chunk_index)
  3880.          local vE, vF = block_length, flags
  3881.          for j = 1, 7 do
  3882.             v0 = v0 + v4 + W[perm_blake3[j]]
  3883.             vC = XOR(vC, v0) % 2^32 / 2^16
  3884.             vC = vC % 1 * (2^32 - 1) + vC
  3885.             v8 = v8 + vC
  3886.             v4 = XOR(v4, v8) % 2^32 / 2^12
  3887.             v4 = v4 % 1 * (2^32 - 1) + v4
  3888.             v0 = v0 + v4 + W[perm_blake3[j + 14]]
  3889.             vC = XOR(vC, v0) % 2^32 / 2^8
  3890.             vC = vC % 1 * (2^32 - 1) + vC
  3891.             v8 = v8 + vC
  3892.             v4 = XOR(v4, v8) % 2^32 / 2^7
  3893.             v4 = v4 % 1 * (2^32 - 1) + v4
  3894.             v1 = v1 + v5 + W[perm_blake3[j + 1]]
  3895.             vD = XOR(vD, v1) % 2^32 / 2^16
  3896.             vD = vD % 1 * (2^32 - 1) + vD
  3897.             v9 = v9 + vD
  3898.             v5 = XOR(v5, v9) % 2^32 / 2^12
  3899.             v5 = v5 % 1 * (2^32 - 1) + v5
  3900.             v1 = v1 + v5 + W[perm_blake3[j + 2]]
  3901.             vD = XOR(vD, v1) % 2^32 / 2^8
  3902.             vD = vD % 1 * (2^32 - 1) + vD
  3903.             v9 = v9 + vD
  3904.             v5 = XOR(v5, v9) % 2^32 / 2^7
  3905.             v5 = v5 % 1 * (2^32 - 1) + v5
  3906.             v2 = v2 + v6 + W[perm_blake3[j + 16]]
  3907.             vE = XOR(vE, v2) % 2^32 / 2^16
  3908.             vE = vE % 1 * (2^32 - 1) + vE
  3909.             vA = vA + vE
  3910.             v6 = XOR(v6, vA) % 2^32 / 2^12
  3911.             v6 = v6 % 1 * (2^32 - 1) + v6
  3912.             v2 = v2 + v6 + W[perm_blake3[j + 7]]
  3913.             vE = XOR(vE, v2) % 2^32 / 2^8
  3914.             vE = vE % 1 * (2^32 - 1) + vE
  3915.             vA = vA + vE
  3916.             v6 = XOR(v6, vA) % 2^32 / 2^7
  3917.             v6 = v6 % 1 * (2^32 - 1) + v6
  3918.             v3 = v3 + v7 + W[perm_blake3[j + 15]]
  3919.             vF = XOR(vF, v3) % 2^32 / 2^16
  3920.             vF = vF % 1 * (2^32 - 1) + vF
  3921.             vB = vB + vF
  3922.             v7 = XOR(v7, vB) % 2^32 / 2^12
  3923.             v7 = v7 % 1 * (2^32 - 1) + v7
  3924.             v3 = v3 + v7 + W[perm_blake3[j + 17]]
  3925.             vF = XOR(vF, v3) % 2^32 / 2^8
  3926.             vF = vF % 1 * (2^32 - 1) + vF
  3927.             vB = vB + vF
  3928.             v7 = XOR(v7, vB) % 2^32 / 2^7
  3929.             v7 = v7 % 1 * (2^32 - 1) + v7
  3930.             v0 = v0 + v5 + W[perm_blake3[j + 21]]
  3931.             vF = XOR(vF, v0) % 2^32 / 2^16
  3932.             vF = vF % 1 * (2^32 - 1) + vF
  3933.             vA = vA + vF
  3934.             v5 = XOR(v5, vA) % 2^32 / 2^12
  3935.             v5 = v5 % 1 * (2^32 - 1) + v5
  3936.             v0 = v0 + v5 + W[perm_blake3[j + 5]]
  3937.             vF = XOR(vF, v0) % 2^32 / 2^8
  3938.             vF = vF % 1 * (2^32 - 1) + vF
  3939.             vA = vA + vF
  3940.             v5 = XOR(v5, vA) % 2^32 / 2^7
  3941.             v5 = v5 % 1 * (2^32 - 1) + v5
  3942.             v1 = v1 + v6 + W[perm_blake3[j + 3]]
  3943.             vC = XOR(vC, v1) % 2^32 / 2^16
  3944.             vC = vC % 1 * (2^32 - 1) + vC
  3945.             vB = vB + vC
  3946.             v6 = XOR(v6, vB) % 2^32 / 2^12
  3947.             v6 = v6 % 1 * (2^32 - 1) + v6
  3948.             v1 = v1 + v6 + W[perm_blake3[j + 6]]
  3949.             vC = XOR(vC, v1) % 2^32 / 2^8
  3950.             vC = vC % 1 * (2^32 - 1) + vC
  3951.             vB = vB + vC
  3952.             v6 = XOR(v6, vB) % 2^32 / 2^7
  3953.             v6 = v6 % 1 * (2^32 - 1) + v6
  3954.             v2 = v2 + v7 + W[perm_blake3[j + 4]]
  3955.             vD = XOR(vD, v2) % 2^32 / 2^16
  3956.             vD = vD % 1 * (2^32 - 1) + vD
  3957.             v8 = v8 + vD
  3958.             v7 = XOR(v7, v8) % 2^32 / 2^12
  3959.             v7 = v7 % 1 * (2^32 - 1) + v7
  3960.             v2 = v2 + v7 + W[perm_blake3[j + 18]]
  3961.             vD = XOR(vD, v2) % 2^32 / 2^8
  3962.             vD = vD % 1 * (2^32 - 1) + vD
  3963.             v8 = v8 + vD
  3964.             v7 = XOR(v7, v8) % 2^32 / 2^7
  3965.             v7 = v7 % 1 * (2^32 - 1) + v7
  3966.             v3 = v3 + v4 + W[perm_blake3[j + 19]]
  3967.             vE = XOR(vE, v3) % 2^32 / 2^16
  3968.             vE = vE % 1 * (2^32 - 1) + vE
  3969.             v9 = v9 + vE
  3970.             v4 = XOR(v4, v9) % 2^32 / 2^12
  3971.             v4 = v4 % 1 * (2^32 - 1) + v4
  3972.             v3 = v3 + v4 + W[perm_blake3[j + 20]]
  3973.             vE = XOR(vE, v3) % 2^32 / 2^8
  3974.             vE = vE % 1 * (2^32 - 1) + vE
  3975.             v9 = v9 + vE
  3976.             v4 = XOR(v4, v9) % 2^32 / 2^7
  3977.             v4 = v4 % 1 * (2^32 - 1) + v4
  3978.          end
  3979.          if wide_output then
  3980.             H_out[ 9] = XOR(h1, v8)
  3981.             H_out[10] = XOR(h2, v9)
  3982.             H_out[11] = XOR(h3, vA)
  3983.             H_out[12] = XOR(h4, vB)
  3984.             H_out[13] = XOR(h5, vC)
  3985.             H_out[14] = XOR(h6, vD)
  3986.             H_out[15] = XOR(h7, vE)
  3987.             H_out[16] = XOR(h8, vF)
  3988.          end
  3989.          h1 = XOR(v0, v8)
  3990.          h2 = XOR(v1, v9)
  3991.          h3 = XOR(v2, vA)
  3992.          h4 = XOR(v3, vB)
  3993.          h5 = XOR(v4, vC)
  3994.          h6 = XOR(v5, vD)
  3995.          h7 = XOR(v6, vE)
  3996.          h8 = XOR(v7, vF)
  3997.       end
  3998.       H_out[1], H_out[2], H_out[3], H_out[4], H_out[5], H_out[6], H_out[7], H_out[8] = h1, h2, h3, h4, h5, h6, h7, h8
  3999.    end
  4000.  
  4001. end
  4002.  
  4003.  
  4004. --------------------------------------------------------------------------------
  4005. -- MAGIC NUMBERS CALCULATOR
  4006. --------------------------------------------------------------------------------
  4007. -- Q:
  4008. --    Is 53-bit "double" math enough to calculate square roots and cube roots of primes with 64 correct bits after decimal point?
  4009. -- A:
  4010. --    Yes, 53-bit "double" arithmetic is enough.
  4011. --    We could obtain first 40 bits by direct calculation of p^(1/3) and next 40 bits by one step of Newton's method.
  4012.  
  4013. do
  4014.    local function mul(src1, src2, factor, result_length)
  4015.       -- src1, src2 - long integers (arrays of digits in base 2^24)
  4016.       -- factor - small integer
  4017.       -- returns long integer result (src1 * src2 * factor) and its floating point approximation
  4018.       local result, carry, value, weight = {}, 0.0, 0.0, 1.0
  4019.       for j = 1, result_length do
  4020.          for k = math_max(1, j + 1 - #src2), math_min(j, #src1) do
  4021.             carry = carry + factor * src1[k] * src2[j + 1 - k]  -- "int32" is not enough for multiplication result, that's why "factor" must be of type "double"
  4022.          end
  4023.          local digit = carry % 2^24
  4024.          result[j] = floor(digit)
  4025.          carry = (carry - digit) / 2^24
  4026.          value = value + digit * weight
  4027.          weight = weight * 2^24
  4028.       end
  4029.       return result, value
  4030.    end
  4031.  
  4032.    local idx, step, p, one, sqrt_hi, sqrt_lo = 0, {4, 1, 2, -2, 2}, 4, {1}, sha2_H_hi, sha2_H_lo
  4033.    repeat
  4034.       p = p + step[p % 6]
  4035.       local d = 1
  4036.       repeat
  4037.          d = d + step[d % 6]
  4038.          if d*d > p then -- next prime number is found
  4039.             local root = p^(1/3)
  4040.             local R = root * 2^40
  4041.             R = mul({R - R % 1}, one, 1.0, 2)
  4042.             local _, delta = mul(R, mul(R, R, 1.0, 4), -1.0, 4)
  4043.             local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
  4044.             local lo = R[1] % 256 * 16777216 + floor(delta * (2^-56 / 3) * root / p)
  4045.             if idx < 16 then
  4046.                root = p^(1/2)
  4047.                R = root * 2^40
  4048.                R = mul({R - R % 1}, one, 1.0, 2)
  4049.                _, delta = mul(R, R, -1.0, 2)
  4050.                local hi = R[2] % 65536 * 65536 + floor(R[1] / 256)
  4051.                local lo = R[1] % 256 * 16777216 + floor(delta * 2^-17 / root)
  4052.                local idx = idx % 8 + 1
  4053.                sha2_H_ext256[224][idx] = lo
  4054.                sqrt_hi[idx], sqrt_lo[idx] = hi, lo + hi * hi_factor
  4055.                if idx > 7 then
  4056.                   sqrt_hi, sqrt_lo = sha2_H_ext512_hi[384], sha2_H_ext512_lo[384]
  4057.                end
  4058.             end
  4059.             idx = idx + 1
  4060.             sha2_K_hi[idx], sha2_K_lo[idx] = hi, lo % K_lo_modulo + hi * hi_factor
  4061.             break
  4062.          end
  4063.       until p % d == 0
  4064.    until idx > 79
  4065. end
  4066.  
  4067. -- Calculating IVs for SHA512/224 and SHA512/256
  4068. for width = 224, 256, 32 do
  4069.    local H_lo, H_hi = {}
  4070.    if HEX64 then
  4071.       for j = 1, 8 do
  4072.          H_lo[j] = XORA5(sha2_H_lo[j])
  4073.       end
  4074.    else
  4075.       H_hi = {}
  4076.       for j = 1, 8 do
  4077.          H_lo[j] = XORA5(sha2_H_lo[j])
  4078.          H_hi[j] = XORA5(sha2_H_hi[j])
  4079.       end
  4080.    end
  4081.    sha512_feed_128(H_lo, H_hi, "SHA-512/"..tostring(width).."\128"..string_rep("\0", 115).."\88", 0, 128)
  4082.    sha2_H_ext512_lo[width] = H_lo
  4083.    sha2_H_ext512_hi[width] = H_hi
  4084. end
  4085.  
  4086. -- Constants for MD5
  4087. do
  4088.    local sin, abs, modf = math.sin, math.abs, math.modf
  4089.    for idx = 1, 64 do
  4090.       -- we can't use formula floor(abs(sin(idx))*2^32) because its result may be beyond integer range on Lua built with 32-bit integers
  4091.       local hi, lo = modf(abs(sin(idx)) * 2^16)
  4092.       md5_K[idx] = hi * 65536 + floor(lo * 2^16)
  4093.    end
  4094. end
  4095.  
  4096. -- Constants for SHA-3
  4097. do
  4098.    local sh_reg = 29
  4099.  
  4100.    local function next_bit()
  4101.       local r = sh_reg % 2
  4102.       sh_reg = XOR_BYTE((sh_reg - r) / 2, 142 * r)
  4103.       return r
  4104.    end
  4105.  
  4106.    for idx = 1, 24 do
  4107.       local lo, m = 0
  4108.       for _ = 1, 6 do
  4109.          m = m and m * m * 2 or 1
  4110.          lo = lo + next_bit() * m
  4111.       end
  4112.       local hi = next_bit() * m
  4113.       sha3_RC_hi[idx], sha3_RC_lo[idx] = hi, lo + hi * hi_factor_keccak
  4114.    end
  4115. end
  4116.  
  4117. if branch == "FFI" then
  4118.    sha2_K_hi = ffi.new("uint32_t[?]", #sha2_K_hi + 1, 0, unpack(sha2_K_hi))
  4119.    sha2_K_lo = ffi.new("int64_t[?]",  #sha2_K_lo + 1, 0, unpack(sha2_K_lo))
  4120.    --md5_K = ffi.new("uint32_t[?]", #md5_K + 1, 0, unpack(md5_K))
  4121.    if hi_factor_keccak == 0 then
  4122.       sha3_RC_lo = ffi.new("uint32_t[?]", #sha3_RC_lo + 1, 0, unpack(sha3_RC_lo))
  4123.       sha3_RC_hi = ffi.new("uint32_t[?]", #sha3_RC_hi + 1, 0, unpack(sha3_RC_hi))
  4124.    else
  4125.       sha3_RC_lo = ffi.new("int64_t[?]", #sha3_RC_lo + 1, 0, unpack(sha3_RC_lo))
  4126.    end
  4127. end
  4128.  
  4129.  
  4130. --------------------------------------------------------------------------------
  4131. -- MAIN FUNCTIONS
  4132. --------------------------------------------------------------------------------
  4133.  
  4134. local function sha256ext(width, message)
  4135.    -- Create an instance (private objects for current calculation)
  4136.    local H, length, tail = {unpack(sha2_H_ext256[width])}, 0.0, ""
  4137.  
  4138.    local function partial(message_part)
  4139.       if message_part then
  4140.          if tail then
  4141.             length = length + #message_part
  4142.             local offs = 0
  4143.             if tail ~= "" and #tail + #message_part >= 64 then
  4144.                offs = 64 - #tail
  4145.                sha256_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
  4146.                tail = ""
  4147.             end
  4148.             local size = #message_part - offs
  4149.             local size_tail = size % 64
  4150.             sha256_feed_64(H, message_part, offs, size - size_tail)
  4151.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4152.             return partial
  4153.          else
  4154.             error("Adding more chunks is not allowed after receiving the result", 2)
  4155.          end
  4156.       else
  4157.          if tail then
  4158.             local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
  4159.             tail = nil
  4160.             -- Assuming user data length is shorter than (2^53)-9 bytes
  4161.             -- Anyway, it looks very unrealistic that someone would spend more than a year of calculations to process 2^53 bytes of data by using this Lua script :-)
  4162.             -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
  4163.             length = length * (8 / 256^7)  -- convert "byte-counter" to "bit-counter" and move decimal point to the left
  4164.             for j = 4, 10 do
  4165.                length = length % 1 * 256
  4166.                final_blocks[j] = char(floor(length))
  4167.             end
  4168.             final_blocks = table_concat(final_blocks)
  4169.             sha256_feed_64(H, final_blocks, 0, #final_blocks)
  4170.             local max_reg = width / 32
  4171.             for j = 1, max_reg do
  4172.                H[j] = HEX(H[j])
  4173.             end
  4174.             H = table_concat(H, "", 1, max_reg)
  4175.          end
  4176.          return H
  4177.       end
  4178.    end
  4179.  
  4180.    if message then
  4181.       -- Actually perform calculations and return the SHA256 digest of a message
  4182.       return partial(message)()
  4183.    else
  4184.       -- Return function for chunk-by-chunk loading
  4185.       -- User should feed every chunk of input data as single argument to this function and finally get SHA256 digest by invoking this function without an argument
  4186.       return partial
  4187.    end
  4188. end
  4189.  
  4190.  
  4191. local function sha512ext(width, message)
  4192.    -- Create an instance (private objects for current calculation)
  4193.    local length, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_ext512_lo[width])}, not HEX64 and {unpack(sha2_H_ext512_hi[width])}
  4194.  
  4195.    local function partial(message_part)
  4196.       if message_part then
  4197.          if tail then
  4198.             length = length + #message_part
  4199.             local offs = 0
  4200.             if tail ~= "" and #tail + #message_part >= 128 then
  4201.                offs = 128 - #tail
  4202.                sha512_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128)
  4203.                tail = ""
  4204.             end
  4205.             local size = #message_part - offs
  4206.             local size_tail = size % 128
  4207.             sha512_feed_128(H_lo, H_hi, message_part, offs, size - size_tail)
  4208.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4209.             return partial
  4210.          else
  4211.             error("Adding more chunks is not allowed after receiving the result", 2)
  4212.          end
  4213.       else
  4214.          if tail then
  4215.             local final_blocks = {tail, "\128", string_rep("\0", (-17-length) % 128 + 9)}
  4216.             tail = nil
  4217.             -- Assuming user data length is shorter than (2^53)-17 bytes
  4218.             -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
  4219.             length = length * (8 / 256^7)  -- convert "byte-counter" to "bit-counter" and move floating point to the left
  4220.             for j = 4, 10 do
  4221.                length = length % 1 * 256
  4222.                final_blocks[j] = char(floor(length))
  4223.             end
  4224.             final_blocks = table_concat(final_blocks)
  4225.             sha512_feed_128(H_lo, H_hi, final_blocks, 0, #final_blocks)
  4226.             local max_reg = ceil(width / 64)
  4227.             if HEX64 then
  4228.                for j = 1, max_reg do
  4229.                   H_lo[j] = HEX64(H_lo[j])
  4230.                end
  4231.             else
  4232.                for j = 1, max_reg do
  4233.                   H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
  4234.                end
  4235.                H_hi = nil
  4236.             end
  4237.             H_lo = sub(table_concat(H_lo, "", 1, max_reg), 1, width / 4)
  4238.          end
  4239.          return H_lo
  4240.       end
  4241.    end
  4242.  
  4243.    if message then
  4244.       -- Actually perform calculations and return the SHA512 digest of a message
  4245.       return partial(message)()
  4246.    else
  4247.       -- Return function for chunk-by-chunk loading
  4248.       -- User should feed every chunk of input data as single argument to this function and finally get SHA512 digest by invoking this function without an argument
  4249.       return partial
  4250.    end
  4251. end
  4252.  
  4253.  
  4254. local function md5(message)
  4255.    -- Create an instance (private objects for current calculation)
  4256.    local H, length, tail = {unpack(md5_sha1_H, 1, 4)}, 0.0, ""
  4257.  
  4258.    local function partial(message_part)
  4259.       if message_part then
  4260.          if tail then
  4261.             length = length + #message_part
  4262.             local offs = 0
  4263.             if tail ~= "" and #tail + #message_part >= 64 then
  4264.                offs = 64 - #tail
  4265.                md5_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
  4266.                tail = ""
  4267.             end
  4268.             local size = #message_part - offs
  4269.             local size_tail = size % 64
  4270.             md5_feed_64(H, message_part, offs, size - size_tail)
  4271.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4272.             return partial
  4273.          else
  4274.             error("Adding more chunks is not allowed after receiving the result", 2)
  4275.          end
  4276.       else
  4277.          if tail then
  4278.             local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64)}
  4279.             tail = nil
  4280.             length = length * 8  -- convert "byte-counter" to "bit-counter"
  4281.             for j = 4, 11 do
  4282.                local low_byte = length % 256
  4283.                final_blocks[j] = char(low_byte)
  4284.                length = (length - low_byte) / 256
  4285.             end
  4286.             final_blocks = table_concat(final_blocks)
  4287.             md5_feed_64(H, final_blocks, 0, #final_blocks)
  4288.             for j = 1, 4 do
  4289.                H[j] = HEX(H[j])
  4290.             end
  4291.             H = gsub(table_concat(H), "(..)(..)(..)(..)", "%4%3%2%1")
  4292.          end
  4293.          return H
  4294.       end
  4295.    end
  4296.  
  4297.    if message then
  4298.       -- Actually perform calculations and return the MD5 digest of a message
  4299.       return partial(message)()
  4300.    else
  4301.       -- Return function for chunk-by-chunk loading
  4302.       -- User should feed every chunk of input data as single argument to this function and finally get MD5 digest by invoking this function without an argument
  4303.       return partial
  4304.    end
  4305. end
  4306.  
  4307.  
  4308. local function sha1(message)
  4309.    -- Create an instance (private objects for current calculation)
  4310.    local H, length, tail = {unpack(md5_sha1_H)}, 0.0, ""
  4311.  
  4312.    local function partial(message_part)
  4313.       if message_part then
  4314.          if tail then
  4315.             length = length + #message_part
  4316.             local offs = 0
  4317.             if tail ~= "" and #tail + #message_part >= 64 then
  4318.                offs = 64 - #tail
  4319.                sha1_feed_64(H, tail..sub(message_part, 1, offs), 0, 64)
  4320.                tail = ""
  4321.             end
  4322.             local size = #message_part - offs
  4323.             local size_tail = size % 64
  4324.             sha1_feed_64(H, message_part, offs, size - size_tail)
  4325.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4326.             return partial
  4327.          else
  4328.             error("Adding more chunks is not allowed after receiving the result", 2)
  4329.          end
  4330.       else
  4331.          if tail then
  4332.             local final_blocks = {tail, "\128", string_rep("\0", (-9 - length) % 64 + 1)}
  4333.             tail = nil
  4334.             -- Assuming user data length is shorter than (2^53)-9 bytes
  4335.             -- 2^53 bytes = 2^56 bits, so "bit-counter" fits in 7 bytes
  4336.             length = length * (8 / 256^7)  -- convert "byte-counter" to "bit-counter" and move decimal point to the left
  4337.             for j = 4, 10 do
  4338.                length = length % 1 * 256
  4339.                final_blocks[j] = char(floor(length))
  4340.             end
  4341.             final_blocks = table_concat(final_blocks)
  4342.             sha1_feed_64(H, final_blocks, 0, #final_blocks)
  4343.             for j = 1, 5 do
  4344.                H[j] = HEX(H[j])
  4345.             end
  4346.             H = table_concat(H)
  4347.          end
  4348.          return H
  4349.       end
  4350.    end
  4351.  
  4352.    if message then
  4353.       -- Actually perform calculations and return the SHA-1 digest of a message
  4354.       return partial(message)()
  4355.    else
  4356.       -- Return function for chunk-by-chunk loading
  4357.       -- User should feed every chunk of input data as single argument to this function and finally get SHA-1 digest by invoking this function without an argument
  4358.       return partial
  4359.    end
  4360. end
  4361.  
  4362.  
  4363. local function keccak(block_size_in_bytes, digest_size_in_bytes, is_SHAKE, message)
  4364.    -- "block_size_in_bytes" is multiple of 8
  4365.    if type(digest_size_in_bytes) ~= "number" then
  4366.       -- arguments in SHAKE are swapped:
  4367.       --    NIST FIPS 202 defines SHAKE(message,num_bits)
  4368.       --    this module   defines SHAKE(num_bytes,message)
  4369.       -- it's easy to forget about this swap, hence the check
  4370.       error("Argument 'digest_size_in_bytes' must be a number", 2)
  4371.    end
  4372.    -- Create an instance (private objects for current calculation)
  4373.    local tail, lanes_lo, lanes_hi = "", create_array_of_lanes(), hi_factor_keccak == 0 and create_array_of_lanes()
  4374.    local result
  4375.  
  4376.    local function partial(message_part)
  4377.       if message_part then
  4378.          if tail then
  4379.             local offs = 0
  4380.             if tail ~= "" and #tail + #message_part >= block_size_in_bytes then
  4381.                offs = block_size_in_bytes - #tail
  4382.                keccak_feed(lanes_lo, lanes_hi, tail..sub(message_part, 1, offs), 0, block_size_in_bytes, block_size_in_bytes)
  4383.                tail = ""
  4384.             end
  4385.             local size = #message_part - offs
  4386.             local size_tail = size % block_size_in_bytes
  4387.             keccak_feed(lanes_lo, lanes_hi, message_part, offs, size - size_tail, block_size_in_bytes)
  4388.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4389.             return partial
  4390.          else
  4391.             error("Adding more chunks is not allowed after receiving the result", 2)
  4392.          end
  4393.       else
  4394.          if tail then
  4395.             -- append the following bits to the message: for usual SHA-3: 011(0*)1, for SHAKE: 11111(0*)1
  4396.             local gap_start = is_SHAKE and 31 or 6
  4397.             tail = tail..(#tail + 1 == block_size_in_bytes and char(gap_start + 128) or char(gap_start)..string_rep("\0", (-2 - #tail) % block_size_in_bytes).."\128")
  4398.             keccak_feed(lanes_lo, lanes_hi, tail, 0, #tail, block_size_in_bytes)
  4399.             tail = nil
  4400.             local lanes_used = 0
  4401.             local total_lanes = floor(block_size_in_bytes / 8)
  4402.             local qwords = {}
  4403.  
  4404.             local function get_next_qwords_of_digest(qwords_qty)
  4405.                -- returns not more than 'qwords_qty' qwords ('qwords_qty' might be non-integer)
  4406.                -- doesn't go across keccak-buffer boundary
  4407.                -- block_size_in_bytes is a multiple of 8, so, keccak-buffer contains integer number of qwords
  4408.                if lanes_used >= total_lanes then
  4409.                   keccak_feed(lanes_lo, lanes_hi, "\0\0\0\0\0\0\0\0", 0, 8, 8)
  4410.                   lanes_used = 0
  4411.                end
  4412.                qwords_qty = floor(math_min(qwords_qty, total_lanes - lanes_used))
  4413.                if hi_factor_keccak ~= 0 then
  4414.                   for j = 1, qwords_qty do
  4415.                      qwords[j] = HEX64(lanes_lo[lanes_used + j - 1 + lanes_index_base])
  4416.                   end
  4417.                else
  4418.                   for j = 1, qwords_qty do
  4419.                      qwords[j] = HEX(lanes_hi[lanes_used + j])..HEX(lanes_lo[lanes_used + j])
  4420.                   end
  4421.                end
  4422.                lanes_used = lanes_used + qwords_qty
  4423.                return
  4424.                   gsub(table_concat(qwords, "", 1, qwords_qty), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"),
  4425.                   qwords_qty * 8
  4426.             end
  4427.  
  4428.             local parts = {}      -- digest parts
  4429.             local last_part, last_part_size = "", 0
  4430.  
  4431.             local function get_next_part_of_digest(bytes_needed)
  4432.                -- returns 'bytes_needed' bytes, for arbitrary integer 'bytes_needed'
  4433.                bytes_needed = bytes_needed or 1
  4434.                if bytes_needed <= last_part_size then
  4435.                   last_part_size = last_part_size - bytes_needed
  4436.                   local part_size_in_nibbles = bytes_needed * 2
  4437.                   local result = sub(last_part, 1, part_size_in_nibbles)
  4438.                   last_part = sub(last_part, part_size_in_nibbles + 1)
  4439.                   return result
  4440.                end
  4441.                local parts_qty = 0
  4442.                if last_part_size > 0 then
  4443.                   parts_qty = 1
  4444.                   parts[parts_qty] = last_part
  4445.                   bytes_needed = bytes_needed - last_part_size
  4446.                end
  4447.                -- repeats until the length is enough
  4448.                while bytes_needed >= 8 do
  4449.                   local next_part, next_part_size = get_next_qwords_of_digest(bytes_needed / 8)
  4450.                   parts_qty = parts_qty + 1
  4451.                   parts[parts_qty] = next_part
  4452.                   bytes_needed = bytes_needed - next_part_size
  4453.                end
  4454.                if bytes_needed > 0 then
  4455.                   last_part, last_part_size = get_next_qwords_of_digest(1)
  4456.                   parts_qty = parts_qty + 1
  4457.                   parts[parts_qty] = get_next_part_of_digest(bytes_needed)
  4458.                else
  4459.                   last_part, last_part_size = "", 0
  4460.                end
  4461.                return table_concat(parts, "", 1, parts_qty)
  4462.             end
  4463.  
  4464.             if digest_size_in_bytes < 0 then
  4465.                result = get_next_part_of_digest
  4466.             else
  4467.                result = get_next_part_of_digest(digest_size_in_bytes)
  4468.             end
  4469.          end
  4470.          return result
  4471.       end
  4472.    end
  4473.  
  4474.    if message then
  4475.       -- Actually perform calculations and return the SHA-3 digest of a message
  4476.       return partial(message)()
  4477.    else
  4478.       -- Return function for chunk-by-chunk loading
  4479.       -- User should feed every chunk of input data as single argument to this function and finally get SHA-3 digest by invoking this function without an argument
  4480.       return partial
  4481.    end
  4482. end
  4483.  
  4484.  
  -- Conversions between binary strings and their hexadecimal / Base64 text representations
  local hex_to_bin, bin_to_hex, bin_to_base64, base64_to_bin
  do
     function hex_to_bin(hex_string)
        -- Replaces every pair of hexadecimal digits with the byte it encodes
        -- (characters which are not part of a hex-digit pair are left untouched)
        return (gsub(hex_string, "%x%x",
           function (hh)
              return char(tonumber(hh, 16))
           end
        ))
     end

     function bin_to_hex(binary_string)
        -- Replaces every byte with its two-digit lowercase hexadecimal representation
        -- NOTE: "." is assumed to match every byte value here, including "\n" and "\0"
        return (gsub(binary_string, ".",
           function (c)
              return string_format("%02x", byte(c))
           end
        ))
     end

     -- Two-way lookup table:
     --    symbol (string) -> 6-bit code, both standard ("+", "/", "=") and alternative ("-", "_", ".") symbols are recognized
     --    6-bit code (number) -> symbol of the standard alphabet
     -- Code (-1) denotes the padding symbol
     local base64_symbols = {
        ['+'] = 62, ['-'] = 62,  [62] = '+',
        ['/'] = 63, ['_'] = 63,  [63] = '/',
        ['='] = -1, ['.'] = -1,  [-1] = '='
     }
     local symbol_index = 0
     for _, pair in ipairs{'AZ', 'az', '09'} do
        for ascii = byte(pair), byte(pair, 2) do
           local ch = char(ascii)
           base64_symbols[ch] = symbol_index
           base64_symbols[symbol_index] = ch
           symbol_index = symbol_index + 1
        end
     end

     function bin_to_base64(binary_string)
        -- Encodes a binary string using the standard Base64 alphabet, the result is always padded with "="
        local result = {}
        for pos = 1, #binary_string, 3 do
           -- an extra zero byte is appended to every group, so c2 is never nil,
           -- c3 is nil only for 1-byte group, c4 is nil for 1-byte and 2-byte groups
           local c1, c2, c3, c4 = byte(sub(binary_string, pos, pos + 2)..'\0', 1, -1)
           result[#result + 1] =
              base64_symbols[floor(c1 / 4)]
              ..base64_symbols[c1 % 4 * 16 + floor(c2 / 16)]
              ..base64_symbols[c3 and c2 % 16 * 4 + floor(c3 / 64) or -1]
              ..base64_symbols[c4 and c3 % 64 or -1]
        end
        return table_concat(result)
     end

     function base64_to_bin(base64_string)
        -- Decodes a Base64 string (standard or URL-safe alphabet, whitespaces are ignored)
        -- Padding at the end of the string is optional
        -- Raises an error if the string contains a character which does not belong to Base64 alphabet
        local symbols = gsub(base64_string, '%s+', '')
        -- restore the omitted padding, so that the last group of symbols is decoded too
        -- (a single leftover symbol holds only 6 bits, such symbol can not be decoded and is ignored)
        local missing_qty = (4 - #symbols % 4) % 4
        if missing_qty == 1 or missing_qty == 2 then
           symbols = symbols..sub('==', 1, missing_qty)
        end
        -- negative indices of 'result' hold the codes of the current group of 4 symbols,
        -- positive indices hold the decoded parts
        local result, chars_qty = {}, 3
        for pos, ch in gmatch(symbols, '()(.)') do
           local code = base64_symbols[ch]
           if not code then
              error(string_format("Invalid Base64 character %q", ch), 2)
           end
           if code < 0 then
              -- every padding symbol shortens the current group by one byte
              chars_qty = chars_qty - 1
              code = 0
           end
           local idx = pos % 4
           if idx > 0 then
              result[-idx] = code
           else
              if chars_qty > 0 then
                 local c1 = result[-1] * 4 + floor(result[-2] / 16)
                 local c2 = (result[-2] % 16) * 16 + floor(result[-3] / 4)
                 local c3 = (result[-3] % 4) * 64 + code
                 result[#result + 1] = sub(char(c1, c2, c3), 1, chars_qty)
              end
              -- padding inside one group must not affect the next groups
              chars_qty = 3
           end
        end
        return table_concat(result)
     end

  end
  4553.  
  4554.  
  4555. local block_size_for_HMAC  -- forward declaration: this table is indexed by hash function in hmac(); it will be initialized at the end of the module
  4556.  
  local function pad_and_xor(str, result_length, byte_for_xor)
     -- Returns 'str' with every byte XORed with 'byte_for_xor',
     -- extended on the right up to 'result_length' bytes with bytes equal to 'byte_for_xor'
     -- (that is the same as padding 'str' with zeroes before XORing)
     local function xor_single_char(c)
        return char(XOR_BYTE(byte(c), byte_for_xor))
     end
     local xored_str = gsub(str, ".", xor_single_char)   -- only the first value returned by gsub is taken
     local padding = string_rep(char(byte_for_xor), result_length - #str)
     return xored_str..padding
  end
  4564.  
  local function hmac(hash_func, key, message)
     -- hash_func: one of the hash functions registered in the table 'block_size_for_HMAC'
     -- key:       binary string, a key longer than the block size of the hash function is replaced with its digest
     -- message:   binary string (or nil for "chunk-by-chunk" input mode)
     local block_size = block_size_for_HMAC[hash_func]
     if not block_size then
        error("Unknown hash function", 2)
     end
     if #key > block_size then
        key = hex_to_bin(hash_func(key))
     end
     -- The inner hash is started right now in chunk-by-chunk mode: its first chunk is the key XORed with 0x36
     local feed_inner_hash = hash_func()(pad_and_xor(key, block_size, 0x36))
     local final_digest

     local function partial(message_part)
        if message_part then
           if final_digest then
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
           feed_inner_hash(message_part)
           return partial
        end
        -- Finalization: the outer hash is calculated only once, next invocations return the saved value
        if not final_digest then
           final_digest = hash_func(pad_and_xor(key, block_size, 0x5C)..hex_to_bin(feed_inner_hash()))
        end
        return final_digest
     end

     if not message then
        -- Return function for chunk-by-chunk loading of a message
        -- User should feed every chunk of the message as single argument to this function and finally get HMAC by invoking this function without an argument
        return partial
     end
     -- Actually perform calculations and return the HMAC of a message
     return partial(message)()
  end
  4598.  
  4599.  
  local function xor_blake2_salt(salt, letter, H_lo, H_hi)
     -- salt:   concatenation of "Salt"+"Personalization" fields
     -- letter: "s" for BLAKE2s family (32-bit words, salt up to 16 bytes), otherwise 64-bit words and salt up to 32 bytes
     -- H_lo, H_hi: (optional) arrays to be modified; when H_lo is omitted only the size of the salt is checked
     local is_blake2s = letter == "s"
     local max_size = is_blake2s and 16 or 32
     local salt_size = #salt
     if salt_size > max_size then
        error(string_format("For BLAKE2%s/BLAKE2%sp/BLAKE2X%s the 'salt' parameter length must not exceed %d bytes", letter, letter, letter, max_size), 2)
     end
     if not H_lo then
        return
     end

     local blake2_word_size = is_blake2s and 4 or 8
     local xor = is_blake2s and XOR or XORA5
     local offset = 0

     local function read_next_dword()
        -- takes the next 4 bytes of the salt as little-endian number, absent bytes are treated as zeroes
        offset = offset + 4
        local a, b, c, d = byte(salt, offset - 3, offset)
        return (((d or 0) * 256 + (c or 0)) * 256 + (b or 0)) * 256 + (a or 0)
     end

     -- the salt is XORed into the elements starting from index 5
     for j = 5, 4 + ceil(salt_size / blake2_word_size) do
        local low_dword, last_dword
        if is_blake2s then
           last_dword = read_next_dword()
        else
           low_dword = read_next_dword()
           last_dword = read_next_dword()
        end
        local word = last_dword
        if low_dword then
           word = last_dword * hi_factor + low_dword
        end
        H_lo[j] = xor(H_lo[j], word)
        if H_hi then
           H_hi[j] = xor(H_hi[j], last_dword)
        end
     end
  end
  4624.  
  4625. local function blake2s(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
           -- Calculates BLAKE2s digest
  4626.    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  4627.    -- key:      (optional) binary string up to 32 bytes, by default empty string
  4628.    -- salt:     (optional) binary string up to 16 bytes, by default empty string
  4629.    -- digest_size_in_bytes: (optional) integer from 1 to 32, by default 32
  4630.    -- The last two parameters "XOF_length" and "B2_offset" are for internal use only, user must omit them (or pass nil)
           -- Returns: digest as a string (2 characters per digest byte), or the function "partial" when message is nil
           -- When XOF_length is given without B2_offset, the finalizing call returns the array H instead of a string
  4631.    digest_size_in_bytes = digest_size_in_bytes or 32
  4632.    if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
  4633.       error("BLAKE2s digest length must be from 1 to 32 bytes", 2)
  4634.    end
  4635.    key = key or ""
  4636.    local key_length = #key
  4637.    if key_length > 32 then
  4638.       error("BLAKE2s key length must not exceed 32 bytes", 2)
  4639.    end
  4640.    salt = salt or ""
           -- tail = bytes of the input which are not compressed yet (it is set to nil after the finalization)
  4641.    local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
  4642.    if B2_offset then
              -- B2_offset is given: this form is used by blake2x(), which passes the number of the output block as B2_offset
  4643.       H[1] = XOR(H[1], digest_size_in_bytes)
  4644.       H[2] = XOR(H[2], 0x20)
  4645.       H[3] = XOR(H[3], B2_offset)
  4646.       H[4] = XOR(H[4], 0x20000000 + XOF_length)
  4647.    else
  4648.       H[1] = XOR(H[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
  4649.       if XOF_length then
  4650.          H[4] = XOR(H[4], XOF_length)
  4651.       end
  4652.    end
  4653.    if salt ~= "" then
  4654.       xor_blake2_salt(salt, "s", H)
  4655.    end
  4656.  
  4657.    local function partial(message_part)
  4658.       if message_part then
  4659.          if tail then
  4660.             local offs = 0
                    -- the pending tail is completed up to 64 bytes and compressed only if some input remains after that
  4661.             if tail ~= "" and #tail + #message_part > 64 then
  4662.                offs = 64 - #tail
  4663.                bytes_compressed = blake2s_feed_64(H, tail..sub(message_part, 1, offs), 0, 64, bytes_compressed)
  4664.                tail = ""
  4665.             end
  4666.             local size = #message_part - offs
                    -- from 1 to 64 last bytes of non-empty input are always left in the tail, they are compressed by the finalizing call
  4667.             local size_tail = size > 0 and (size - 1) % 64 + 1 or 0
  4668.             bytes_compressed = blake2s_feed_64(H, message_part, offs, size - size_tail, bytes_compressed)
  4669.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4670.             return partial
  4671.          else
  4672.             error("Adding more chunks is not allowed after receiving the result", 2)
  4673.          end
  4674.       else
                 -- finalization is performed only once, next invocations return the saved H
  4675.          if tail then
  4676.             if B2_offset then
                       -- NOTE(review): nil is passed instead of a message; presumably blake2s_feed_64 then takes the block from common_W_blake2s filled by blake2x() - confirm
  4677.                blake2s_feed_64(H, nil, 0, 64, 0, 32)
  4678.             else
                       -- the last block is padded with zeroes up to 64 bytes, its actual size is passed as the last argument
  4679.                blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail)
  4680.             end
  4681.             tail = nil
  4682.             if not XOF_length or B2_offset then
  4683.                local max_reg = ceil(digest_size_in_bytes / 4)
  4684.                for j = 1, max_reg do
  4685.                   H[j] = HEX(H[j])
  4686.                end
                       -- reverse the order of 2-character pairs inside every group of 8 characters, then cut the string to the requested length
  4687.                H = sub(gsub(table_concat(H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
  4688.             end
  4689.          end
  4690.          return H
  4691.       end
  4692.    end
  4693.  
           -- the key (padded with zeroes to 64 bytes) is fed before any message data
  4694.    if key_length > 0 then
  4695.       partial(key..string_rep("\0", 64 - key_length))
  4696.    end
  4697.    if B2_offset then
              -- no message is expected in this mode, the result is returned immediately
  4698.       return partial()
  4699.    elseif message then
  4700.       -- Actually perform calculations and return the BLAKE2s digest of a message
  4701.       return partial(message)()
  4702.    else
  4703.       -- Return function for chunk-by-chunk loading
  4704.       -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2s digest by invoking this function without an argument
  4705.       return partial
  4706.    end
  4707. end
  4708.  
  4709. local function blake2b(message, key, salt, digest_size_in_bytes, XOF_length, B2_offset)
           -- Calculates BLAKE2b digest
  4710.    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  4711.    -- key:      (optional) binary string up to 64 bytes, by default empty string
  4712.    -- salt:     (optional) binary string up to 32 bytes, by default empty string
  4713.    -- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
  4714.    -- The last two parameters "XOF_length" and "B2_offset" are for internal use only, user must omit them (or pass nil)
           -- Returns: digest as a string (2 characters per digest byte), or the function "partial" when message is nil
           -- When XOF_length is given without B2_offset, the finalizing call returns the array H_lo instead of a string
  4715.    digest_size_in_bytes = floor(digest_size_in_bytes or 64)
  4716.    if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
  4717.       error("BLAKE2b digest length must be from 1 to 64 bytes", 2)
  4718.    end
  4719.    key = key or ""
  4720.    local key_length = #key
  4721.    if key_length > 64 then
  4722.       error("BLAKE2b key length must not exceed 64 bytes", 2)
  4723.    end
  4724.    salt = salt or ""
           -- H_hi is an array only when HEX64 is not available, otherwise H_hi is false and H_lo alone holds the state
           -- tail = bytes of the input which are not compressed yet (it is set to nil after the finalization)
  4725.    local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
  4726.    if B2_offset then
              -- B2_offset is given: this form is used by blake2x(), which passes the number of the output block as B2_offset
  4727.       if H_hi then
  4728.          H_lo[1] = XORA5(H_lo[1], digest_size_in_bytes)
  4729.          H_hi[1] = XORA5(H_hi[1], 0x40)
  4730.          H_lo[2] = XORA5(H_lo[2], B2_offset)
  4731.          H_hi[2] = XORA5(H_hi[2], XOF_length)
  4732.       else
  4733.          H_lo[1] = XORA5(H_lo[1], 0x40 * hi_factor + digest_size_in_bytes)
  4734.          H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor + B2_offset)
  4735.       end
  4736.       H_lo[3] = XORA5(H_lo[3], 0x4000)
  4737.    else
  4738.       H_lo[1] = XORA5(H_lo[1], 0x01010000 + key_length * 256 + digest_size_in_bytes)
  4739.       if XOF_length then
  4740.          if H_hi then
  4741.             H_hi[2] = XORA5(H_hi[2], XOF_length)
  4742.          else
  4743.             H_lo[2] = XORA5(H_lo[2], XOF_length * hi_factor)
  4744.          end
  4745.       end
  4746.    end
  4747.    if salt ~= "" then
  4748.       xor_blake2_salt(salt, "b", H_lo, H_hi)
  4749.    end
  4750.  
  4751.    local function partial(message_part)
  4752.       if message_part then
  4753.          if tail then
  4754.             local offs = 0
                    -- the pending tail is completed up to 128 bytes and compressed only if some input remains after that
  4755.             if tail ~= "" and #tail + #message_part > 128 then
  4756.                offs = 128 - #tail
  4757.                bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail..sub(message_part, 1, offs), 0, 128, bytes_compressed)
  4758.                tail = ""
  4759.             end
  4760.             local size = #message_part - offs
                    -- from 1 to 128 last bytes of non-empty input are always left in the tail, they are compressed by the finalizing call
  4761.             local size_tail = size > 0 and (size - 1) % 128 + 1 or 0
  4762.             bytes_compressed = blake2b_feed_128(H_lo, H_hi, message_part, offs, size - size_tail, bytes_compressed)
  4763.             tail = tail..sub(message_part, #message_part + 1 - size_tail)
  4764.             return partial
  4765.          else
  4766.             error("Adding more chunks is not allowed after receiving the result", 2)
  4767.          end
  4768.       else
                 -- finalization is performed only once, next invocations return the saved H_lo
  4769.          if tail then
  4770.             if B2_offset then
                       -- NOTE(review): nil is passed instead of a message; presumably blake2b_feed_128 then takes the block from common_W_blake2b filled by blake2x() - confirm
  4771.                blake2b_feed_128(H_lo, H_hi, nil, 0, 128, 0, 64)
  4772.             else
                       -- the last block is padded with zeroes up to 128 bytes, its actual size is passed as the last argument
  4773.                blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail)
  4774.             end
  4775.             tail = nil
  4776.             if XOF_length and not B2_offset then
                       -- the state is returned as an array instead of a string
                       -- when H_hi exists, both arrays are interleaved into 16 elements of H_lo: H_lo[2*j-1] = old H_lo[j], H_lo[2*j] = H_hi[j]
                       -- (the loop goes downwards, so every H_lo[j] is read before it is overwritten)
  4777.                if H_hi then
  4778.                   for j = 8, 1, -1 do
  4779.                      H_lo[j*2] = H_hi[j]
  4780.                      H_lo[j*2-1] = H_lo[j]
  4781.                   end
  4782.                   return H_lo, 16
  4783.                end
  4784.             else
  4785.                local max_reg = ceil(digest_size_in_bytes / 8)
  4786.                if H_hi then
  4787.                   for j = 1, max_reg do
  4788.                      H_lo[j] = HEX(H_hi[j])..HEX(H_lo[j])
  4789.                   end
  4790.                else
  4791.                   for j = 1, max_reg do
  4792.                      H_lo[j] = HEX64(H_lo[j])
  4793.                   end
  4794.                end
                       -- reverse the order of 2-character pairs inside every group of 16 characters, then cut the string to the requested length
  4795.                H_lo = sub(gsub(table_concat(H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
  4796.             end
  4797.             H_hi = nil
  4798.          end
  4799.          return H_lo
  4800.       end
  4801.    end
  4802.  
           -- the key (padded with zeroes to 128 bytes) is fed before any message data
  4803.    if key_length > 0 then
  4804.       partial(key..string_rep("\0", 128 - key_length))
  4805.    end
  4806.    if B2_offset then
              -- no message is expected in this mode, the result is returned immediately
  4807.       return partial()
  4808.    elseif message then
  4809.       -- Actually perform calculations and return the BLAKE2b digest of a message
  4810.       return partial(message)()
  4811.    else
  4812.       -- Return function for chunk-by-chunk loading
  4813.       -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2b digest by invoking this function without an argument
  4814.       return partial
  4815.    end
  4816. end
  4817.  
  4818. local function blake2sp(message, key, salt, digest_size_in_bytes)
           -- Calculates BLAKE2sp digest: the input is distributed between 8 instances, their results are combined by one more (root) instance
  4819.    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  4820.    -- key:      (optional) binary string up to 32 bytes, by default empty string
  4821.    -- salt:     (optional) binary string up to 16 bytes, by default empty string
  4822.    -- digest_size_in_bytes: (optional) integer from 1 to 32, by default 32
           -- Returns: digest as a string (2 characters per digest byte), or the function "partial" when message is nil
  4823.    digest_size_in_bytes = digest_size_in_bytes or 32
  4824.    if digest_size_in_bytes < 1 or digest_size_in_bytes > 32 then
  4825.       error("BLAKE2sp digest length must be from 1 to 32 bytes", 2)
  4826.    end
  4827.    key = key or ""
  4828.    local key_length = #key
  4829.    if key_length > 32 then
  4830.       error("BLAKE2sp key length must not exceed 32 bytes", 2)
  4831.    end
  4832.    salt = salt or ""
           -- length = total number of bytes received so far; instances is set to nil after the finalization
  4833.    local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02080000 + key_length * 256 + digest_size_in_bytes
           -- every instance is an array {bytes_compressed, tail, H}; instances differ by the value (j-1) XORed into H[3]
  4834.    for j = 1, 8 do
  4835.       local bytes_compressed, tail, H = 0.0, "", {unpack(sha2_H_hi)}
  4836.       instances[j] = {bytes_compressed, tail, H}
  4837.       H[1] = XOR(H[1], first_dword_of_parameter_block)
  4838.       H[3] = XOR(H[3], j-1)
  4839.       H[4] = XOR(H[4], 0x20000000)
  4840.       if salt ~= "" then
  4841.          xor_blake2_salt(salt, "s", H)
  4842.       end
  4843.    end
  4844.  
  4845.    local function partial(message_part)
  4846.       if message_part then
  4847.          if instances then
  4848.             local from = 0
  4849.             while true do
                       -- take the next piece of input: it ends at the boundary of 64-byte block of the whole input or at the end of this chunk
  4850.                local to = math_min(from + 64 - length % 64, #message_part)
  4851.                if to > from then
                          -- 64-byte blocks of the whole input are assigned to the instances in cyclic order
  4852.                   local inst = instances[floor(length / 64) % 8 + 1]
  4853.                   local part = sub(message_part, from + 1, to)
  4854.                   length, from = length + to - from, to
  4855.                   local bytes_compressed, tail = inst[1], inst[2]
  4856.                   if #tail < 64 then
  4857.                      tail = tail..part
  4858.                   else
                             -- a full block is compressed only when more data for the same instance arrives,
                             -- so the last block of every instance is left for the finalizing call
  4859.                      local H = inst[3]
  4860.                      bytes_compressed = blake2s_feed_64(H, tail, 0, 64, bytes_compressed)
  4861.                      tail = part
  4862.                   end
  4863.                   inst[1], inst[2] = bytes_compressed, tail
  4864.                else
  4865.                   break
  4866.                end
  4867.             end
  4868.             return partial
  4869.          else
  4870.             error("Adding more chunks is not allowed after receiving the result", 2)
  4871.          end
  4872.       else
                 -- finalization is performed only once, next invocations return the saved result
  4873.          if instances then
  4874.             local root_H = {unpack(sha2_H_hi)}
  4875.             root_H[1] = XOR(root_H[1], first_dword_of_parameter_block)
  4876.             root_H[4] = XOR(root_H[4], 0x20010000)
  4877.             if salt ~= "" then
  4878.                xor_blake2_salt(salt, "s", root_H)
  4879.             end
  4880.             for j = 1, 8 do
  4881.                local inst = instances[j]
  4882.                local bytes_compressed, tail, H = inst[1], inst[2], inst[3]
                       -- compress the last block of the instance (padded with zeroes); the last argument is true only for the 8th instance
  4883.                blake2s_feed_64(H, tail..string_rep("\0", 64 - #tail), 0, 64, bytes_compressed, #tail, j == 8)
  4884.                if j % 2 == 0 then
                          -- H arrays of two finalized instances (2 * 8 elements) are copied into common_W_blake2s,
                          -- then the root instance is fed with nil instead of a message
                          -- NOTE(review): presumably blake2s_feed_64 takes the block from common_W_blake2s in this case - confirm
  4885.                   local index = 0
  4886.                   for k = j - 1, j do
  4887.                      local inst = instances[k]
  4888.                      local H = inst[3]
  4889.                      for i = 1, 8 do
  4890.                         index = index + 1
  4891.                         common_W_blake2s[index] = H[i]
  4892.                      end
  4893.                   end
  4894.                   blake2s_feed_64(root_H, nil, 0, 64, 64 * (j/2 - 1), j == 8 and 64, j == 8)
  4895.                end
  4896.             end
  4897.             instances = nil
  4898.             local max_reg = ceil(digest_size_in_bytes / 4)
  4899.             for j = 1, max_reg do
  4900.                root_H[j] = HEX(root_H[j])
  4901.             end
                    -- reverse the order of 2-character pairs inside every group of 8 characters, then cut the string to the requested length
  4902.             result = sub(gsub(table_concat(root_H, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, digest_size_in_bytes * 2)
  4903.          end
  4904.          return result
  4905.       end
  4906.    end
  4907.  
           -- the key padded with zeroes to 64 bytes is fed 8 times, so every instance receives it as its first block
  4908.    if key_length > 0 then
  4909.       key = key..string_rep("\0", 64 - key_length)
  4910.       for j = 1, 8 do
  4911.          partial(key)
  4912.       end
  4913.    end
  4914.    if message then
  4915.       -- Actually perform calculations and return the BLAKE2sp digest of a message
  4916.       return partial(message)()
  4917.    else
  4918.       -- Return function for chunk-by-chunk loading
  4919.       -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2sp digest by invoking this function without an argument
  4920.       return partial
  4921.    end
  4922.  
  4923. end
  4924.  
  4925. local function blake2bp(message, key, salt, digest_size_in_bytes)
           -- Calculates BLAKE2bp digest: the input is distributed between 4 instances, their results are combined by one more (root) instance
  4926.    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  4927.    -- key:      (optional) binary string up to 64 bytes, by default empty string
  4928.    -- salt:     (optional) binary string up to 32 bytes, by default empty string
  4929.    -- digest_size_in_bytes: (optional) integer from 1 to 64, by default 64
           -- Returns: digest as a string (2 characters per digest byte), or the function "partial" when message is nil
  4930.    digest_size_in_bytes = digest_size_in_bytes or 64
  4931.    if digest_size_in_bytes < 1 or digest_size_in_bytes > 64 then
  4932.       error("BLAKE2bp digest length must be from 1 to 64 bytes", 2)
  4933.    end
  4934.    key = key or ""
  4935.    local key_length = #key
  4936.    if key_length > 64 then
  4937.       error("BLAKE2bp key length must not exceed 64 bytes", 2)
  4938.    end
  4939.    salt = salt or ""
           -- length = total number of bytes received so far; instances is set to nil after the finalization
  4940.    local instances, length, first_dword_of_parameter_block, result = {}, 0.0, 0x02040000 + key_length * 256 + digest_size_in_bytes
           -- every instance is an array {bytes_compressed, tail, H_lo, H_hi}; instances differ by the value (j-1) XORed into H_lo[2]
           -- H_hi is an array only when HEX64 is not available, otherwise it is false
  4941.    for j = 1, 4 do
  4942.       local bytes_compressed, tail, H_lo, H_hi = 0.0, "", {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
  4943.       instances[j] = {bytes_compressed, tail, H_lo, H_hi}
  4944.       H_lo[1] = XORA5(H_lo[1], first_dword_of_parameter_block)
  4945.       H_lo[2] = XORA5(H_lo[2], j-1)
  4946.       H_lo[3] = XORA5(H_lo[3], 0x4000)
  4947.       if salt ~= "" then
  4948.          xor_blake2_salt(salt, "b", H_lo, H_hi)
  4949.       end
  4950.    end
  4951.  
  4952.    local function partial(message_part)
  4953.       if message_part then
  4954.          if instances then
  4955.             local from = 0
  4956.             while true do
                       -- take the next piece of input: it ends at the boundary of 128-byte block of the whole input or at the end of this chunk
  4957.                local to = math_min(from + 128 - length % 128, #message_part)
  4958.                if to > from then
                          -- 128-byte blocks of the whole input are assigned to the instances in cyclic order
  4959.                   local inst = instances[floor(length / 128) % 4 + 1]
  4960.                   local part = sub(message_part, from + 1, to)
  4961.                   length, from = length + to - from, to
  4962.                   local bytes_compressed, tail = inst[1], inst[2]
  4963.                   if #tail < 128 then
  4964.                      tail = tail..part
  4965.                   else
                             -- a full block is compressed only when more data for the same instance arrives,
                             -- so the last block of every instance is left for the finalizing call
  4966.                      local H_lo, H_hi = inst[3], inst[4]
  4967.                      bytes_compressed = blake2b_feed_128(H_lo, H_hi, tail, 0, 128, bytes_compressed)
  4968.                      tail = part
  4969.                   end
  4970.                   inst[1], inst[2] = bytes_compressed, tail
  4971.                else
  4972.                   break
  4973.                end
  4974.             end
  4975.             return partial
  4976.          else
  4977.             error("Adding more chunks is not allowed after receiving the result", 2)
  4978.          end
  4979.       else
                 -- finalization is performed only once, next invocations return the saved result
  4980.          if instances then
  4981.             local root_H_lo, root_H_hi = {unpack(sha2_H_lo)}, not HEX64 and {unpack(sha2_H_hi)}
  4982.             root_H_lo[1] = XORA5(root_H_lo[1], first_dword_of_parameter_block)
  4983.             root_H_lo[3] = XORA5(root_H_lo[3], 0x4001)
  4984.             if salt ~= "" then
  4985.                xor_blake2_salt(salt, "b", root_H_lo, root_H_hi)
  4986.             end
  4987.             for j = 1, 4 do
  4988.                local inst = instances[j]
  4989.                local bytes_compressed, tail, H_lo, H_hi = inst[1], inst[2], inst[3], inst[4]
                       -- compress the last block of the instance (padded with zeroes); the last argument is true only for the 4th instance
  4990.                blake2b_feed_128(H_lo, H_hi, tail..string_rep("\0", 128 - #tail), 0, 128, bytes_compressed, #tail, j == 4)
  4991.                if j % 2 == 0 then
                          -- the states of two finalized instances are copied into common_W_blake2b
                          -- (H_lo[i] is followed by H_hi[i] when H_hi exists), then the root instance is fed with nil instead of a message
                          -- NOTE(review): presumably blake2b_feed_128 takes the block from common_W_blake2b in this case - confirm
  4992.                   local index = 0
  4993.                   for k = j - 1, j do
  4994.                      local inst = instances[k]
  4995.                      local H_lo, H_hi = inst[3], inst[4]
  4996.                      for i = 1, 8 do
  4997.                         index = index + 1
  4998.                         common_W_blake2b[index] = H_lo[i]
  4999.                         if H_hi then
  5000.                            index = index + 1
  5001.                            common_W_blake2b[index] = H_hi[i]
  5002.                         end
  5003.                      end
  5004.                   end
  5005.                   blake2b_feed_128(root_H_lo, root_H_hi, nil, 0, 128, 128 * (j/2 - 1), j == 4 and 128, j == 4)
  5006.                end
  5007.             end
  5008.             instances = nil
  5009.             local max_reg = ceil(digest_size_in_bytes / 8)
  5010.             if HEX64 then
  5011.                for j = 1, max_reg do
  5012.                   root_H_lo[j] = HEX64(root_H_lo[j])
  5013.                end
  5014.             else
  5015.                for j = 1, max_reg do
  5016.                   root_H_lo[j] = HEX(root_H_hi[j])..HEX(root_H_lo[j])
  5017.                end
  5018.             end
                    -- reverse the order of 2-character pairs inside every group of 16 characters, then cut the string to the requested length
  5019.             result = sub(gsub(table_concat(root_H_lo, "", 1, max_reg), "(..)(..)(..)(..)(..)(..)(..)(..)", "%8%7%6%5%4%3%2%1"), 1, digest_size_in_bytes * 2)
  5020.          end
  5021.          return result
  5022.       end
  5023.    end
  5024.  
           -- the key padded with zeroes to 128 bytes is fed 4 times, so every instance receives it as its first block
  5025.    if key_length > 0 then
  5026.       key = key..string_rep("\0", 128 - key_length)
  5027.       for j = 1, 4 do
  5028.          partial(key)
  5029.       end
  5030.    end
  5031.    if message then
  5032.       -- Actually perform calculations and return the BLAKE2bp digest of a message
  5033.       return partial(message)()
  5034.    else
  5035.       -- Return function for chunk-by-chunk loading
  5036.       -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2bp digest by invoking this function without an argument
  5037.       return partial
  5038.    end
  5039.  
  5040. end
  5041.  
  5042. local function blake2x(inner_func, inner_func_letter, common_W_blake2, block_size, digest_size_in_bytes, message, key, salt)
           -- Common implementation of BLAKE2Xs and BLAKE2Xb (see the callers blake2xs and blake2xb)
           -- inner_func:           blake2s or blake2b
           -- inner_func_letter:    "s" or "b"
           -- common_W_blake2:      array which receives the input block for every output block (common_W_blake2s or common_W_blake2b)
           -- block_size:           size of one output block in bytes (32 or 64)
           -- digest_size_in_bytes: (-1) = infinite digest, other negative value = finite digest in "chunk-by-chunk" output mode
           -- message, key, salt:   the same as for inner_func
  5043.    local XOF_digest_length_limit, XOF_digest_length, chunk_by_chunk_output = 2^(block_size / 2) - 1
  5044.    if digest_size_in_bytes == -1 then  -- infinite digest
  5045.       digest_size_in_bytes = math_huge
  5046.       XOF_digest_length = floor(XOF_digest_length_limit)
  5047.       chunk_by_chunk_output = true
  5048.    else
  5049.       if digest_size_in_bytes < 0 then
  5050.          digest_size_in_bytes = -1.0 * digest_size_in_bytes
  5051.          chunk_by_chunk_output = true
  5052.       end
  5053.       XOF_digest_length = floor(digest_size_in_bytes)
  5054.       if XOF_digest_length >= XOF_digest_length_limit then
  5055.          error("Requested digest is too long.  BLAKE2X"..inner_func_letter.." finite digest is limited by (2^"..floor(block_size / 2)..")-2 bytes.  Hint: you can generate infinite digest.", 2)
  5056.       end
  5057.    end
  5058.    salt = salt or ""
  5059.    if salt ~= "" then
  5060.       xor_blake2_salt(salt, inner_func_letter)  -- don't xor, only check the size of salt
  5061.    end
           -- the message is hashed by inner_func in "chunk-by-chunk" input mode with XOF_length specified
  5062.    local inner_partial = inner_func(nil, key, salt, nil, XOF_digest_length)
  5063.    local result
  5064.  
  5065.    local function partial(message_part)
  5066.       if message_part then
  5067.          if inner_partial then
  5068.             inner_partial(message_part)
  5069.             return partial
  5070.          else
  5071.             error("Adding more chunks is not allowed after receiving the result", 2)
  5072.          end
  5073.       else
                 -- finalization is performed only once, next invocations return the saved result
  5074.          if inner_partial then
                    -- half_W = array returned by the finalizing call of the inner function
                    -- half_W_size = its second returned value (blake2b returns 16 when it has interleaved H_lo and H_hi), 8 by default
  5075.             local half_W, half_W_size = inner_partial()
                    -- inner_partial receives nil here (there is no second value on the right side), this marks the input as finished
  5076.             half_W_size, inner_partial = half_W_size or 8
  5077.  
  5078.             local function get_hash_block(block_no)
  5079.                -- block_no = 0...(2^32-1)
                       -- returns the output block number block_no (the last block of a finite digest may be shorter), or "" beyond the end of the digest
  5080.                local size = math_min(block_size, digest_size_in_bytes - block_no * block_size)
  5081.                if size <= 0 then
  5082.                   return ""
  5083.                end
                       -- the first half of common_W_blake2 is filled with half_W, the second half is filled with zeroes
  5084.                for j = 1, half_W_size do
  5085.                   common_W_blake2[j] = half_W[j]
  5086.                end
  5087.                for j = half_W_size + 1, 2 * half_W_size do
  5088.                   common_W_blake2[j] = 0
  5089.                end
                       -- block_no is passed as B2_offset, in this mode inner_func returns the result immediately
  5090.                return inner_func(nil, nil, salt, size, XOF_digest_length, floor(block_no))
  5091.             end
  5092.  
  5093.             local hash = {}
  5094.             if chunk_by_chunk_output then
                       -- pos = current position inside the digest in bytes, it wraps around after 'period' bytes
                       -- the most recently generated block is cached
  5095.                local pos, period, cached_block_no, cached_block = 0, block_size * 2^32
  5096.  
  5097.                local function get_next_part_of_digest(arg1, arg2)
  5098.                   if arg1 == "seek" then
  5099.                      -- Usage #1:  get_next_part_of_digest("seek", new_pos)
  5100.                      pos = arg2 % period
  5101.                   else
  5102.                      -- Usage #2:  hex_string = get_next_part_of_digest(size)
  5103.                      local size, index = arg1 or 1, 0
  5104.                      while size > 0 do
  5105.                         local block_offset = pos % block_size
  5106.                         local block_no = (pos - block_offset) / block_size
  5107.                         local part_size = math_min(size, block_size - block_offset)
  5108.                         if cached_block_no ~= block_no then
  5109.                            cached_block_no = block_no
  5110.                            cached_block = get_hash_block(block_no)
  5111.                         end
  5112.                         index = index + 1
                                -- every byte of the digest occupies 2 characters of the block string
  5113.                         hash[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
  5114.                         size = size - part_size
  5115.                         pos = (pos + part_size) % period
  5116.                      end
  5117.                      return table_concat(hash, "", 1, index)
  5118.                   end
  5119.                end
  5120.  
  5121.                result = get_next_part_of_digest
  5122.             else
                       -- the whole finite digest is generated at once as a concatenation of all its blocks
  5123.                for j = 1.0, ceil(digest_size_in_bytes / block_size) do
  5124.                   hash[j] = get_hash_block(j - 1.0)
  5125.                end
  5126.                result = table_concat(hash)
  5127.             end
  5128.          end
  5129.          return result
  5130.       end
  5131.    end
  5132.  
  5133.    if message then
  5134.       -- Actually perform calculations and return the BLAKE2X digest of a message
  5135.       return partial(message)()
  5136.    else
  5137.       -- Return function for chunk-by-chunk loading
  5138.       -- User should feed every chunk of input data as single argument to this function and finally get BLAKE2X digest by invoking this function without an argument
  5139.       return partial
  5140.    end
  5141. end
  5142.  
  5143. local function blake2xs(digest_size_in_bytes, message, key, salt)
           -- BLAKE2Xs: invokes blake2x with blake2s as the inner function, the array common_W_blake2s and block size 32
  5144.    -- digest_size_in_bytes:
  5145.    --    0..65534       = get finite digest as single Lua string
  5146.    --    (-1)           = get infinite digest in "chunk-by-chunk" output mode
  5147.    --    (-2)..(-65534) = get finite digest in "chunk-by-chunk" output mode
  5148.    -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
  5149.    -- key:      (optional) binary string up to 32 bytes, by default empty string
  5150.    -- salt:     (optional) binary string up to 16 bytes, by default empty string
  5151.    return blake2x(blake2s, "s", common_W_blake2s, 32, digest_size_in_bytes, message, key, salt)
  5152. end
  5153.  
  local function blake2xb(digest_size_in_bytes, message, key, salt)
     -- BLAKE2Xb entry point: forwards to the shared blake2x() routine, configured with
     -- the blake2b function, the letter "b", the common_W_blake2b array and the constant 64.
     --
     -- digest_size_in_bytes selects both the output length and the output mode:
     --    0..4294967294       -> finite digest, returned as one Lua string
     --    (-1)                -> infinite digest, "chunk-by-chunk" output mode
     --    (-2)..(-4294967294) -> finite digest, "chunk-by-chunk" output mode
     -- message:  binary string to hash; pass nil to switch to "chunk-by-chunk" input mode
     -- key:      optional binary string of at most 64 bytes (empty string when omitted)
     -- salt:     optional binary string of at most 32 bytes (empty string when omitted)
     --
     -- NOTE(review): 64 is presumably the "block_size" used inside blake2x -- confirm against its signature.
     local xof_block_size = 64
     return blake2x(blake2b, "b", common_W_blake2b, xof_block_size, digest_size_in_bytes, message, key, salt)
  end
  5164.  
  5165.  
  local function blake3(message, key, digest_size_in_bytes, message_flags, K, return_array)
     -- Calculates the BLAKE3 digest of a message (optionally keyed), or returns a feeder function for "chunk-by-chunk" input.
     -- message:  binary string to be hashed (or nil for "chunk-by-chunk" input mode)
     -- key:      (optional) binary string up to 32 bytes, by default empty string
     --           (a non-empty key shorter than 32 bytes is right-padded with zero bytes)
     -- digest_size_in_bytes: (optional) by default 32
     --    0,1,2,3,4,...  = get finite digest as single Lua string
     --    (-1)           = get infinite digest in "chunk-by-chunk" output mode
     --    -2,-3,-4,...   = get finite digest in "chunk-by-chunk" output mode
     -- The last three parameters "message_flags", "K" and "return_array" are for internal use only, user must omit them (or pass nil)
     --    message_flags: number added to the flags of the compression calls (default 0)
     --    K:             table of 8 words used as key words when "key" is empty (default sha2_H_hi)
     --    return_array:  when true, an output block is returned as the raw "stack" table instead of a string
     -- Returns:
     --    message given:  the result of finalization (digest string built with 2 characters per byte;
     --                    or the function get_next_part_of_digest in "chunk-by-chunk" output mode)
     --    message is nil: the function "partial" (call it with each chunk, then without arguments to finalize)
     -- Raises an error when the key is longer than 32 bytes, or when a chunk is fed after the result was received.
     key = key or ""
     digest_size_in_bytes = digest_size_in_bytes or 32
     message_flags = message_flags or 0
     if key == "" then
        -- unkeyed mode: use the caller-supplied words, otherwise the module-level table sha2_H_hi
        K = K or sha2_H_hi
     else
        local key_length = #key
        if key_length > 32 then
           error("BLAKE3 key length must not exceed 32 bytes", 2)
        end
        key = key..string_rep("\0", 32 - key_length)
        -- convert the 32 key bytes into eight 32-bit words (first byte of each group is the least significant)
        K = {}
        for j = 1, 8 do
           local a, b, c, d = byte(key, 4*j-3, 4*j)
           K[j] = ((d * 256 + c) * 256 + b) * 256 + a
        end
        message_flags = message_flags + 16  -- flag:KEYED_HASH
     end
     -- State shared by the closures below:
     --    tail             input bytes not yet compressed (1..64 bytes once any data arrived); set to nil after finalization
     --    H                8-word table passed to blake3_feed_64 as its output and read back from H[1..8]
     --    chunk_index      number of completed chunks (a chunk is 16 blocks of 64 bytes)
     --    blocks_in_chunk  number of blocks already compressed in the current chunk (0..15)
     --    stack            8 words per pending subtree; later reused as output buffer (see get_hash_block)
     --    stack_size       number of words currently on the stack (multiple of 8)
     local tail, H, chunk_index, blocks_in_chunk, stack_size, stack = "", {}, 0, 0, 0, {}
     -- final_H_in is the input state the very last block must be compressed with; only it is initialized here (to K)
     local final_H_in, final_block_length, chunk_by_chunk_output, result, wide_output = K
     -- flags for the very last block; the initial value covers a message that fits in a single block
     local final_compression_flags = 3      -- flags:CHUNK_START,CHUNK_END

     -- Compresses "size" bytes of "str" starting at offset "offs" and maintains the chunk/subtree bookkeeping.
     -- The caller ("partial") always keeps the last 1..64 bytes of input in "tail", so the block that ends the message never comes through here.
     local function feed_blocks(str, offs, size)
        -- size >= 0, size is multiple of 64
        while size > 0 do
           local part_size_in_blocks, block_flags, H_in = 1, 0, H
           if blocks_in_chunk == 0 then
              -- first block of a chunk: start from K instead of the running state
              block_flags = 1               -- flag:CHUNK_START
              -- if the message ends inside this chunk, its last block continues from H and only ends the chunk
              H_in, final_H_in = K, H
              final_compression_flags = 2   -- flag:CHUNK_END
           elseif blocks_in_chunk == 15 then
              -- sixteenth block closes the chunk
              block_flags = 2               -- flag:CHUNK_END
              -- if the message ends with the first block of the next chunk, that block both starts and ends a chunk
              final_compression_flags = 3   -- flags:CHUNK_START,CHUNK_END
              final_H_in = K
           else
              -- middle blocks: hand over as many as available in one call, but stop before the sixteenth block
              part_size_in_blocks = math_min(size / 64, 15 - blocks_in_chunk)
           end
           local part_size = part_size_in_blocks * 64
           -- NOTE(review): argument roles of blake3_feed_64 (str, offs, size, flags, chunk_index, H_in, H_out, ...) are inferred from the names passed here -- confirm at its definition
           blake3_feed_64(str, offs, part_size, message_flags + block_flags, chunk_index, H_in, H)
           offs, size = offs + part_size, size - part_size
           blocks_in_chunk = (blocks_in_chunk + part_size_in_blocks) % 16
           if blocks_in_chunk == 0 then
              -- completing the current chunk
              chunk_index = chunk_index + 1.0
              -- for every power of two dividing the new chunk count, pop one subtree (8 words) from the stack
              -- and merge it with H: popped words go to W[1..8], H goes to W[9..16], result is written back to H
              local divider = 2.0
              while chunk_index % divider == 0 do
                 divider = divider * 2.0
                 stack_size = stack_size - 8
                 for j = 1, 8 do
                    common_W_blake2s[j] = stack[stack_size + j]
                 end
                 for j = 1, 8 do
                    common_W_blake2s[j + 8] = H[j]
                 end
                 blake3_feed_64(nil, 0, 64, message_flags + 4, 0, K, H)  -- flag:PARENT
              end
              -- push the (possibly merged) subtree value
              for j = 1, 8 do
                 stack[stack_size + j] = H[j]
              end
              stack_size = stack_size + 8
           end
        end
     end

     -- Produces output block number "block_no" (64 bytes per block, fewer for the last block of a finite digest).
     -- Returns "" when the block lies outside the digest, the "stack" table when return_array is set,
     -- otherwise a string with 2 characters per output byte.
     -- Only valid after finalization in "partial" has prepared final_compression_flags, final_H_in and the message words.
     local function get_hash_block(block_no)
        local size = math_min(64, digest_size_in_bytes - block_no * 64)
        if block_no < 0 or size <= 0 then
           return ""
        end
        if chunk_by_chunk_output then
           -- restore the 16 message words saved in stack[17..32] during finalization
           -- (presumably because common_W_blake2s is a shared array that may be overwritten between output requests -- confirm)
           for j = 1, 16 do
              common_W_blake2s[j] = stack[j + 16]
           end
        end
        -- here "stack" serves as the output table and block_no is passed where chunk_index is passed elsewhere
        blake3_feed_64(nil, 0, 64, final_compression_flags, block_no, final_H_in, stack, wide_output, final_block_length)
        if return_array then
           return stack
        end
        local max_reg = ceil(size / 4)
        for j = 1, max_reg do
           stack[j] = HEX(stack[j])
        end
        -- reverse the order of the four 2-character groups in every 8-character run, then cut to the requested size
        -- (assumes HEX yields 8 characters per word -- confirm at its definition)
        return sub(gsub(table_concat(stack, "", 1, max_reg), "(..)(..)(..)(..)", "%4%3%2%1"), 1, size * 2)
     end

     -- Feeder/finalizer.
     --    partial(message_part) consumes one more chunk of input and returns itself (so calls can be chained)
     --    partial()             finalizes on the first call and returns the result; later calls return the same result
     local function partial(message_part)
        if message_part then
           if tail then
              local offs = 0
              if tail ~= "" and #tail + #message_part > 64 then
                 -- the buffered bytes plus the head of the new chunk form a full block that is certainly not the last one
                 offs = 64 - #tail
                 feed_blocks(tail..sub(message_part, 1, offs), 0, 64)
                 tail = ""
              end
              local size = #message_part - offs
              -- keep the last 1..64 bytes unprocessed (0 only when nothing is left), because the final block needs special flags
              local size_tail = size > 0 and (size - 1) % 64 + 1 or 0
              feed_blocks(message_part, offs, size - size_tail)
              tail = tail..sub(message_part, #message_part + 1 - size_tail)
              return partial
           else
              error("Adding more chunks is not allowed after receiving the result", 2)
           end
        else
           if tail then
              -- first call without arguments: finalize
              final_block_length = #tail
              tail = tail..string_rep("\0", 64 - #tail)
              -- load the zero-padded last block into the 16 message words
              -- NOTE(review): the presence of common_W_blake2s[0] selects word assembly via OR/SHL instead of plain arithmetic;
              -- presumably it marks the implementation branch that needs bitwise-built words -- confirm where the array is defined
              if common_W_blake2s[0] then
                 for j = 1, 16 do
                    local a, b, c, d = byte(tail, 4*j-3, 4*j)
                    common_W_blake2s[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
                 end
              else
                 for j = 1, 16 do
                    local a, b, c, d = byte(tail, 4*j-3, 4*j)
                    common_W_blake2s[j] = ((d * 256 + c) * 256 + b) * 256 + a
                 end
              end
              tail = nil  -- marks the state as finalized
              -- Walk the subtree stack from top to bottom (the loop variable shadows the outer stack_size).
              -- Each iteration compresses the current node into H (first the last message block, afterwards parent nodes)
              -- and then prepares the next parent node: stack entry in W[1..8], H in W[9..16].
              -- The loop body does not run when the stack is empty; the node prepared last is compressed in get_hash_block.
              for stack_size = stack_size - 8, 0, -8 do
                 blake3_feed_64(nil, 0, 64, message_flags + final_compression_flags, chunk_index, final_H_in, H, nil, final_block_length)
                 chunk_index, final_block_length, final_H_in, final_compression_flags = 0, 64, K, 4  -- flag:PARENT
                 for j = 1, 8 do
                    common_W_blake2s[j] = stack[stack_size + j]
                 end
                 for j = 1, 8 do
                    common_W_blake2s[j + 8] = H[j]
                 end
              end
              -- from here on final_compression_flags already includes message_flags
              final_compression_flags = message_flags + final_compression_flags + 8  -- flag:ROOT
              if digest_size_in_bytes < 0 then
                 -- negative size requests "chunk-by-chunk" output mode
                 if digest_size_in_bytes == -1 then  -- infinite digest
                    digest_size_in_bytes = math_huge
                 else
                    digest_size_in_bytes = -1.0 * digest_size_in_bytes
                 end
                 chunk_by_chunk_output = true
                 -- save the 16 message words in stack[17..32]; get_hash_block restores them before every output block
                 for j = 1, 16 do
                    stack[j + 16] = common_W_blake2s[j]
                 end
              end
              -- cap the size at 2^53 (this also replaces math_huge of the infinite mode by a finite number)
              digest_size_in_bytes = math_min(2^53, digest_size_in_bytes)
              -- passed to blake3_feed_64 by get_hash_block; presumably requests all 16 output words instead of 8 -- confirm
              wide_output = digest_size_in_bytes > 32
              if chunk_by_chunk_output then
                 -- pos is the current read position in the digest, in bytes; one output block is cached between calls
                 local pos, cached_block_no, cached_block = 0.0

                 local function get_next_part_of_digest(arg1, arg2)
                    if arg1 == "seek" then
                       -- Usage #1:  get_next_part_of_digest("seek", new_pos)
                       pos = arg2 * 1.0
                    else
                       -- Usage #2:  hex_string = get_next_part_of_digest(size)
                       -- size defaults to 1 byte; the pieces are collected in stack[33], stack[34], ...
                       local size, index = arg1 or 1, 32
                       while size > 0 do
                          local block_offset = pos % 64
                          local block_no = (pos - block_offset) / 64
                          local part_size = math_min(size, 64 - block_offset)
                          if cached_block_no ~= block_no then
                             cached_block_no = block_no
                             cached_block = get_hash_block(block_no)
                          end
                          index = index + 1
                          -- 2 characters per byte; yields "" for positions outside the digest
                          stack[index] = sub(cached_block, block_offset * 2 + 1, (block_offset + part_size) * 2)
                          size = size - part_size
                          pos = pos + part_size
                       end
                       return table_concat(stack, "", 33, index)
                    end
                 end

                 result = get_next_part_of_digest
              elseif digest_size_in_bytes <= 64 then
                 -- the whole digest fits in one output block
                 result = get_hash_block(0)
              else
                 -- long finite digest: generate every output block, collecting the strings in stack[33], stack[34], ...
                 local last_block_no = ceil(digest_size_in_bytes / 64) - 1
                 for block_no = 0.0, last_block_no do
                    stack[33 + block_no] = get_hash_block(block_no)
                 end
                 result = table_concat(stack, "", 33, 33 + last_block_no)
              end
           end
           return result
        end
     end

     if message then
        -- Actually perform calculations and return the BLAKE3 digest of a message
        return partial(message)()
     else
        -- Return function for chunk-by-chunk loading
        -- User should feed every chunk of input data as single argument to this function and finally get BLAKE3 digest by invoking this function without an argument
        return partial
     end
  end
  5367.  
  local function blake3_derive_key(key_material, context_string, derived_key_size_in_bytes)
     -- BLAKE3 key derivation: two blake3() passes, the first over the context string, the second over the key material.
     -- key_material:   (string) source of entropy to derive the key from (for example, a master password);
     --                 pass nil to feed the key material in "chunk-by-chunk" input mode
     -- context_string: (string, mandatory) unique description of the derived key
     -- derived_key_size_in_bytes: (optional) by default 32
     --    0,1,2,3,4,...  = get finite derived key as single Lua string
     --    (-1)           = get infinite derived key in "chunk-by-chunk" output mode
     --    -2,-3,-4,...   = get finite derived key in "chunk-by-chunk" output mode
     -- Raises an error (reported at the caller's position) when context_string is not a string.
     if type(context_string) ~= "string" then
        error("'context_string' parameter must be a Lua string", 2)
     end
     local DERIVE_KEY_CONTEXT, DERIVE_KEY_MATERIAL = 32, 64  -- values for blake3's "message_flags" parameter
     -- Pass 1: hash the context string with default key and size, asking for the raw array (return_array = true)
     local context_key = blake3(context_string, nil, nil, DERIVE_KEY_CONTEXT, nil, true)
     -- Pass 2: hash the key material, handing the array from pass 1 to blake3 as its "K" parameter
     return blake3(key_material, nil, derived_key_size_in_bytes, DERIVE_KEY_MATERIAL, context_key)
  end
  5382.  
  5383.  
  5384.  
  -- Table of public functions exported by the module.
  -- Every wrapper below is a distinct function value (they are used as table keys elsewhere in the module).
  local sha = {

     -- MD5 and SHA-1
     md5  = md5,
     sha1 = sha1,

     -- SHA-2 hash functions: SHA-224, SHA-256
     sha224 = function (message)
        return sha256ext(224, message)
     end,
     sha256 = function (message)
        return sha256ext(256, message)
     end,

     -- SHA-2 hash functions: SHA-512/224, SHA-512/256, SHA-384, SHA-512
     sha512_224 = function (message)
        return sha512ext(224, message)
     end,
     sha512_256 = function (message)
        return sha512ext(256, message)
     end,
     sha384 = function (message)
        return sha512ext(384, message)
     end,
     sha512 = function (message)
        return sha512ext(512, message)
     end,

     -- SHA-3 hash functions: SHA3-224, SHA3-256, SHA3-384, SHA3-512
     -- keccak() receives (1600 - 2 * N) / 8 as its first argument and N / 8 as its second one
     sha3_224 = function (message)
        return keccak((1600 - 2 * 224) / 8, 224 / 8, false, message)
     end,
     sha3_256 = function (message)
        return keccak((1600 - 2 * 256) / 8, 256 / 8, false, message)
     end,
     sha3_384 = function (message)
        return keccak((1600 - 2 * 384) / 8, 384 / 8, false, message)
     end,
     sha3_512 = function (message)
        return keccak((1600 - 2 * 512) / 8, 512 / 8, false, message)
     end,

     -- SHAKE128, SHAKE256: the digest size is chosen by the caller and comes first
     shake128 = function (digest_size_in_bytes, message)
        return keccak((1600 - 2 * 128) / 8, digest_size_in_bytes, true, message)
     end,
     shake256 = function (digest_size_in_bytes, message)
        return keccak((1600 - 2 * 256) / 8, digest_size_in_bytes, true, message)
     end,

     -- HMAC(hash_func, key, message) is applicable to any hash function from this module except SHAKE* and BLAKE*
     hmac = hmac,

     -- misc utilities: conversions between binary strings and their hexadecimal / base64 representations
     hex_to_bin    = hex_to_bin,
     bin_to_hex    = bin_to_hex,
     base64_to_bin = base64_to_bin,
     bin_to_base64 = bin_to_base64,

     -- the same utilities under their old style names (kept for backward compatibility)
     hex2bin    = hex_to_bin,
     bin2hex    = bin_to_hex,
     base642bin = base64_to_bin,
     bin2base64 = bin_to_base64,

     -- BLAKE2 hash functions
     --    BLAKE2b, BLAKE2s, BLAKE2bp, BLAKE2sp take (message, key, salt, digest_size_in_bytes)
     --    BLAKE2Xb, BLAKE2Xs                   take (digest_size_in_bytes, message, key, salt)
     blake2b  = blake2b,
     blake2s  = blake2s,
     blake2bp = blake2bp,
     blake2sp = blake2sp,
     blake2xb = blake2xb,
     blake2xs = blake2xs,

     -- BLAKE2 aliases; the numeric suffix is the digest size in bits
     blake2 = blake2b,
     blake2b_160 = function (message, key, salt)
        return blake2b(message, key, salt, 20)
     end,
     blake2b_256 = function (message, key, salt)
        return blake2b(message, key, salt, 32)
     end,
     blake2b_384 = function (message, key, salt)
        return blake2b(message, key, salt, 48)
     end,
     blake2b_512 = blake2b,  -- no size passed here (64 bytes according to the original note)
     blake2s_128 = function (message, key, salt)
        return blake2s(message, key, salt, 16)
     end,
     blake2s_160 = function (message, key, salt)
        return blake2s(message, key, salt, 20)
     end,
     blake2s_224 = function (message, key, salt)
        return blake2s(message, key, salt, 28)
     end,
     blake2s_256 = blake2s,  -- no size passed here (32 bytes according to the original note)

     -- BLAKE3 hash function
     --    BLAKE3    (message, key, digest_size_in_bytes)
     --    BLAKE3_KDF(key_material, context_string, derived_key_size_in_bytes)
     blake3            = blake3,
     blake3_derive_key = blake3_derive_key,
  }
  5435.  
  5436.  
  -- Fill the table "block_size_for_HMAC": keys are the exported hash functions, values are the block sizes in bytes.
  -- Only the functions listed here get an entry.
  block_size_for_HMAC = {}
  do
     local hash_func_and_block_size = {
        { sha.md5,         64 },
        { sha.sha1,        64 },
        { sha.sha224,      64 },
        { sha.sha256,      64 },
        { sha.sha512_224, 128 },
        { sha.sha512_256, 128 },
        { sha.sha384,     128 },
        { sha.sha512,     128 },
        { sha.sha3_224,   144 },  -- (1600 - 2 * 224) / 8
        { sha.sha3_256,   136 },  -- (1600 - 2 * 256) / 8
        { sha.sha3_384,   104 },  -- (1600 - 2 * 384) / 8
        { sha.sha3_512,    72 },  -- (1600 - 2 * 512) / 8
     }
     for j = 1, 12 do
        local pair = hash_func_and_block_size[j]
        block_size_for_HMAC[pair[1]] = pair[2]
     end
  end


  -- The table of public functions is the value of the module
  return sha
Add Comment
Please, Sign In to add comment