IntMath.h

#pragma once

#include <stdint.h>
#include <stdbool.h>

#include "Machine.h"

#define NATIVE_INT128 (MACHINE_PTR64 && (__GNUC__ || __clang__))

#if NATIVE_INT128
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
#endif

// Field order assumes a little-endian target.
typedef union {
    struct {
        uint64_t Low;
        uint64_t High;
    };
#if NATIVE_INT128
    uint128_t Full;
#endif
} uint128_split;

typedef union {
    struct {
        uint64_t Low;
        int64_t High;
    };
#if NATIVE_INT128
    int128_t Full;
#endif
} int128_split;

#define cast_u64_u128(X) ((uint128_split){.Low = (uint64_t)(X)})
#define merge_u64_u128(X, Y) ((uint128_split){.Low = (uint64_t)(X), .High = (uint64_t)(Y)})

static inline uint128_split cast_i128_u128(int128_split X) {
    return (uint128_split){.Low = X.Low, .High = (uint64_t)X.High};
}
static inline int128_split cast_u128_i128(uint128_split X) {
    return (int128_split){.Low = X.Low, .High = (int64_t)X.High};
}
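
// Example: cast_u64_u128(5) represents 5 (Low = 5, High = 0), and
// merge_u64_u128(0, 1) represents 1 << 64 (Low = 0, High = 1). When
// NATIVE_INT128 is available, .Full aliases the same bytes as the Low/High
// pair, which is what the little-endian assumption above guarantees.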

// Fast log2 (32-bit)
// Note: the result is undefined for X == 0 (as with the BSR/CLZ intrinsics).

#if _MSC_VER

#include <intrin.h>

#pragma intrinsic(_BitScanReverse)

static inline uint8_t log2_u32(uint32_t X) {
    unsigned long Result;
    _BitScanReverse(&Result, X);
    return (uint8_t)Result;
}

#elif __GNUC__

static inline uint8_t log2_u32(uint32_t X) {
    return 31 - (uint8_t)__builtin_clz(X);
}

#else

static const uint8_t aLogTable32[32] = {
     0,  9,  1, 10, 13, 21,  2, 29,
    11, 14, 16, 18, 22, 25,  3, 30,
     8, 12, 20, 28, 15, 17, 24,  7,
    19, 27, 23,  6, 26,  5,  4, 31
};

static inline uint8_t log2_u32(uint32_t X) {
    X |= X >> 1;
    X |= X >> 2;
    X |= X >> 4;
    X |= X >> 8;
    X |= X >> 16;
    return aLogTable32[(X * 0x07C4ACDD) >> 27];
}

#endif
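
// Note on the table fallback above: the shift-or cascade smears the highest
// set bit downward, turning X into a mask of the form 2^(k+1) - 1 where k is
// the result. Multiplying that mask by the constant and keeping the top
// 5 bits yields a distinct index for each of the 32 possible masks (a
// de Bruijn-style perfect hash), and aLogTable32 maps that index back to k.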

// Fast log2 (64-bit)
// Note: the result is undefined for X == 0.

#if _MSC_VER

#include <intrin.h>

    #if MACHINE_PTR64

#pragma intrinsic(_BitScanReverse64)

static inline uint8_t log2_u64(uint64_t X) {
    unsigned long Result;
    _BitScanReverse64(&Result, X);
    return (uint8_t)Result;
}

    #elif MACHINE_PTR32

#pragma intrinsic(_BitScanReverse)

static inline uint8_t log2_u64(uint64_t X) {
    unsigned long Result;
    uint32_t High = (uint32_t)(X >> 32);
    if (High == 0) {
        _BitScanReverse(&Result, (uint32_t)X);
        return (uint8_t)Result;
    } else {
        _BitScanReverse(&Result, High);
        return (uint8_t)Result + 32;
    }
}

    #endif

#elif __GNUC__

static inline uint8_t log2_u64(uint64_t X) {
    return 63 - (uint8_t)__builtin_clzll(X);
}

#else

static const uint8_t aLogTable64[64] = {
     0, 47,  1, 56, 48, 27,  2, 60,
    57, 49, 41, 37, 28, 16,  3, 61,
    54, 58, 35, 52, 50, 42, 21, 44,
    38, 32, 29, 23, 17, 11,  4, 62,
    46, 55, 26, 59, 40, 36, 15, 53,
    34, 51, 20, 43, 31, 22, 10, 45,
    25, 39, 14, 33, 19, 30,  9, 24,
    13, 18,  8, 12,  7,  6,  5, 63
};

static inline uint8_t log2_u64(uint64_t X) {
    X |= X >> 1;
    X |= X >> 2;
    X |= X >> 4;
    X |= X >> 8;
    X |= X >> 16;
    X |= X >> 32;
    return aLogTable64[(X * 0x03F79D71B4CB0A89) >> 58];
}

#endif

// Fast log2 (128-bit)

static inline uint8_t log2_u128(uint128_split X) {
    if (X.High == 0)
        return log2_u64(X.Low);
    else
        return log2_u64(X.High) + 64;
}

// Fast log2 (pointer)

#if MACHINE_PTR64

static inline uint8_t log2_uptr(uintptr_t X) {
    return log2_u64(X);
}

#elif MACHINE_PTR32

static inline uint8_t log2_uptr(uintptr_t X) {
    return log2_u32(X);
}

#endif

// Bit scan forward (32-bit)
// Note: the result is undefined for X == 0.

#if _MSC_VER

#pragma intrinsic(_BitScanForward)

static inline uint8_t bsf_u32(uint32_t X) {
    unsigned long Result;
    _BitScanForward(&Result, X);
    return (uint8_t)Result;
}

#elif __GNUC__

static inline uint8_t bsf_u32(uint32_t X) {
    return (uint8_t)__builtin_ctz(X);
}

#else

// Source: https://www.chessprogramming.org/Kim_Walisch#Bitscan

static inline uint8_t bsf_u32(uint32_t X) {
    return aLogTable32[((X ^ (X - 1)) * 0x07C4ACDD) >> 27];
}

#endif
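
// Note on the bsf fallback above: X ^ (X - 1) sets every bit up to and
// including the lowest set bit, so it has the same 2^(j+1) - 1 shape as the
// folded value in the log2 fallback. The same multiply-and-lookup therefore
// returns the index of the lowest set bit.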

// Bit scan forward (64-bit)

#if _MSC_VER

    #if MACHINE_PTR64

#pragma intrinsic(_BitScanForward64)

static inline uint8_t bsf_u64(uint64_t X) {
    unsigned long Result;
    _BitScanForward64(&Result, X);
    return (uint8_t)Result;
}

    #elif MACHINE_PTR32

#pragma intrinsic(_BitScanForward)

static inline uint8_t bsf_u64(uint64_t X) {
    unsigned long Result;
    uint32_t Low = (uint32_t)X;
    if (Low == 0) {
        _BitScanForward(&Result, (uint32_t)(X >> 32));
        return (uint8_t)Result + 32;
    } else {
        _BitScanForward(&Result, Low);
        return (uint8_t)Result;
    }
}

    #endif

#elif __GNUC__

static inline uint8_t bsf_u64(uint64_t X) {
    return (uint8_t)__builtin_ctzll(X);
}

#else

// Source: https://www.chessprogramming.org/BitScan#With_separated_LS1B

static inline uint8_t bsf_u64(uint64_t X) {
    return aLogTable64[((X ^ (X - 1)) * 0x03F79D71B4CB0A89) >> 58];
}

#endif

// Bit scan forward (pointer)

#if MACHINE_PTR64

static inline uint8_t bsf_uptr(uintptr_t X) {
    return bsf_u64(X);
}

#elif MACHINE_PTR32

static inline uint8_t bsf_uptr(uintptr_t X) {
    return bsf_u32(X);
}

#endif

// Multiply two 64-bit integers to get a 128-bit result

#if NATIVE_INT128

static inline uint128_split mul_full_u64(uint64_t A, uint64_t B) {
    return (uint128_split){.Full = (uint128_t)A * B};
}

#elif _MSC_VER && MACHINE_AMD64

#pragma intrinsic(_umul128)

static inline uint128_split mul_full_u64(uint64_t A, uint64_t B) {
    uint128_split Result;
    Result.Low = _umul128(A, B, &Result.High);
    return Result;
}

#else

static inline uint128_split mul_full_u64(uint64_t A, uint64_t B) {
    uint32_t A0 = (uint32_t)A;
    uint32_t A1 = (uint32_t)(A >> 32);
    uint32_t B0 = (uint32_t)B;
    uint32_t B1 = (uint32_t)(B >> 32);

    uint64_t M00 = (uint64_t)A0 * B0;
    uint64_t M01 = (uint64_t)A0 * B1;
    uint64_t M10 = (uint64_t)A1 * B0;
    uint64_t M11 = (uint64_t)A1 * B1;

    uint8_t Carry = 0;
    uint64_t Mid = (M00 >> 32);
    Mid += M01;
    Carry += (Mid < M01);
    Mid += M10;
    Carry += (Mid < M10);

    return merge_u64_u128(
        (Mid << 32) | (uint32_t)M00,
        (((uint64_t)Carry << 32) | (Mid >> 32)) + M11
    );
}

#endif
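
// Note on the portable fallback above: with A = A1:A0 and B = B1:B0 in 32-bit
// halves, A * B = M11 * 2^64 + (M01 + M10) * 2^32 + M00. Mid accumulates the
// 2^32-weighted terms and Carry counts its overflows, so High receives M11
// plus everything that spilled past bit 63.
// Quick sanity check: mul_full_u64(UINT64_MAX, UINT64_MAX) yields Low == 1
// and High == UINT64_MAX - 1, i.e. (2^64 - 1)^2 = 2^128 - 2^65 + 1.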

// Divide (64-bit)

#define NATIVE_DIV_LOW_64_32 (MACHINE_PTR64 || (MACHINE_IA32 && (_MSC_VER || __GNUC__)))

#if MACHINE_PTR32

    #if !NATIVE_DIV_LOW_64_32

static inline uint64_t div_u64_u16(uint64_t A, uint16_t B, uint16_t* pRem) {
    uint64_t Res64 = 0;
    uint32_t Rem32 = 0; // Can be uint16_t but use uint32_t to avoid casts.
    for (uint8_t i = 0; i < 4; ++i) {
        uint32_t A32 = (Rem32 << 16) | (uint32_t)(A >> 48);
        Res64 = (Res64 << 16) | (A32 / B);
        Rem32 = A32 % B;
        A <<= 16;
    }
    *pRem = (uint16_t)Rem32;
    return Res64;
}

    #endif

    #if NATIVE_DIV_LOW_64_32

        #if _MSC_VER && !__clang__

// Clang-Cl doesn't support this
#pragma intrinsic(_udiv64)

static inline uint32_t div_low_u64_u32(uint64_t A, uint32_t B, uint32_t* pRem) {
    return _udiv64(A, B, pRem);
}
        #else

// GCC extended inline assembly
static inline uint32_t div_low_u64_u32(uint64_t A, uint32_t B, uint32_t* pRem) {
    uint32_t A0 = (uint32_t)A;
    uint32_t A1 = (uint32_t)(A >> 32);
    uint32_t Res;
    __asm__("divl %4" : "=d" (*pRem), "=a" (Res) : "0" (A1), "1" (A0), "rm" (B));
    return Res;
}

        #endif

    #else

// Based on: https://github.com/ridiculousfish/libdivide/blob/5dc2d36676e7de41a44c6e86c921eb40fe7af3ee/libdivide.h#L556

static uint32_t div_low_u64_u32(uint64_t A, uint32_t B, uint32_t *pRem) {
    uint8_t Shift = 31 - log2_u32(B);
    only_reachable(Shift < 64); // Help the compiler
    A <<= Shift;
    B <<= Shift;

    uint16_t B0 = (uint16_t)B;
    uint16_t B1 = (uint16_t)(B >> 16);
    uint16_t A00 = (uint16_t)A;
    uint16_t A01 = (uint16_t)((A >> 16) & UINT16_MAX);
    uint32_t A1 = (uint32_t)(A >> 32);
    uint32_t Q2;
    uint32_t R2;
    uint32_t C2;
    uint32_t C3;

    Q2 = A1 / B1;
    R2 = A1 % B1;
    C2 = Q2 * B0;
    C3 = (R2 << 16) | A01;
    if (C2 > C3)
        Q2 -= (C2 - C3 > B) ? 2 : 1;
    uint16_t Q1 = (uint16_t)Q2;

    uint32_t A10 = (A1 << 16) + A01 - Q1 * B;

    Q2 = A10 / B1;
    R2 = A10 % B1;
    C2 = Q2 * B0;
    C3 = (R2 << 16) | A00;
    if (C2 > C3)
        Q2 -= (C2 - C3 > B) ? 2 : 1;
    uint16_t Q0 = (uint16_t)Q2;

    *pRem = ((A10 << 16) + A00 - Q0 * B) >> Shift;
    return ((uint32_t)Q1 << 16) | Q0;
}

    #endif

// Based on: Hacker's Delight, Chapter 9-5

static uint64_t div_u64(uint64_t A, uint64_t B, uint64_t* pRem) {
    if ((B >> 32) == 0) {
        uint32_t B0 = (uint32_t)B;
        uint32_t A0 = (uint32_t)A;
        uint32_t A1 = (uint32_t)(A >> 32);
        if (A1 == 0) {
            *pRem = A0 % B0;
            return A0 / B0;
        }
    #if !NATIVE_DIV_LOW_64_32
        if (B0 <= UINT16_MAX) {
            uint16_t Rem16;
            uint64_t Res = div_u64_u16(A, (uint16_t)B0, &Rem16);
            *pRem = Rem16;
            return Res;
        }
    #endif
        uint64_t Res;
        uint32_t Rem0;
        if (A1 < B0) {
            Res = div_low_u64_u32(A, B0, &Rem0);
        } else {
            uint64_t Res2 = (uint64_t)(A1 / B0) << 32;
            uint64_t Rem2 = (uint64_t)(A1 % B0) << 32;
            Res = div_low_u64_u32(A0 | Rem2, B0, &Rem0) | Res2;
        }
        *pRem = Rem0;
        return Res;
    }

    if (A < B) {
        *pRem = A;
        return 0;
    }

    uint8_t Shift = log2_u32((uint32_t)(B >> 32));
    uint32_t Rem0;
    uint64_t Res = div_low_u64_u32(A >> 1, (uint32_t)(B << (31 - Shift) >> 32), &Rem0) >> Shift;
    *pRem = A - ((Res - 1) * B);
    if (*pRem < B) {
        return Res - 1;
    } else {
        *pRem -= B;
        return Res;
    }
}

#else

static uint64_t div_u64(uint64_t A, uint64_t B, uint64_t* pRem) {
    *pRem = A % B;
    return A / B;
}

#endif
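
// Note on the wide-divisor path in div_u64 above (32-bit targets): B is
// normalized so its top bit is set, A is halved to keep the intermediate
// division from overflowing, and the resulting estimate is, per the Hacker's
// Delight derivation, either exact or one too large. Computing the remainder
// against Res - 1 and comparing it with B then selects the right quotient.
// Illustrative call: div_u64(1000, 7, &Rem) returns 142 with Rem == 6.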

// Compare (unsigned 128-bit)

#if NATIVE_INT128

static inline bool cmp_gt_u128(uint128_split A, uint128_split B) {
    return A.Full > B.Full;
}

static inline bool cmp_ge_u128(uint128_split A, uint128_split B) {
    return A.Full >= B.Full;
}

static inline bool cmp_eq_u128(uint128_split A, uint128_split B) {
    return A.Full == B.Full;
}

static inline bool cmp_ne_u128(uint128_split A, uint128_split B) {
    return A.Full != B.Full;
}

static inline bool cmp_le_u128(uint128_split A, uint128_split B) {
    return A.Full <= B.Full;
}

static inline bool cmp_ls_u128(uint128_split A, uint128_split B) {
    return A.Full < B.Full;
}

#else

// Logic:
// If operations don't underflow (wrap around):
// cmp(A, B) = any qsort-style compare function
// (A > B) = cmp(A.High, B.High) - (A.Low <= B.Low) >= 0
// (A >= B) = cmp(A.High, B.High) - (A.Low < B.Low) >= 0
// cmp(A, B) = (A > B) - (A < B) is the simplest that doesn't underflow.
//
// Alternative method:
// To avoid underflow, we need to check the carry bit:
// carry(A, B) = (A < B)
// (A > B) = !carry(A.High, B.High) & !carry(cmp(A.High, B.High), A.Low <= B.Low)
// (A >= B) = !carry(A.High, B.High) & !carry(cmp(A.High, B.High), A.Low < B.Low)
// cmp(A, B) = (A - B) is the best choice since carry(A, B) is usually a side effect of it.

static inline bool cmp_gt_u128(uint128_split A, uint128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low <= B.Low) >= 0;
}

static inline bool cmp_ge_u128(uint128_split A, uint128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low < B.Low) >= 0;
}

static inline bool cmp_eq_u128(uint128_split A, uint128_split B) {
    return (A.High == B.High) & (A.Low == B.Low);
}

static inline bool cmp_ne_u128(uint128_split A, uint128_split B) {
    return (A.High != B.High) | (A.Low != B.Low);
}

static inline bool cmp_le_u128(uint128_split A, uint128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low <= B.Low) < 0;
}

static inline bool cmp_ls_u128(uint128_split A, uint128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low < B.Low) < 0;
}

#endif
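
// Worked example for the branchless form above: with A = {High = 1, Low = 0}
// and B = {High = 0, Low = UINT64_MAX}, cmp_gt_u128 evaluates 1 - 0 - 1 == 0,
// which is >= 0, so A > B as expected; the low-word term only decides the
// outcome when the high words are equal.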

// Compare (signed 128-bit)

#if NATIVE_INT128

static inline bool cmp_gt_i128(int128_split A, int128_split B) {
    return A.Full > B.Full;
}

static inline bool cmp_ge_i128(int128_split A, int128_split B) {
    return A.Full >= B.Full;
}

static inline bool cmp_eq_i128(int128_split A, int128_split B) {
    return A.Full == B.Full;
}

static inline bool cmp_ne_i128(int128_split A, int128_split B) {
    return A.Full != B.Full;
}

static inline bool cmp_le_i128(int128_split A, int128_split B) {
    return A.Full <= B.Full;
}

static inline bool cmp_ls_i128(int128_split A, int128_split B) {
    return A.Full < B.Full;
}

#else

// Logic:
// Same as the unsigned comparison, but the low-word comparison isn't meaningful
// on its own when A and B have different signs. Fortunately, cmp(A.High, B.High)
// decides those cases, so the low comparison only matters when the high words are equal.
//
// Alternative method:
// Add (1 << 127) to both values (flip the sign bit) so every value becomes
// non-negative, then do an unsigned comparison. The ordering is preserved,
// so the comparison is still correct.

static inline bool cmp_gt_i128(int128_split A, int128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low <= B.Low) >= 0;
}

static inline bool cmp_ge_i128(int128_split A, int128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low < B.Low) >= 0;
}

static inline bool cmp_eq_i128(int128_split A, int128_split B) {
    return (int)(A.High == B.High) & (A.Low == B.Low);
}

static inline bool cmp_ne_i128(int128_split A, int128_split B) {
    return (int)(A.High != B.High) | (A.Low != B.Low);
}

static inline bool cmp_le_i128(int128_split A, int128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low <= B.Low) < 0;
}

static inline bool cmp_ls_i128(int128_split A, int128_split B) {
    return (int)(A.High > B.High) - (A.High < B.High) - (A.Low < B.Low) < 0;
}

#endif
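
// Worked example for the signed form above: with A = -1 ({High = -1,
// Low = UINT64_MAX}) and B = 0 ({High = 0, Low = 0}), cmp_ls_i128 evaluates
// 0 - 1 - 0 == -1, which is < 0, so A < B as expected even though A.Low is
// the larger low word.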

// Add & Subtract (128-bit)

#if NATIVE_INT128

static inline uint128_split add_u128(uint128_split A, uint128_split B) {
    return (uint128_split){.Full = A.Full + B.Full};
}

static inline uint128_split sub_u128(uint128_split A, uint128_split B) {
    return (uint128_split){.Full = A.Full - B.Full};
}

#else

static inline uint128_split add_u128(uint128_split A, uint128_split B) {
    uint128_split Result;
    Result.Low  = A.Low  + B.Low;
    Result.High = A.High + B.High + (Result.Low < A.Low);
    return Result;
}

static inline uint128_split sub_u128(uint128_split A, uint128_split B) {
    uint128_split Result;
    Result.Low  = A.Low  - B.Low;
    Result.High = A.High - B.High - (Result.Low > A.Low);
    return Result;
}

#endif
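
// Note on the fallback above: (Result.Low < A.Low) is 1 exactly when the low
// addition wrapped, and (Result.Low > A.Low) is 1 exactly when the low
// subtraction borrowed. For example, adding {Low = UINT64_MAX, High = 0} and
// {Low = 1, High = 0} gives {Low = 0, High = 1}.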

// Bitwise (128-bit)

static inline uint128_split or_u128(uint128_split A, uint128_split B) {
    uint128_split Result;
    Result.Low  = A.Low  | B.Low;
    Result.High = A.High | B.High;
    return Result;
}

static inline uint128_split and_u128(uint128_split A, uint128_split B) {
    uint128_split Result;
    Result.Low  = A.Low  & B.Low;
    Result.High = A.High & B.High;
    return Result;
}

static inline uint128_split xor_u128(uint128_split A, uint128_split B) {
    uint128_split Result;
    Result.Low  = A.Low  ^ B.Low;
    Result.High = A.High ^ B.High;
    return Result;
}

static inline uint128_split not_u128(uint128_split A) {
    uint128_split Result;
    Result.Low  = ~A.Low;
    Result.High = ~A.High;
    return Result;
}

// Logical Shift left (128-bit)

#if NATIVE_INT128

static inline uint128_split lshl_u128(uint128_split A, uint8_t B) {
    return (uint128_split){.Full = A.Full << B};
}

#else

static inline uint128_split lshl_u128(uint128_split A, uint8_t B) {
    uint128_split Result;
    if (B == 0)
        Result = A;
    else if (B < 64) {
        Result.High  = A.High << B;
        Result.High |= A.Low  >> (64 - B);
        Result.Low   = A.Low  << B;
    } else {
        Result.High  = A.Low  << (B - 64);
        Result.Low   = 0;
    }
    return Result;
}

#endif
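
// Shift-count note: both the native and the split implementations expect
// B < 128 (a shift by the full width is undefined in C), and the B == 0
// special case keeps the complementary 64 - B shift in range.
// Example: lshl_u128(cast_u64_u128(1), 64) yields {Low = 0, High = 1}.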

// Logical Shift right (128-bit)

#if NATIVE_INT128

static inline uint128_split lshr_u128(uint128_split A, uint8_t B) {
    return (uint128_split){.Full = A.Full >> B};
}

#else

static inline uint128_split lshr_u128(uint128_split A, uint8_t B) {
    uint128_split Result;
    if (B == 0)
        Result = A;
    else if (B < 64) {
        Result.Low  = A.Low  >> B;
        Result.Low |= A.High << (64 - B);
        Result.High = A.High >> B;
    } else {
        Result.Low  = A.High >> (B - 64);
        Result.High = 0;
    }
    return Result;
}

#endif

// Arithmetic Shift right (128-bit)

#if NATIVE_INT128

static inline int128_split ashr_i128(int128_split A, uint8_t B) {
    return (int128_split){.Full = A.Full >> B};
}

#else

// Relies on >> performing an arithmetic shift on signed values
// (implementation-defined, but the case on all mainstream compilers).
static inline int128_split ashr_i128(int128_split A, uint8_t B) {
    int128_split Result;
    if (B == 0)
        Result = A;
    else if (B < 64) {
        Result.Low  = A.Low  >> B;
        Result.Low |= (uint64_t)A.High << (64 - B); // Cast avoids UB from left-shifting a negative value.
        Result.High = A.High >> B;
    } else {
        Result.Low  = A.High >> (B - 64);
        Result.High = A.High >> 63;
    }
    return Result;
}

#endif

// Multiply (128-bit)
// The low 128 bits of a signed multiplication are the same as for an unsigned
// multiplication, so mul_u128 covers both.

#if NATIVE_INT128

static inline uint128_split mul_u128(uint128_split A, uint128_split B) {
    return (uint128_split){.Full = A.Full * B.Full};
}

#else

static inline uint128_split mul_u128(uint128_split A, uint128_split B) {
    uint128_split M00 = mul_full_u64(A.Low, B.Low);
    uint64_t M01L = A.Low * B.High;
    uint64_t M10L = A.High * B.Low;
    return add_u128(M00, (uint128_split){.High = M01L + M10L});
}

#endif
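
// Note on the fallback above: only the low 128 bits of the product are kept,
// so the cross terms A.Low * B.High and A.High * B.Low contribute only their
// low 64 bits (shifted up by 64), and A.High * B.High drops out entirely.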

// Division (128-bit)

#define NATIVE_DIV_LOW_128_64 (MACHINE_AMD64 && (_MSC_VER || __GNUC__))

#if !NATIVE_DIV_LOW_64_32

static inline uint128_split div_u128_u16(uint128_split A, uint16_t B, uint16_t* pRem) {
    uint128_split Res = {0};
    uint32_t Rem32 = 0; // Can be uint16_t but use uint32_t to avoid casts.
    for (uint8_t i = 0; i < 8; ++i) {
        uint32_t A32 = (Rem32 << 16) | (uint32_t)(A.High >> 48);
        Res = lshl_u128(Res, 16);
        Res.Low |= A32 / B;
        Rem32 = A32 % B;
        A = lshl_u128(A, 16);
    }
    *pRem = (uint16_t)Rem32;
    return Res;
}

#endif

#if !NATIVE_DIV_LOW_128_64

static inline uint128_split div_u128_u32(uint128_split A, uint32_t B, uint32_t* pRem) {
    uint128_split Res = {0};
    uint64_t Rem64 = 0; // Can be uint32_t but use uint64_t to avoid casts.
    for (uint8_t i = 0; i < 4; ++i) {
        uint64_t A64 = (Rem64 << 32) | (uint64_t)(A.High >> 32);
        Res = lshl_u128(Res, 32);
        Res.Low |= div_u64(A64, B, &Rem64);
        A = lshl_u128(A, 32);
    }
    *pRem = (uint32_t)Rem64;
    return Res;
}

#endif

#if NATIVE_DIV_LOW_128_64

    #if _MSC_VER && !__clang__

// Clang-Cl doesn't support this
#pragma intrinsic(_udiv128)

static inline uint64_t div_u128_u64(uint128_split A, uint64_t B, uint64_t* pRem) {
    return _udiv128(A.High, A.Low, B, pRem);
}
    #else

// GCC extended inline assembly
static inline uint64_t div_u128_u64(uint128_split A, uint64_t B, uint64_t* pRem) {
    uint64_t Res;
    __asm__("divq %4" : "=d" (*pRem), "=a" (Res) : "0" (A.High), "1" (A.Low), "rm" (B));
    return Res;
}

    #endif

#else

// Based on: https://github.com/ridiculousfish/libdivide/blob/5dc2d36676e7de41a44c6e86c921eb40fe7af3ee/libdivide.h#L556

static uint64_t div_u128_u64(uint128_split A, uint64_t B, uint64_t *pRem) {
    uint8_t Shift = 63 - log2_u64(B);
    only_reachable(Shift < 64); // Help the compiler
    A = lshl_u128(A, Shift);
    B <<= Shift;

    uint32_t B0 = (uint32_t)B;
    uint32_t B1 = (uint32_t)(B >> 32);
    uint32_t A00 = (uint32_t)A.Low;
    uint32_t A01 = (uint32_t)(A.Low >> 32);
    uint64_t Q2;
    uint64_t R2;
    uint64_t C2;
    uint64_t C3;

    Q2 = div_u64(A.High, B1, &R2);
    C2 = Q2 * B0;
    C3 = (R2 << 32) | A01;
    if (C2 > C3)
        Q2 -= (C2 - C3 > B) ? 2 : 1;
    uint32_t Q1 = (uint32_t)Q2;

    uint64_t A10 = (A.High << 32) + A01 - Q1 * B;

    Q2 = div_u64(A10, B1, &R2);
    C2 = Q2 * B0;
    C3 = (R2 << 32) | A00;
    if (C2 > C3)
        Q2 -= (C2 - C3 > B) ? 2 : 1;
    uint32_t Q0 = (uint32_t)Q2;

    *pRem = ((A10 << 32) + A00 - Q0 * B) >> Shift;
    return ((uint64_t)Q1 << 32) | Q0;
}

#endif
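
// Note on the libdivide-style fallback above: after normalization (top bit of
// B set), each 32-bit quotient digit is first estimated from the divisor's
// high half; by the usual Knuth Algorithm D argument that estimate is never
// too small and at most 2 too large, which is what the "- 1 or 2" correction
// branch accounts for.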

// Based on: Hacker's Delight, Chapter 9-5

static inline uint128_split div_u128(uint128_split A, uint128_split B, uint128_split* pRem) {
    if (B.High == 0) {
        pRem->High = 0;
        if (A.High == 0)
            return cast_u64_u128(div_u64(A.Low, B.Low, &pRem->Low));
#if !NATIVE_DIV_LOW_64_32
        if (B.Low <= UINT16_MAX) {
            uint16_t Rem16;
            uint128_split Res = div_u128_u16(A, (uint16_t)B.Low, &Rem16);
            pRem->Low = Rem16;
            return Res;
        }
#endif
#if !NATIVE_DIV_LOW_128_64
        if (B.Low >> 32 == 0) { // B.Low <= UINT32_MAX
            uint32_t Rem32;
            uint128_split Res = div_u128_u32(A, (uint32_t)B.Low, &Rem32);
            pRem->Low = Rem32;
            return Res;
        }
#endif
        if (A.High < B.Low)
            return cast_u64_u128(div_u128_u64(A, B.Low, &pRem->Low));

        uint64_t Rem1;
        uint64_t Res1 = div_u64(A.High, B.Low, &Rem1);
        uint128_split Rem2 = merge_u64_u128(A.Low, Rem1);
        return merge_u64_u128(div_u128_u64(Rem2, B.Low, &pRem->Low), Res1);
    }

    if (cmp_ls_u128(A, B)) {
        *pRem = A;
        return cast_u64_u128(0);
    }

    uint8_t Shift = log2_u64(B.High);

    // Help the compiler
    only_reachable(Shift < 64);

    uint64_t Rem64;
    uint64_t Res64 = div_u128_u64(lshr_u128(A, 1), lshl_u128(B, 63 - Shift).High, &Rem64) >> Shift;

    *pRem = sub_u128(A, mul_u128(cast_u64_u128(Res64 - 1), B));
    if (cmp_ls_u128(*pRem, B)) {
        return cast_u64_u128(Res64 - 1);
    } else {
        *pRem = sub_u128(*pRem, B);
        return cast_u64_u128(Res64);
    }
}

// Signed division via the unsigned routine: MaskA/MaskB are all-ones for
// negative inputs, and (X + Mask) ^ Mask negates X when Mask is all-ones
// (two's complement), so it takes the absolute value and later re-applies the sign.
static inline int128_split div_i128(int128_split A, int128_split B, uint128_split* pRem) {
    uint128_split MaskA = cast_i128_u128(ashr_i128(A, 127));
    uint128_split MaskB = cast_i128_u128(ashr_i128(B, 127));
    uint128_split UA = xor_u128(add_u128(cast_i128_u128(A), MaskA), MaskA);
    uint128_split UB = xor_u128(add_u128(cast_i128_u128(B), MaskB), MaskB);
    uint128_split URem;
    uint128_split URes = div_u128(UA, UB, &URem);
    // The remainder takes the sign of the dividend (C truncated division),
    // so it is restored with MaskA rather than MaskB.
    *pRem = xor_u128(add_u128(URem, MaskA), MaskA);
    uint128_split MaskRes = xor_u128(MaskA, MaskB);
    return cast_u128_i128(xor_u128(add_u128(URes, MaskRes), MaskRes));
}