foobar76

Bit-efficient uniform random number generators

Oct 7th, 2019
268
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
  6.  
  7. #define TEST_ROUNDS 1000000000
  8.  
  9. #undef UNROLLED_LOOP    /* Use unrolled loop variants (for branch predictors) */
  10. #define LONG_MULTIPLICATION     /* Long multiplication vs. bitmask with
  11.                                  * rejection */
  12.  
  13. #undef BIT_ACCOUNTING   /* Keep track of used bits */
  14. #undef PRINT_RESULTS    /* Print individual get_uniform_rand() results */
  15.  
  16. #undef BOGUS_RANDOMNESS /* Rotate seed by one bit per request instead of
  17.                          * proper PRNG */
  18.  
  19. /* The state word must be initialized to non-zero */
  20. static uint64_t state;
  21.  
  22. static uint64_t buf[2];
  23. static uint64_t left[2] = {0, 0};
  24.  
  25. /* By Marsaglia(?) */
  26. static uint64_t
  27. xorshift64s(void)
  28. {
  29.   uint64_t x = state;
  30.  
  31.   x ^= x >> 12;
  32.   x ^= x << 25;
  33.   x ^= x >> 27;
  34.   state = x;
  35.  
  36.   return x * 2685821657736338717ULL;
  37. }
  38.  
  39. static void
  40. fill_primary(uint64_t n)
  41. {
  42.   if (64 - left[0] >= left[1])
  43.   {
  44.     if (__builtin_expect(left[0] != 0, 1))
  45.     {
  46.       buf[0] = buf[0] | (buf[1] >> left[0]);
  47.       buf[1] <<= left[0];
  48.     }
  49.     else
  50.     {
  51.       buf[0] = buf[1];
  52.       buf[1] = 0;
  53.     }
  54.  
  55.     left[0] += left[1];
  56.     buf[1] = xorshift64s();
  57.     left[1] = 64;
  58.   }
  59.  
  60.   if (__builtin_expect(left[0] >= n, 1))
  61.   {
  62.     return;
  63.   }
  64.  
  65.   buf[0] |= buf[1] >> left[0];
  66.   buf[1] <<= 64 - left[0];
  67.   left[1] -= 64 - left[0];
  68.   left[0] = 64;
  69. }
  70.  
  71. static void
  72. init_fill(uint64_t x)
  73. {
  74.   state = x;
  75.   buf[1] = xorshift64s();
  76.   left[1] = 64;
  77.  
  78.   fill_primary(64);
  79. }
  80.  
  81. /* Provides n random bits on most significant bits. */
  82. static uint64_t
  83. peek_bits(uint64_t n)
  84. {
  85. #ifndef BOGUS_RANDOMNESS
  86.   if (__builtin_expect(left[0] < n, 0))
  87.   {
  88.     fill_primary(n);
  89.   }
  90.  
  91.   return buf[0];
  92. #else
  93.   state = (state << 1) + (state >> 63);
  94.  
  95.   return state;
  96. #endif
  97. }
  98.  
  99. #ifdef BIT_ACCOUNTING
  100. static uint64_t used_bits = 0;
  101. #endif
  102.  
  103. /* Consumes bits peeked by earlier peek_bits() call. Multiple calls can be
  104.  * made for one peek_bits() call, but sum of arguments must be less or equal
  105.  * to earlier peek. */
  106. static void
  107. consume_bits(uint64_t n)
  108. {
  109.   buf[0] <<= n;
  110.   left[0] -= n;
  111.  
  112. #ifdef BIT_ACCOUNTING
  113.   used_bits += n;
  114. #endif
  115. }
  116.  
  117. #ifndef LONG_MULTIPLICATION
  118. /* Variable length bitmask with rejection method. */
  119.  
  120. #ifndef UNROLLED_LOOP
  121. static uint64_t
  122. get_uniform_rand(uint64_t range)
  123. {
  124.   uint64_t shift = __builtin_clzl(range);
  125.   uint64_t res;
  126.  
  127.   if (__builtin_expect(range == 0, 0))
  128.   {
  129.     return 0;
  130.   }
  131.  
  132.   do
  133.   {
  134.     res = peek_bits(64 - shift) >> shift;
  135.     consume_bits(64 - shift);
  136.   }
  137.   while (__builtin_expect(res >= range, 0));
  138.  
  139.   return res;
  140. }
  141.  
  142. #else   /* UNROLLED_LOOP */
  143. static uint64_t
  144. get_uniform_rand(uint64_t range)
  145. {
  146.   uint64_t rounds = 2;
  147.   uint64_t shift = __builtin_clzl(range);
  148.   uint64_t bitsperloop;
  149.   uint64_t res;
  150.   uint64_t nores = 1;
  151.   uint64_t neededrounds = 0;
  152.  
  153.   if (__builtin_expect(range == 0, 0))
  154.   {
  155.     return 0;
  156.   }
  157.  
  158.   if (rounds * (64 - shift) <= 64)
  159.   {
  160.     bitsperloop = rounds * (64 - shift);
  161.   }
  162.   else
  163.   {
  164.     bitsperloop = 64 - shift;
  165.     rounds = 1;
  166.   }
  167.  
  168.   do
  169.   {
  170.     uint64_t rbits = peek_bits(bitsperloop);
  171.     uint64_t neededrounds = 0;
  172.  
  173.     /* Unrolled loop */
  174.     for (uint64_t i = 0; i < rounds; i++)
  175.     {
  176.       res = nores ? rbits >> shift : res;
  177.       rbits <<= 64 - shift;
  178.       neededrounds += nores;
  179.       nores = res >= range;
  180.     }
  181.  
  182.     consume_bits(neededrounds * (64 - shift));
  183.   }
  184.   while (__builtin_expect(nores, 0));
  185.  
  186.   return res;
  187. }
  188.  
  189. #endif  /* UNROLLED_LOOP */
  190.  
  191. #else   /* LONG_MULTIPLICATION */
  192. /* These versions effectively perform a long multiplication until a fixed
  193.  * point integer portion of the computation can't change or a carry makes it
  194.  * increment. */
  195.  
  196. #ifndef UNROLLED_LOOP
  197. static uint64_t
  198. get_uniform_rand(uint64_t range)
  199. {
  200.   uint64_t shift = __builtin_clzl(range);
  201.   uint64_t add = range << shift;
  202.   unsigned __int128 mul;
  203.   uint64_t res, frac;
  204.   uint64_t rbits;
  205.   uint64_t v0 = 1, v1 = 0;
  206.  
  207.   if (__builtin_expect(range == 0, 0))
  208.   {
  209.     return 0;
  210.   }
  211.  
  212.   rbits = peek_bits(64 - shift);
  213.   consume_bits(64 - shift);
  214.  
  215.   frac = mul = (unsigned __int128)(rbits & (~0ULL << shift)) * range;
  216.   res = mul >> 64;
  217.  
  218.   /* This is essentially arbitrary-precision multiplication, tracking 65
  219.    * bits. */
  220.   while ((v0 ^ v1) & (frac > -add))
  221.   {
  222.     uint64_t tmp;
  223.  
  224.     /* old top bit */
  225.     v0 = frac >> 63;
  226.     frac <<= 1;
  227.  
  228.     tmp = frac;
  229.     frac += ((int64_t) peek_bits(1) >> 63) & add;
  230.     /* new top bit */
  231.     v1 = (tmp > frac);
  232.  
  233.     consume_bits(1);
  234.  
  235.     /* If both v0 and v1 are set a bit carries to the result. */
  236.     res += v0 & v1;
  237.   }
  238.  
  239.   return res;
  240. }
  241.  
  242. #else   /* UNROLLED_LOOP */
  243.  
  244. static uint64_t
  245. get_uniform_rand(uint64_t range)
  246. {
  247.   const uint64_t rounds = 2;
  248.   uint64_t shift = __builtin_clzl(range);
  249.   uint64_t add = range << shift;
  250.   unsigned __int128 mul;
  251.   uint64_t res, frac;
  252.   uint64_t rbits;
  253.   uint64_t v0 = 1, v1 = 0;
  254.   uint64_t ok = 1;
  255.  
  256.   if (__builtin_expect(range == 0, 0))
  257.   {
  258.     return 0;
  259.   }
  260.  
  261.   rbits = peek_bits(64 - shift);
  262.   consume_bits(64 - shift);
  263.  
  264.   frac = mul = (unsigned __int128)(rbits & (~0ULL << shift)) * range;
  265.   res = mul >> 64;
  266.  
  267.   /* Can the fractional part still cause res to increment? */
  268.   do
  269.   {
  270.     uint64_t neededbits = 0;
  271.  
  272.     rbits = peek_bits(rounds);
  273.  
  274.     /* Unrolled loop */
  275.     for (uint64_t i = 0; i < rounds; i++)
  276.     {
  277.       uint64_t tmp;
  278.  
  279.       ok &= (v0 ^ v1) & (frac > -add);
  280.       neededbits += ok;
  281.  
  282.       /* old top bit */
  283.       v0 = frac >> 63;
  284.       frac <<= 1;
  285.  
  286.       /* random bit */
  287.       v1 = rbits >> 63;
  288.       rbits <<= 1;
  289.  
  290.       tmp = frac;
  291.       frac += -(ok & v1) & add;
  292.       /* new top bit */
  293.       v1 = (tmp > frac);
  294.  
  295.       /* If both v0 and v1 are set a bit carries to the result. */
  296.       res += ok & v0 & v1;
  297.     }
  298.  
  299.     consume_bits(neededbits);
  300.   }
  301.   while (__builtin_expect(ok, 0));
  302.  
  303.   return res;
  304. }
  305.  
  306. #endif  /* UNROLLED_LOOP */
  307. #endif  /* LONG_MULTIPLICATION */
  308.  
  309. int
  310. main(int argc, char **argv)
  311. {
  312.   uint64_t mod;
  313.   uint64_t x = 0;
  314.  
  315.   if (argc != 3)
  316.   {
  317.     return 1;
  318.   }
  319.   /* Largest possible return value minus one. */
  320.   mod = atoll(argv[1]);
  321.  
  322.   /* Seed to xorshift64s */
  323.   init_fill(atoll(argv[2]));
  324.  
  325.   for (int i = 0; i < TEST_ROUNDS; i++)
  326.   {
  327.     uint64_t res = get_uniform_rand((volatile uint64_t)mod);
  328.  
  329. #ifdef PRINT_RESULTS
  330.     printf("res %llu\n", res);
  331. #endif
  332.  
  333.     /* assert(res < mod); */
  334.     x += res;
  335.   }
  336.  
  337. #ifdef BIT_ACCOUNTING
  338.   printf("used bits: %llu\n", used_bits);
  339. #endif
  340.  
  341.   printf("sum %llu\n", x);
  342.   return 0;
  343. }
RAW Paste Data