roll.cu (JoelSjogren, Feb 22nd, 2020)
/* roll.cu */

#include <cstdint>
#include <vector>
#include <map>
#include <iostream>
#include <cmath>
#include <cassert>

using namespace std;

typedef uint64_t nat;

#include "../cuda_uint128.h"
typedef uint128_t nat2;
//typedef __uint128_t nat2;

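// A roll describes an eventually periodic residue system: values below
// `flat` are kept exactly, and everything from `flat` upward repeats with
// period `loop`.  This is the shape needed to reduce exponents in a power
// tower, where small exponents matter exactly and large ones only matter
// modulo the period.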
struct roll {
  nat flat;
  nat loop;
};

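// Reduce a 128-bit value modulo a 64-bit one; div128to64 comes from
// cuda_uint128.h and delivers the remainder through its last argument.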
__host__ __device__
nat operator%(nat2 a, nat b) {
  nat res;
  div128to64(a, b, &res);
  return res;
}

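// Reduce a value by a roll: small values pass through unchanged, larger
// ones are folded into the periodic part [flat, flat + loop).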
__host__ __device__
nat operator%(nat2 a, roll b) {
  return a < b.flat ? a.lo : (a - b.flat) % b.loop + b.flat;
}

struct divisor_cache {
  vector<nat> primes;
  //  vector<nat> phi;
};

struct gpu_divisor_cache {
  const nat nprimes;
  const nat* primes;
};

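// Host-side sieve of Eratosthenes collecting every prime up to sqrt(n);
// the device code only ever needs to trial-divide by primes of that size.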
__host__
divisor_cache make_cache(nat n) {
  const nat sn = sqrt(n) + 1;
  vector<bool> old(sn);  // sieve of eratosthenes
  vector<nat> primes;
  for (size_t i = 2; i < sn; i++) {
    if (not old[i]) {
      primes.push_back(i);
      for (size_t j = i; j < sn; j += i) {
        old[j] = true;
      }
    }
  }
  return {primes};
}

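// Modular exponentiation by repeated squaring, with every intermediate
// product widened to 128 bits before being reduced by the roll c.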
__device__
nat pow_mod(nat a, nat b, roll c) {
  nat d = 1;
  while (b) {
    if (b % 2) d = ((nat2) a * d) % c;
    a = ((nat2) a * a) % c;
    b /= 2;
  }
  return d;
}

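// Euclid's algorithm; the 128-bit first argument is reduced modulo b up
// front so the rest of the loop runs in 64 bits.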
__device__
nat gcd(nat2 a_, nat b) {
  if (b == 0) return a_.lo;  // if a_ is too big, then you asked for it...
  nat a = (nat) (a_ % b);
  while (b) {
    nat c = a % b;
    a = b;
    b = c;
  }
  return a;
}

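// Euler's totient of d by trial division against the cached primes.  A
// cofactor surviving the loop must itself be prime (the loop stops once
// p * p > d), and a d already known to be prime is handled directly.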
__device__
nat phi(nat d, gpu_divisor_cache cache, bool known_prime) {
  assert(d != 0);
  if (d == 1) return 1;
  if (known_prime) return d - 1;  // optimization
  nat ret = 1;
  for (nat i = 0; i < cache.nprimes; i++) {
    nat p = cache.primes[i];
    if (p * p > d) break;  // optimization
    if (d % p == 0) {
      d /= p;
      ret *= p - 1;
      while (d % p == 0) {
        d /= p;
        ret *= p;
      }
    }
  }
  if (d != 1) {
    ret *= d - 1;  // d is a prime number
  }
  return ret;
}

/*
__device__
bool is_prime(nat a, gpu_divisor_cache cache) {
  if (a < 2) return false;
  for (nat i = 0; i < cache.nprimes; i++) {
    nat p = cache.primes[i];
    if (a == p) return true;
    if (a % p == 0) return false;
  }
  return true;
}
*/

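// Given a base a and the roll b one level down in the tower, build the roll
// that the exponent of a has to respect: `flat` counts the initial exponents
// that must be taken literally, and the periodic part has length phi(d),
// where d is what remains of b.loop after stripping every factor it shares
// with a.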
__device__
roll roll_of_powers(nat a, roll b, gpu_divisor_cache cache, bool known_prime) {
  if (a == 0) return {1, 1};
  if (a == 1) return {0, 1};
  nat flat = 0;
  nat2 a_pow_flat_ = 1;
  while (a_pow_flat_ < b.flat) {
    flat++;
    a_pow_flat_ = a_pow_flat_ * a;
  }
  nat d = b.loop / gcd(a_pow_flat_, b.loop);
  nat c = gcd(d, a);
  while (c != 1) {
    d /= c;
    c = gcd(d, c);
    flat++;
  }
  return {flat, phi(d, cache, known_prime)};
}

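// Evaluate the power tower of ten 10s, plus 23, modulo the roll b.
// rolls[i] describes which exponents matter at level i of the tower, which
// is then collapsed from the top down with pow_mod.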
// (10 tri 10 + 23) % (roll b)
__device__
nat trimod(roll b, gpu_divisor_cache cache) {
  const size_t height = 10;  // number of levels in the tower
  nat a[height] = {10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
  if (height == 0) return 1 % b;  // degenerate empty tower
  roll rolls[height];
  rolls[0] = b;
  for (size_t i = 0; i < height-1; i++) {
    // assuming b is prime
    rolls[i+1] = roll_of_powers(a[i], rolls[i], cache, i == 0);
  }
  nat c = 1 % b;
  for (size_t i = height-1; i != (size_t) -1; i--) {
    c = pow_mod(a[i], c, rolls[i]);
  }
  return (c + 23) % b;
}

const nat CORES = 2500;  // keep this a divisor of n
const nat MAXFACTORS = 20;

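// Segmented sieve: record the primality of the window [start, end) in out[],
// crossing off multiples of every cached prime (but never the prime itself).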
__device__
void big_sieve(nat start, nat end, gpu_divisor_cache cache, bool* out) {
  for (nat i = 0; start + i < end; i++) {
    out[i] = true;
  }
  for (nat i = 0; start + i < 2; i++) {
    out[i] = false;
  }
  for (nat k = 0; k < cache.nprimes; k++) {
    nat p = cache.primes[k];
    nat j = start / p * p;
    while (j < start or j <= p) j += p;
    for ( ; j < end; j += p) {
      out[j - start] = false;
    }
  }
}

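// One block per contiguous range of width n / CORES: sieve the range in
// chunks and, for every prime p found, test whether p divides 10 tri 10 + 23.
// canary[index] counts the primes each block examined (debug aid).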
__global__
void trial(nat n, gpu_divisor_cache cache, nat* nfactors, nat* factors,
           nat* canary) {

  nat width = n / CORES;
  nat index = blockIdx.x;

  nat start = index * width;
  nat end = (index + 1) * width;

  const nat CHUNK_SIZE = 10*1000;
  bool chunk[CHUNK_SIZE];
  nat chunk_start;

  for (nat p = start; p < end; p++) {
    // update local prime cache
    if ((p - start) % CHUNK_SIZE == 0) {
      big_sieve(p, p + CHUNK_SIZE, cache, chunk);
      chunk_start = p;
    }

    // check whether p is prime
    if (!chunk[p - chunk_start]) {
      continue;
    }
    canary[index]++;

    // check whether p is a divisor
    if (trimod({0, p}, cache) == 0) {
      assert(nfactors[index] < MAXFACTORS);
      factors[index*MAXFACTORS + nfactors[index]++] = p;
    }

    // bogus code
    //factors[index*MAXFACTORS] = p;
  }

}

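// Host driver: build the prime cache, copy it to the device, launch one
// block per range, then collect and print any factors found.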
int main(void) {
  // define problem size
  nat n = 1L*1000*1000*1000;

  // cache small primes on gpu
  divisor_cache cache = make_cache(n);
  nat pc = cache.primes.size();
  nat* gpu_primes;
  cudaMalloc((void **) &gpu_primes, pc * sizeof(nat));
  cudaMemcpy(gpu_primes, cache.primes.data(), pc * sizeof(nat),
             cudaMemcpyHostToDevice);
  gpu_divisor_cache gpu_cache = {pc, gpu_primes};

  // allocate space for factors
  nat* gpu_nfactors;
  nat* gpu_factors;
  cudaMalloc((void **) &gpu_nfactors, CORES * sizeof(nat));
  cudaMemset(gpu_nfactors, 0, CORES * sizeof(nat));  // zero every counter, not just the first
  cudaMalloc((void **) &gpu_factors, CORES * MAXFACTORS * sizeof(nat));

  // allocate space for canaries (debug)
  nat* gpu_canaries;
  cudaMalloc((void **) &gpu_canaries, CORES * sizeof(nat));
  cudaMemset(gpu_canaries, 0, CORES * sizeof(nat));

  // run computation
  cout << "Entering the matrix..." << endl;
  trial<<<CORES,1>>>(n, gpu_cache, gpu_nfactors, gpu_factors, gpu_canaries);

  // read results (cudaMemcpy on the default stream waits for the kernel)
  nat nfactors[CORES];
  nat factors[CORES * MAXFACTORS];
  nat canaries[CORES];
  cudaMemcpy(nfactors, gpu_nfactors, CORES * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(factors, gpu_factors, CORES * MAXFACTORS * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(canaries, gpu_canaries, CORES * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaFree(gpu_primes);
  cudaFree(gpu_nfactors);
  cudaFree(gpu_factors);
  cudaFree(gpu_canaries);

  cout << "factors found:\n  ";
  for (nat i = 0; i < CORES; i++) {
    assert(nfactors[i] <= MAXFACTORS);
    for (nat j = 0; j < nfactors[i]; j++) {
      cout << factors[i*MAXFACTORS + j] << " ";
    }
  }
  cout << endl << endl;

  nat tot = 0;
  for (nat i = 0; i < CORES; i++) {
    tot += canaries[i];
  }
  cout << "canaries: " << tot << endl;

  return 0;
}