Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* roll.cu */
- #include <cstdint>
- #include <vector>
- #include <map>
- #include <iostream>
- #include <cmath>
- #include <cassert>
- using namespace std;
- typedef uint64_t nat;
- #include "../cuda_uint128.h"
- typedef uint128_t nat2;
- //typedef __uint128_t nat2;
- struct roll {
- nat flat;
- nat loop;
- };
- __host__ __device__
- nat operator%(nat2 a, nat b) {
- nat res;
- div128to64(a, b, &res);
- return res;
- }
- __host__ __device__
- nat operator%(nat2 a, roll b) {
- return a < b.flat ? a.lo : (a - b.flat) % b.loop + b.flat;
- }
- struct divisor_cache {
- vector<nat> primes;
- // vector<nat> phi;
- };
- struct gpu_divisor_cache {
- const nat nprimes;
- const nat* primes;
- };
- __host__
- divisor_cache make_cache(nat n) {
- const nat sn = sqrt(n) + 1;
- vector<bool> old(sn); // sieve of eratosthenes
- vector<nat> primes;
- for (size_t i = 2; i < sn; i++) {
- if (not old[i]) {
- primes.push_back(i);
- for (size_t j = i; j < sn; j += i) {
- old[j] = true;
- }
- }
- }
- return {primes};
- }
- __device__
- nat pow_mod(nat a, nat b, roll c) {
- nat d = 1;
- while (b) {
- if (b % 2) d = ((nat2) a * d) % c;
- a = ((nat2) a * a) % c;
- b /= 2;
- }
- return d;
- }
- __device__
- nat gcd(nat2 a_, nat b) {
- if (b == 0) return a_.lo; // if a_ is too big, then you asked for it...
- nat a = (nat) (a_ % b);
- while (b) {
- nat c = a % b;
- a = b;
- b = c;
- }
- return a;
- }
- __device__
- nat phi(nat d, gpu_divisor_cache cache, bool known_prime) {
- assert(d != 0);
- if (d == 1) return 1;
- if (known_prime) return d - 1; // optimization
- nat ret = 1;
- for (nat i = 0; i < cache.nprimes; i++) {
- nat p = cache.primes[i];
- if (p * p > d) break; // optimization
- if (d % p == 0) {
- d /= p;
- ret *= p - 1;
- while (d % p == 0) {
- d /= p;
- ret *= p;
- }
- }
- }
- if (d != 1) {
- ret *= d - 1; // d is a prime number
- }
- return ret;
- }
- /*
- __device__
- bool is_prime(nat a, gpu_divisor_cache cache) {
- if (a < 2) return false;
- for (nat i = 0; i < cache.nprimes; i++) {
- nat p = cache.primes[i];
- if (a == p) return true;
- if (a % p == 0) return false;
- }
- return true;
- }
- */
- __device__
- roll roll_of_powers(nat a, roll b, gpu_divisor_cache cache, bool known_prime) {
- if (a == 0) return {1, 1};
- if (a == 1) return {0, 1};
- nat flat = 0;
- nat2 a_pow_flat_ = 1;
- while (a_pow_flat_ < b.flat) {
- flat++;
- a_pow_flat_ = a_pow_flat_ * a;
- }
- nat d = b.loop / gcd(a_pow_flat_, b.loop);
- nat c = gcd(d, a);
- while (c != 1) {
- d /= c;
- c = gcd(d, c);
- flat++;
- }
- return {flat, phi(d, cache, known_prime)};
- }
- // (10 tri 10 + 23) % (roll b)
- __device__
- nat trimod(roll b, gpu_divisor_cache cache) {
- nat a[] = {10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
- if (10 == 0) return 1 % b;
- roll rolls[10];
- rolls[0] = b;
- for (size_t i = 0; i < 10-1; i++) {
- // assuming b is prime
- rolls[i+1] = roll_of_powers(a[i], rolls[i], cache, i == 0);
- }
- nat c = 1 % b;
- for (size_t i = 10-1; i != (size_t) -1; i--) {
- c = pow_mod(a[i], c, rolls[i]);
- }
- return (c + 23) % b;
- }
- const nat CORES = 2500; // keep this a divisor of n
- const nat MAXFACTORS = 20;
- __device__
- void big_sieve(nat start, nat end, gpu_divisor_cache cache, bool* out) {
- for (nat i = 0; start + i < end; i++) {
- out[i] = true;
- }
- for (nat i = 0; start + i < 2; i++) {
- out[i] = false;
- }
- for (nat k = 0; k < cache.nprimes; k++) {
- nat p = cache.primes[k];
- nat j = start / p * p;
- while (j < start or j <= p) j += p;
- for ( ; j < end; j += p) {
- out[j - start] = false;
- }
- }
- }
- __global__
- void trial(nat n, gpu_divisor_cache cache, nat* nfactors, nat* factors,
- nat* canary) {
- nat width = n / CORES;
- nat index = blockIdx.x;
- nat start = index * width;
- nat end = (index + 1) * width;
- const nat CHUNK_SIZE = 10*1000;
- bool chunk[CHUNK_SIZE];
- nat chunk_start;
- for (nat p = start; p < end; p++) {
- // update local prime cache
- if ((p - start) % CHUNK_SIZE == 0) {
- big_sieve(p, p + CHUNK_SIZE, cache, chunk);
- chunk_start = p;
- }
- // check whether p is prime
- if (!chunk[p - chunk_start]) {
- continue;
- }
- canary[index]++;
- // check whether p is a divisor
- if (trimod({0, p}, cache) == 0) {
- assert(nfactors[index] < MAXFACTORS);
- factors[index*MAXFACTORS + nfactors[index]++] = p;
- }
- // bogus code
- //factors[index*MAXFACTORS] = p;
- }
- }
- int main(void) {
- // define problem size
- nat n = 1L*1000*1000*1000;
- // cache small primes on gpu
- divisor_cache cache = make_cache(n);
- nat pc = cache.primes.size();
- nat* gpu_primes;
- cudaMalloc((void **) &gpu_primes, pc * sizeof(nat));
- cudaMemcpy(gpu_primes, cache.primes.data(), pc * sizeof(nat),
- cudaMemcpyHostToDevice);
- gpu_divisor_cache gpu_cache = {pc, gpu_primes};
- // allocate space for factors
- nat* gpu_nfactors;
- nat* gpu_factors;
- cudaMalloc((void **) &gpu_nfactors, CORES * sizeof(nat));
- cudaMemset(gpu_nfactors, 0, sizeof(nat));
- cudaMalloc((void **) &gpu_factors, CORES * MAXFACTORS * sizeof(nat));
- // allocate space for canaries (debug)
- nat* gpu_canaries;
- cudaMalloc((void **) &gpu_canaries, CORES * sizeof(nat));
- cudaMemset(gpu_canaries, 0, CORES * sizeof(nat));
- // run computation
- cout << "Entering the matrix..." << endl;
- trial<<<CORES,1>>>(n, gpu_cache, gpu_nfactors, gpu_factors, gpu_canaries);
- // read results
- nat nfactors[CORES];
- nat factors[CORES * MAXFACTORS];
- nat canaries[CORES];
- cudaMemcpy(nfactors, gpu_nfactors, CORES * sizeof(nat),
- cudaMemcpyDeviceToHost);
- cudaMemcpy(factors, gpu_factors, CORES * MAXFACTORS * sizeof(nat),
- cudaMemcpyDeviceToHost);
- cudaMemcpy(canaries, gpu_canaries, CORES * sizeof(nat),
- cudaMemcpyDeviceToHost);
- cudaFree(gpu_primes);
- cudaFree(gpu_nfactors);
- cudaFree(gpu_factors);
- cudaFree(gpu_canaries);
- cout << "factors found:\n ";
- for (nat i = 0; i < CORES; i++) {
- assert(nfactors[i] <= MAXFACTORS);
- for (nat j = 0; j < nfactors[i]; j++) {
- cout << factors[i*MAXFACTORS + j] << " ";
- }
- }
- cout << endl << endl;
- nat tot = 0;
- for (nat i = 0; i < CORES; i++) {
- tot += canaries[i];
- }
- cout << "canaries: " << tot << endl;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement