roll.cu (JoelSjogren, Feb 22nd, 2020)
/* roll.cu */

#include <cstdint>
#include <vector>
#include <map>
#include <iostream>
#include <cmath>
#include <cassert>

using namespace std;

typedef uint64_t nat;

#include "../cuda_uint128.h"
typedef uint128_t nat2;
//typedef __uint128_t nat2;

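// A roll describes an eventually periodic residue system: values below
// `flat` are kept exactly, and everything from `flat` upward repeats with
// period `loop`.  This is the shape needed to reduce exponents in a power
// tower, where small exponents matter exactly and large ones only matter
// modulo the period.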
struct roll {
  nat flat;
  nat loop;
};

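// Reduce a 128-bit value modulo a 64-bit one; div128to64 comes from
// cuda_uint128.h and delivers the remainder through its last argument.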
__host__ __device__
nat operator%(nat2 a, nat b) {
  nat res;
  div128to64(a, b, &res);
  return res;
}

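// Reduce a value by a roll: small values pass through unchanged, larger
// ones are folded into the periodic part [flat, flat + loop).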
__host__ __device__
nat operator%(nat2 a, roll b) {
  return a < b.flat ? a.lo : (a - b.flat) % b.loop + b.flat;
}

struct divisor_cache {
  vector<nat> primes;
  //  vector<nat> phi;
};

struct gpu_divisor_cache {
  const nat nprimes;
  const nat* primes;
};

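// Host-side sieve of Eratosthenes collecting every prime up to sqrt(n);
// the device code only ever needs to trial-divide by primes of that size.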
__host__
divisor_cache make_cache(nat n) {
  const nat sn = sqrt(n) + 1;
  vector<bool> old(sn);  // sieve of eratosthenes
  vector<nat> primes;
  for (size_t i = 2; i < sn; i++) {
    if (not old[i]) {
      primes.push_back(i);
      for (size_t j = i; j < sn; j += i) {
        old[j] = true;
      }
    }
  }
  return {primes};
}

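// Modular exponentiation by repeated squaring, with every intermediate
// product widened to 128 bits before being reduced by the roll c.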
__device__
nat pow_mod(nat a, nat b, roll c) {
  nat d = 1;
  while (b) {
    if (b % 2) d = ((nat2) a * d) % c;
    a = ((nat2) a * a) % c;
    b /= 2;
  }
  return d;
}

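// Euclid's algorithm; the 128-bit first argument is reduced modulo b up
// front so the rest of the loop runs in 64 bits.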
__device__
nat gcd(nat2 a_, nat b) {
  if (b == 0) return a_.lo;  // if a_ is too big, then you asked for it...
  nat a = (nat) (a_ % b);
  while (b) {
    nat c = a % b;
    a = b;
    b = c;
  }
  return a;
}

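// Euler's totient of d by trial division against the cached primes.  A
// cofactor surviving the loop must itself be prime (the loop stops once
// p * p > d), and a d already known to be prime is handled directly.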
__device__
nat phi(nat d, gpu_divisor_cache cache, bool known_prime) {
  assert(d != 0);
  if (d == 1) return 1;
  if (known_prime) return d - 1;  // optimization
  nat ret = 1;
  for (nat i = 0; i < cache.nprimes; i++) {
    nat p = cache.primes[i];
    if (p * p > d) break;  // optimization
    if (d % p == 0) {
      d /= p;
      ret *= p - 1;
      while (d % p == 0) {
        d /= p;
        ret *= p;
      }
    }
  }
  if (d != 1) {
    ret *= d - 1;  // d is a prime number
  }
  return ret;
}

/*
__device__
bool is_prime(nat a, gpu_divisor_cache cache) {
  if (a < 2) return false;
  for (nat i = 0; i < cache.nprimes; i++) {
    nat p = cache.primes[i];
    if (a == p) return true;
    if (a % p == 0) return false;
  }
  return true;
}
*/

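// Given a base a and the roll b one level down in the tower, build the roll
// that the exponent of a has to respect: `flat` counts the initial exponents
// that must be taken literally, and the periodic part has length phi(d),
// where d is what remains of b.loop after stripping every factor it shares
// with a.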
__device__
roll roll_of_powers(nat a, roll b, gpu_divisor_cache cache, bool known_prime) {
  if (a == 0) return {1, 1};
  if (a == 1) return {0, 1};
  nat flat = 0;
  nat2 a_pow_flat_ = 1;
  while (a_pow_flat_ < b.flat) {
    flat++;
    a_pow_flat_ = a_pow_flat_ * a;
  }
  nat d = b.loop / gcd(a_pow_flat_, b.loop);
  nat c = gcd(d, a);
  while (c != 1) {
    d /= c;
    c = gcd(d, c);
    flat++;
  }
  return {flat, phi(d, cache, known_prime)};
}

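// Evaluate the power tower of ten 10s, plus 23, modulo the roll b.
// rolls[i] describes which exponents matter at level i of the tower, which
// is then collapsed from the top down with pow_mod.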
// (10 tri 10 + 23) % (roll b)
__device__
nat trimod(roll b, gpu_divisor_cache cache) {
  const size_t height = 10;  // number of levels in the tower
  nat a[height] = {10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
  if (height == 0) return 1 % b;  // degenerate empty tower
  roll rolls[height];
  rolls[0] = b;
  for (size_t i = 0; i < height-1; i++) {
    // assuming b is prime
    rolls[i+1] = roll_of_powers(a[i], rolls[i], cache, i == 0);
  }
  nat c = 1 % b;
  for (size_t i = height-1; i != (size_t) -1; i--) {
    c = pow_mod(a[i], c, rolls[i]);
  }
  return (c + 23) % b;
}

const nat CORES = 2500;  // keep this a divisor of n
const nat MAXFACTORS = 20;

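// Segmented sieve: record the primality of the window [start, end) in out[],
// crossing off multiples of every cached prime (but never the prime itself).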
__device__
void big_sieve(nat start, nat end, gpu_divisor_cache cache, bool* out) {
  for (nat i = 0; start + i < end; i++) {
    out[i] = true;
  }
  for (nat i = 0; start + i < 2; i++) {
    out[i] = false;
  }
  for (nat k = 0; k < cache.nprimes; k++) {
    nat p = cache.primes[k];
    nat j = start / p * p;
    while (j < start or j <= p) j += p;
    for ( ; j < end; j += p) {
      out[j - start] = false;
    }
  }
}

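// One block per contiguous range of width n / CORES: sieve the range in
// chunks and, for every prime p found, test whether p divides 10 tri 10 + 23.
// canary[index] counts the primes each block examined (debug aid).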
__global__
void trial(nat n, gpu_divisor_cache cache, nat* nfactors, nat* factors,
           nat* canary) {

  nat width = n / CORES;
  nat index = blockIdx.x;

  nat start = index * width;
  nat end = (index + 1) * width;

  const nat CHUNK_SIZE = 10*1000;
  bool chunk[CHUNK_SIZE];
  nat chunk_start;

  for (nat p = start; p < end; p++) {
    // update local prime cache
    if ((p - start) % CHUNK_SIZE == 0) {
      big_sieve(p, p + CHUNK_SIZE, cache, chunk);
      chunk_start = p;
    }

    // check whether p is prime
    if (!chunk[p - chunk_start]) {
      continue;
    }
    canary[index]++;

    // check whether p is a divisor
    if (trimod({0, p}, cache) == 0) {
      assert(nfactors[index] < MAXFACTORS);
      factors[index*MAXFACTORS + nfactors[index]++] = p;
    }

    // bogus code
    //factors[index*MAXFACTORS] = p;
  }

}

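// Host driver: build the prime cache, copy it to the device, launch one
// block per range, then collect and print any factors found.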
int main(void) {
  // define problem size
  nat n = 1L*1000*1000*1000;

  // cache small primes on gpu
  divisor_cache cache = make_cache(n);
  nat pc = cache.primes.size();
  nat* gpu_primes;
  cudaMalloc((void **) &gpu_primes, pc * sizeof(nat));
  cudaMemcpy(gpu_primes, cache.primes.data(), pc * sizeof(nat),
             cudaMemcpyHostToDevice);
  gpu_divisor_cache gpu_cache = {pc, gpu_primes};

  // allocate space for factors
  nat* gpu_nfactors;
  nat* gpu_factors;
  cudaMalloc((void **) &gpu_nfactors, CORES * sizeof(nat));
  cudaMemset(gpu_nfactors, 0, CORES * sizeof(nat));  // zero every counter, not just the first
  cudaMalloc((void **) &gpu_factors, CORES * MAXFACTORS * sizeof(nat));

  // allocate space for canaries (debug)
  nat* gpu_canaries;
  cudaMalloc((void **) &gpu_canaries, CORES * sizeof(nat));
  cudaMemset(gpu_canaries, 0, CORES * sizeof(nat));

  // run computation
  cout << "Entering the matrix..." << endl;
  trial<<<CORES,1>>>(n, gpu_cache, gpu_nfactors, gpu_factors, gpu_canaries);

  // read results (cudaMemcpy on the default stream waits for the kernel)
  nat nfactors[CORES];
  nat factors[CORES * MAXFACTORS];
  nat canaries[CORES];
  cudaMemcpy(nfactors, gpu_nfactors, CORES * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(factors, gpu_factors, CORES * MAXFACTORS * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(canaries, gpu_canaries, CORES * sizeof(nat),
             cudaMemcpyDeviceToHost);
  cudaFree(gpu_primes);
  cudaFree(gpu_nfactors);
  cudaFree(gpu_factors);
  cudaFree(gpu_canaries);

  cout << "factors found:\n  ";
  for (nat i = 0; i < CORES; i++) {
    assert(nfactors[i] <= MAXFACTORS);
    for (nat j = 0; j < nfactors[i]; j++) {
      cout << factors[i*MAXFACTORS + j] << " ";
    }
  }
  cout << endl << endl;

  nat tot = 0;
  for (nat i = 0; i < CORES; i++) {
    tot += canaries[i];
  }
  cout << "canaries: " << tot << endl;

  return 0;
}