Advertisement
Kaelygon

cuda cubesum

Oct 26th, 2022 (edited)
828
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 1.66 KB | Source Code | 0 0
  1. #include <iostream>
  2. #include <math.h>
  3.  
  4. __device__
  5. inline int iclz_ui128 (__uint128_t u) {
  6.     uint64_t hi = u>>64;
  7.     uint64_t lo = u;
  8.  
  9.     return  
  10.         hi ?
  11.             lo&hi ?  66-__clzll(lo) : 2                             //hi true  -> lo&hi ? 1 : 0
  12.             :
  13.             lo&hi ? 130-__clzll(hi) : 66-__clzll(lo)        //hi false -> lo&hi ? 2 : 1
  14.     ;
  15. }
  16.  
  17. //128 - (count leading zeros ui128) + 2
  18. // Kernel function to add the elements of two arrays
  19. __global__
  20. void quadCverg(__uint128_t n, __uint128_t tdtarg3)
  21. {
  22.   uint index = blockIdx.x * blockDim.x + threadIdx.x;
  23.  
  24.   uint stride = blockDim.x * gridDim.x;
  25.  
  26.     for (__uint128_t a = index; a < n; a += stride){
  27.  
  28.         __uint128_t ai3=a*a*a;
  29.         if(3*ai3>tdtarg3){break;}
  30.  
  31.         for (__uint128_t b = a+1; true; b++) {
  32.  
  33.             __uint128_t bi3=b*b*b;
  34.             if(ai3+2*bi3>tdtarg3){break;}
  35.  
  36.             __uint128_t ab3=ai3+bi3;
  37.             __uint128_t ctarg=tdtarg3-ab3; //ctarg==c*c*c
  38.  
  39.             __uint128_t r0 = 1<<( iclz_ui128(ctarg) / 3);//1<<ceil((128-CLZ) / 3)
  40.             __uint128_t c;
  41.             do{
  42.                 c = r0;
  43.                 r0 = (2*c + ctarg/(c*c))/3 ;
  44.             }
  45.             while (r0 < c);
  46.  
  47.             if(ab3+c*c*c==tdtarg3){
  48.                 printf(
  49.                     "a:%lld b:%lld c:%lld\n\n",
  50.                     (__uint64_t)a,(__uint64_t)b,(__uint64_t)c
  51.                 );
  52.             }
  53.         }
  54.     }
  55. }
  56.  
  57. int main(void)
  58. {
  59.     __uint128_t targ = 131071;
  60.     __uint128_t targ3 = targ*targ*targ;
  61.     // Run kernel on 1M elements on the GPU
  62.     __uint128_t tasks=targ*(__float128)0.693361274350634659846548402128973976+1; //max A = t * 3^(2/3)/3
  63.     __uint64_t blockSize = 1024;
  64.     __uint64_t numBlocks = (tasks + blockSize - 1) / blockSize;
  65.     quadCverg<<<numBlocks,blockSize>>>(tasks, targ3);
  66.  
  67.     // Wait for GPU to finish before accessing on host
  68.     cudaDeviceSynchronize();
  69.    
  70.     return 0;
  71. }
  72. //nvprof ./cubesum
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement