Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <math.h>
/**
 * Biased bit length of a 128-bit unsigned integer.
 *
 * Returns 128 - (count of leading zero bits in u) + 2, i.e. bit_length(u) + 2.
 * The +2 bias lets a caller obtain ceil(bit_length / 3) with a plain integer
 * division by 3.
 *
 * @param u  value to measure
 * @return   2 for u == 0, otherwise (index of the highest set bit) + 3;
 *           the maximum is 130 when bit 127 is set
 *
 * Callable from host and device so the result can be checked on the CPU.
 */
__host__ __device__
inline int iclz_ui128(__uint128_t u) {
    const uint64_t hi = (uint64_t)(u >> 64);
    const uint64_t lo = (uint64_t)u;
    int leadingZeros;
#ifdef __CUDA_ARCH__
    // __clzll(0) is defined and yields 64, so lo == 0 needs no special case.
    leadingZeros = hi ? __clzll((long long)hi)
                      : 64 + __clzll((long long)lo);
#else
    // __builtin_clzll(0) is undefined, so the all-zero input is handled explicitly.
    leadingZeros = hi ? __builtin_clzll(hi)
                      : (lo ? 64 + __builtin_clzll(lo) : 128);
#endif
    // Only the upper word matters when it is non-zero; the lower word is
    // consulted solely when the upper word is empty.
    return 130 - leadingZeros;
}
/**
 * Searches for integer triples a < b <= c with a^3 + b^3 + c^3 == tdtarg3
 * and prints every triple found with device printf.
 *
 * Each thread takes values of `a` starting at its global thread index and
 * advancing by the total number of launched threads, so any 1-D launch
 * configuration covers the whole range [0, n).
 *
 * @param n        exclusive upper bound on the values of `a` to try
 * @param tdtarg3  target sum of the three cubes
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
 * Printed values are truncated to 64 bits.
 */
__global__
void quadCverg(__uint128_t n, __uint128_t tdtarg3)
{
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32 bits and wraps once the grid holds more than 2^32 threads.
    const unsigned long long index =
        (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long long stride =
        (unsigned long long)blockDim.x * gridDim.x;

    for (__uint128_t a = index; a < n; a += stride) {
        const __uint128_t ai3 = a * a * a;
        // With a < b <= c the sum is at least 3*a^3; larger a cannot match.
        if (3 * ai3 > tdtarg3) { break; }

        for (__uint128_t b = a + 1; true; b++) {
            const __uint128_t bi3 = b * b * b;
            // c >= b, so the smallest reachable sum for this b is a^3 + 2*b^3.
            if (ai3 + 2 * bi3 > tdtarg3) { break; }

            const __uint128_t ab3 = ai3 + bi3;
            const __uint128_t ctarg = tdtarg3 - ab3;  // must equal c*c*c for a hit

            // Starting guess 2^ceil(bit_length(ctarg) / 3); iclz_ui128 is
            // expected to return bit_length + 2. The literal must be 128-bit:
            // the shift count can exceed 31, which overflows a plain int.
            __uint128_t r0 = (__uint128_t)1 << (iclz_ui128(ctarg) / 3);
            __uint128_t c;
            // Integer Newton iteration for the cube root; stops as soon as
            // the estimate no longer decreases.
            do {
                c = r0;
                r0 = (2 * c + ctarg / (c * c)) / 3;
            } while (r0 < c);

            if (ab3 + c * c * c == tdtarg3) {
                // %llu matches the unsigned 64-bit arguments.
                printf(
                    "a:%llu b:%llu c:%llu\n\n",
                    (unsigned long long)a,
                    (unsigned long long)b,
                    (unsigned long long)c
                );
            }
        }
    }
}
/**
 * Launches quadCverg for the target 131071 and reports any CUDA failure.
 *
 * @return 0 on success, 1 if the kernel launch or its execution failed
 */
int main(void)
{
    const __uint128_t targ = 131071;
    const __uint128_t targ3 = targ * targ * targ;

    // Upper bound on the smallest term: max a = t * 3^(2/3)/3.
    // The literal is parsed as a double before the cast; the +1 pads the
    // truncated product.
    const __uint128_t tasks = targ*(__float128)0.693361274350634659846548402128973976+1;

    const __uint64_t blockSize = 1024;
    __uint64_t numBlocks = (tasks + blockSize - 1) / blockSize;

    // The grid dimension is a 32-bit value, so an oversized block count would
    // be silently truncated. Cap it so blocks * threads also fits in 32 bits;
    // the kernel's stride loop covers the remaining values of a.
    const __uint64_t maxBlocks = 0xFFFFFFFFull / blockSize;
    if (numBlocks > maxBlocks) {
        numBlocks = maxBlocks;
    }

    quadCverg<<<(unsigned int)numBlocks, (unsigned int)blockSize>>>(tasks, targ3);

    // A launch does not return a status; configuration errors show up here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "quadCverg launch failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Wait for the GPU to finish; errors raised while the kernel ran are
    // reported by this call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "quadCverg execution failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}
// Profile with: nvprof ./cubesum
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement