// Pseudo-random number generator, C++ / CUDA


//nvcc -g ./main.cu -O3 -lpthread -lcudart -o ./main -Xcompiler -rdynamic -lineinfo && ./main
//
//Generates pseudo-random numbers. Measured on an NVIDIA GTX 1080 with an AMD 5900X: ./main | pv > /dev/zero # about 2.8 GB/s
//passes most dieharder tests
//./main | dieharder -B -g 200 -a
//
//variation of ukaelEntropy https://github.com/Kaelygon/ukaelAudio/blob/main/ukaelH/kmath.h

#include <iostream>
#include <vector>
#include <thread>
#include <fstream>
#include <time.h>

/*
 * kaelRandom: advances the generator state stored in data[0..dataCount) in place.
 *
 * Parameters:
 *   data      - state array; every element is read and overwritten.
 *   dataCount - number of 64-bit elements in data. 0 is allowed (nothing is touched).
 *
 * Launch layout: 1D grid of 1D blocks. The grid-stride loop lets any
 * <<<blocks, threads>>> configuration cover the whole array.
 * Shared memory: none.
 *
 * For each index ti the element data[ti] is mixed with its successor
 * data[ti+1] (wrapping to data[0] at the end of the array), and the successor
 * is then updated from the freshly mixed value.
 *
 * NOTE: the successor written for index ti is the primary element of index
 * ti+1, which is normally handled by a different thread, and there is no
 * synchronization between them. The exact output therefore depends on thread
 * scheduling unless the kernel is launched with a single thread.
 */
__global__
void kaelRandom(uint64_t *data, uint64_t dataCount) {
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const uint64_t index  = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t stride = (uint64_t)blockDim.x * gridDim.x;

    // 64-bit loop counter: a 32-bit one would wrap (and never terminate)
    // once dataCount reaches 2^32.
    for (uint64_t ti = index; ti < dataCount; ti += stride) {
        const uint64_t stId = ti;
        const uint64_t ndId = (ti + 1 == dataCount) ? 0 : ti + 1; // wrap so the last element pairs with the first

        data[stId] = (data[stId] >> 41) | (data[stId] << 23); // rotate right by 41 bits (64-bit rotate)
        data[stId] += data[ndId]*131 + 13238717; // multiply-and-add mix with the successor
        data[ndId] += data[stId]*129 + 13238689; // feed the mixed value back into the successor
    }
}

/*
 * seedTime: fills data[0..dataCount) with an initial state derived from the
 * element index and the current wall-clock time.
 *
 * Parameters:
 *   data      - host-accessible array of at least dataCount elements; overwritten.
 *   dataCount - number of elements to seed. 0 is allowed (nothing is written).
 *
 * time() has one-second resolution, so two calls within the same second
 * produce the same seed array.
 */
void seedTime(uint64_t *data, uint dataCount) {
    // Read the clock once: it is loop-invariant, and sampling it per element
    // could seed one array from two different seconds.
    const uint64_t timebuf = time(NULL); //entropy from time

    for (uint64_t i = 0; i < dataCount; i++) {
        uint64_t value = i * 60618691999346397ULL + 15940286172355421827ULL; // index-dependent linear term

        value ^= timebuf*i + timebuf; // mix in the timestamp, scaled by (i + 1)

        value = (value >> 41) | (value << 23); // rotate right by 41 bits (64-bit rotate)
        value += (value << 2) + 13238717; // shift and add: value*5 + constant

        data[i] = value;
    }
}

// seedZero: resets the first dataCount elements of data to 0.
// Elements at or beyond index dataCount are left untouched.
void seedZero(uint64_t *data, uint dataCount) {
    uint64_t *const end = data + dataCount;
    for (uint64_t *cursor = data; cursor != end; ++cursor) {
        *cursor = 0;
    }
}

/*
 * Entry point: streams raw 64-bit pseudo-random values to stdout until the
 * output can no longer be written or a CUDA call fails.
 *
 * Two managed buffers are used: dataArray[0] holds the generator state that
 * the kernel updates in place, dataArray[1] holds a snapshot that the host
 * writes to stdout while the next kernel is running.
 *
 * Returns 0 when the loop stops because stdout accepted a short write, and 1
 * when a CUDA call failed. Diagnostics go to stderr so they never mix with
 * the binary output stream.
 */
int main() {

    const uint numBlocks = 512;
    const uint blockSize = 512;
    const uint bufSize = 2;

    uint64_t *dataArray[bufSize] = {};
    const uint chunkSize = 2048;
    const uint dataCount = chunkSize*blockSize*2; // elements per buffer (2048*512*2 values = 16 MiB)
    const size_t dataSize = dataCount*sizeof(uint64_t);

    // Reports a failed CUDA call on stderr; returns true on success.
    auto cudaOk = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    bool ok = true;
    for (uint bi = 0; ok && bi < bufSize; bi++) {
        // Managed memory: seeded by the host, updated by the kernel.
        ok = cudaOk(cudaMallocManaged(&dataArray[bi], dataSize), "cudaMallocManaged");
    }

    if (ok) {
        // Seed on the host, then run one warm-up round so the first output
        // block is not the raw seed.
        seedTime(dataArray[0], dataCount);
        kaelRandom<<<numBlocks, blockSize>>>(dataArray[0], dataCount);
        ok = cudaOk(cudaGetLastError(), "kaelRandom launch");
    }

    bool outputOpen = true;
    while (ok && outputOpen) {
        // Blocking copy on the default stream: it runs after the previous
        // kernel has finished, so dataArray[1] receives a complete snapshot.
        ok = cudaOk(cudaMemcpy(dataArray[1], dataArray[0], dataSize, cudaMemcpyDeviceToHost), "cudaMemcpy");
        if (!ok) {
            break;
        }

        // Start the next round; the launch is asynchronous.
        kaelRandom<<<numBlocks, blockSize>>>(dataArray[0], dataCount);
        ok = cudaOk(cudaGetLastError(), "kaelRandom launch");

        if (ok) {
            // Write the snapshot as raw binary while the GPU is computing.
            // NOTE: touching managed memory from the host during a running
            // kernel needs a device with concurrent managed access.
            // A short write (closed pipe, full disk) ends the loop.
            outputOpen = fwrite(dataArray[1], sizeof(uint64_t), dataCount, stdout) == dataCount;
        }

        // Wait for the kernel; this also surfaces asynchronous execution errors.
        ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;
    }

    // Release each allocation (cudaFree accepts a null pointer).
    for (uint bi = 0; bi < bufSize; bi++) {
        cudaFree(dataArray[bi]);
    }

    std::cerr << "\nComputed: " << dataCount/2 << "\n";

    return ok ? 0 : 1;
}