// Untitled

#include <cstdlib>
#include <exception>
#include <iostream>
#include <type_traits>
#include <utility>
#include <cuda_runtime.h>
using namespace std;

/*
 * Fake destructor for C-like resources, to cope with exceptions. E.g.:
 * void* array = malloc(123);
 * destructor([=]{ free(array); });
 */

/*
 * Declares a local guard object named destructor_<line number> that is
 * initialised from make_destructor_helper(f); f is invoked when that object
 * is destroyed. Because the object name is derived from __LINE__, two uses
 * on the same source line would declare the same name twice.
 * The macro takes a single argument, so a callable whose text contains a
 * comma outside parentheses (e.g. a capture list [a, b]) has to be wrapped
 * in an extra pair of parentheses.
 */
#define destructor(f) destructor_helper_macro_1(f, __LINE__)

/*
 * Scope guard: owns a callable and invokes it exactly once, when the guard
 * that currently holds the pending call is destroyed.
 *
 * The guard is move-only. Moving transfers the pending call to the new
 * object and disarms the source, so a copy that the compiler does not elide
 * (e.g. returning the guard by value before C++17) cannot run the cleanup
 * more than once.
 *
 * f() is called from a destructor: if it throws, std::terminate is called.
 */
template<typename F> struct destructor_helper{
    F f;            // the cleanup action
    bool armed;     // true while this object is responsible for calling f

    // Not explicit, so both destructor_helper<F>{fn} and = {fn} keep working.
    destructor_helper(F fn): f(std::forward<F>(fn)), armed(true) {}

    // Take over the pending call from other and disarm it.
    destructor_helper(destructor_helper&& other): f(std::forward<F>(other.f)), armed(other.armed) {
        other.armed = false;
    }

    destructor_helper(const destructor_helper&) = delete;
    destructor_helper& operator=(const destructor_helper&) = delete;

    ~destructor_helper(){ if(armed) f(); }
};

/*
 * Builds a guard around any callable.
 * The callable is stored by value (decayed type): rvalues are moved in,
 * lvalues are copied and left untouched, plain functions are stored as
 * function pointers.
 */
template<typename F> destructor_helper<typename std::decay<F>::type> make_destructor_helper(F&& f){
    return destructor_helper<typename std::decay<F>::type>{std::forward<F>(f)};
}
/*
 * Implementation detail of the destructor(f) macro.
 * destructor_helper_macro_1 exists only to add one level of macro expansion,
 * so that an argument such as __LINE__ is replaced by its value before
 * destructor_helper_macro_2 pastes it onto the identifier "destructor_".
 */
#define destructor_helper_macro_2(f, l) auto destructor_ ## l = make_destructor_helper(f)
#define destructor_helper_macro_1(f, l) destructor_helper_macro_2(f, l)

/*
 * Debug checkpoint.
 * cudacheckpoint_(file, line) calls cudaDeviceSynchronize(), passes its
 * result through assertcu (which throws cuda_error on a non-success code),
 * and then writes "cudacheckpoint <file>:<line>" to std::cerr.
 * cudacheckpoint is the same with __FILE__ and __LINE__ filled in.
 * Note: the expansion is a braced block, not a single expression statement.
 */
#define cudacheckpoint_(file, line) { cudaDeviceSynchronize() assertcu; std::cerr<<"cudacheckpoint "<<file<<":"<<line<<std::endl; }
#define cudacheckpoint cudacheckpoint_(__FILE__, __LINE__)

/*
 * Translation of CUDA errors into exceptions.
 * Append assertcu (or assertcufft) to a CUDA (or cuFFT) call, e.g. cudaMemcpy(...) assertcu;
 */

/*
 * Postfix markers: "call assertcu" expands to "call >> assertcu_helper()",
 * which selects the operator>> overload that checks the returned status.
 * assertcufft expands the same way with assertcufft_helper.
 * NOTE(review): no assertcufft_helper is defined in the visible part of this
 * file - confirm it is provided elsewhere before using assertcufft.
 */
#define assertcufft >>assertcufft_helper()
#define assertcu >>assertcu_helper()

#include <stdexcept>
#include <string>

/*
 * Exception thrown for a failed CUDA runtime call.
 * what() returns the text produced by cudaGetErrorString for the error code.
 */
struct cuda_error : std::runtime_error{
    cuda_error(cudaError code)
        : std::runtime_error{cudaGetErrorString(code)}
    {}

protected:
    // Builds the exception from an already formatted message;
    // only reachable from derived types.
    cuda_error(std::string&& message)
        : std::runtime_error{std::move(message)}
    {}
};

/*
 * Tag type that appears on the right-hand side of the assertcu macro.
 */
struct assertcu_helper{};

/*
 * Status check behind assertcu: throws cuda_error when ret is anything other
 * than cudaSuccess, otherwise yields cudaSuccess (as an int).
 */
inline int operator>>(cudaError ret, assertcu_helper&&){
    if(ret != cudaSuccess){
        throw cuda_error(ret);
    }
    return cudaSuccess;
}

/*
 * Copies sizeof(T) bytes from the symbol on_device into a host-side T and
 * returns it.
 * The copy is enqueued on `stream` with cudaMemcpyFromSymbolAsync and the
 * function then waits on that stream with cudaStreamSynchronize, so the
 * returned value is complete when the call returns.
 * Both CUDA calls are checked with assertcu and throw cuda_error on failure.
 * T must be default-constructible.
 */
template<typename T> T get_device_object(const T& on_device, cudaStream_t stream){
    T host_copy;
    void* const destination = (void*)&host_copy;
    const void* const symbol = (const void*)&on_device;

    cudaMemcpyFromSymbolAsync(destination, symbol, sizeof(T), 0, cudaMemcpyDeviceToHost, stream) assertcu;
    cudaStreamSynchronize(stream) assertcu;

    return host_copy;
}


// Threads per block used by the kernel launches in main(); it is also the
// divisor that determines how many blocks are launched.
#define block_size 256

/*
 * Writes 42 into one element of ints per thread.
 * The element index is blockIdx.x * blockDim.x + threadIdx.x, computed in
 * size_t so it does not wrap at 32 bits.
 * There is no bounds check: the launch must not contain more threads than
 * ints has elements.
 */
__global__ void fillmem(int* ints){
    const size_t first_in_block = size_t(blockIdx.x) * blockDim.x;
    int* const slot = ints + (first_in_block + threadIdx.x);
    *slot = 42;
}

// Device-side result flag: starts out true and is cleared by validate()
// when a thread finds an unexpected value.
__device__ bool ok = true;

/*
 * Checks one element of ints per thread (index blockIdx.x * blockDim.x +
 * threadIdx.x, computed in size_t) and sets the device flag `ok` to false
 * if that element is not 42. The flag is never set back to true here.
 * There is no bounds check: the launch must not contain more threads than
 * ints has elements.
 */
__global__ void validate(const int* ints){
    const size_t first_in_block = size_t(blockIdx.x) * blockDim.x;
    const int seen = ints[first_in_block + threadIdx.x];
    if(seen == 42) return;
    ok = false;
}

/*
 * Allocates (2 GiB / 4 + offset) ints on device 0, zeroes them, fills every
 * element with 42 via fillmem, checks every element via validate, and prints
 * "<offset> ok" or "<offset> failed" to std::cout.
 *
 * offset defaults to 2 and is taken from argv[1] when exactly one argument
 * is given.
 *
 * Returns 0 in both the "ok" and the "failed" case.
 * Throws cuda_error when a CUDA call fails and std::runtime_error when the
 * device has compute capability below 3.0.
 */
static int run_fill_and_validate(int argc, char *argv[]){
    int offset = 2;
    if (argc == 2) {
        offset = std::atoi(argv[1]);
    }
    const unsigned int total_ints = (((1024ULL * 1024ULL * 1024ULL * 2) / 4)+offset);
    // Byte count in size_t so it cannot wrap at 32 bits.
    const size_t total_bytes = size_t(total_ints) * sizeof(int);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0) assertcu;
    if(deviceProp.major < 3) throw runtime_error("required sm >= 3.0");     //for large grid size in x dimension

    // Each cleanup guard is registered only after its resource was acquired.
    // Guards run in reverse order: device sync, then free, then stream destroy.
    cudaStream_t stream = 0;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) assertcu;
    destructor([&]{ cudaStreamDestroy(stream); });

    int* ints = 0;
    cudaMalloc(&ints, total_bytes) assertcu;
    destructor([&]{ cudaFree(ints); });

    destructor(cudaDeviceSynchronize);

    cudaMemsetAsync(ints, 0, total_bytes, stream) assertcu;
    cudaDeviceSynchronize() assertcu;

    // The kernels have no bounds check, so the array is covered by a launch
    // of whole blocks plus, when total_ints is not a multiple of block_size,
    // a single smaller block that starts right after the last whole block.
    const unsigned int full_blocks = total_ints / block_size;
    const unsigned int tail_threads = total_ints % block_size;
    int* const tail = ints + size_t(full_blocks) * block_size;

    if(full_blocks != 0){      // a grid of 0 blocks is not a valid launch
        fillmem<<<full_blocks, block_size, 0, stream>>>(ints);
        cudaGetLastError() assertcu;
    }
    if(tail_threads != 0){
        fillmem<<<1, tail_threads, 0, stream>>>(tail);
        cudaGetLastError() assertcu;
    }

    if(full_blocks != 0){
        validate<<<full_blocks, block_size, 0, stream>>>(ints);
        cudaGetLastError() assertcu;
    }
    if(tail_threads != 0){
        validate<<<1, tail_threads, 0, stream>>>(tail);
        cudaGetLastError() assertcu;
    }

    // get_device_object waits on the stream, so all launches above have
    // finished (or reported their error) before the flag is read.
    if(!get_device_object(ok, stream)) {
        std::cout << offset << " failed\n";
        //throw logic_error("validation failed");
    } else {
        std::cout << offset << " ok\n";
    }
    return 0;
}

/*
 * Entry point. Runs the test inside a try block so that an exception unwinds
 * the stack (running the cleanup guards) and is reported on std::cerr with a
 * non-zero exit status, instead of escaping main.
 */
int main(int argc, char *argv[]){
    try{
        return run_fill_and_validate(argc, argv);
    }catch(const std::exception& e){
        std::cerr << "error: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }
}