Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <cuda_fp16.h>
// Atomically performs *a += b on a half-precision accumulator.
//
//   a : device pointer to the __half accumulator (2-byte aligned).
//   b : addend; it is rounded to half precision before being added, on every
//       architecture, so both code paths below produce the same result.
//
// Compute capability 7.0+ has a native atomicAdd(__half*, __half). Older
// targets emulate it with a compare-and-swap loop on the aligned 32-bit word
// that contains *a; that whole word must therefore be addressable.
// NOTE(review): buffers from cudaMalloc are presumably aligned and padded well
// beyond 4 bytes, which would make this safe for them - confirm for
// sub-allocated or packed buffers.
__device__ void atomic_add(__half* a, float b) {
    const __half addend = __float2half(b);
#if __CUDA_ARCH__ >= 700  // compute capability 7.0+: native fp16 atomicAdd
    atomicAdd(a, addend);
#else
    // Locate the 4-byte-aligned word holding *a and which 16-bit lane it is.
    const size_t address = reinterpret_cast<size_t>(a);
    const bool upper_lane = (address & 2) != 0;  // little-endian: +2 bytes -> bits 16..31
    unsigned int* const word =
        reinterpret_cast<unsigned int*>(address - (address & 2));

    unsigned int observed = *word;
    unsigned int expected;
    do {
        expected = observed;

        // Extract the current half value from its lane.
        const unsigned short current_bits = static_cast<unsigned short>(
            upper_lane ? (expected >> 16) : (expected & 0xffffu));

        // Add in float and round once to half; float carries enough precision
        // that this equals a correctly rounded half-precision addition.
        const float sum =
            __half2float(__ushort_as_half(current_bits)) + __half2float(addend);
        const unsigned int sum_bits = __half_as_ushort(__float2half(sum));

        // Splice the new half back in, leaving the neighbouring lane untouched.
        const unsigned int desired =
            upper_lane ? ((expected & 0x0000ffffu) | (sum_bits << 16))
                       : ((expected & 0xffff0000u) | sum_bits);

        // Retry if another thread modified the word in the meantime.
        observed = atomicCAS(word, expected, desired);
    } while (observed != expected);
#endif
}
// Kernel: every launched thread reads the single float at `value` and
// atomically accumulates it into the half-precision cell at `result`.
// No thread indexing is used, so all threads target the same address.
__global__ void example(float* value, __half* result) {
    const float addend = *value;
    atomic_add(result, addend);
}
// Host driver: uploads an addend (1.0f) and a zeroed half accumulator, launches
// `example` with one block of three threads, then downloads and prints the
// accumulator. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // Host copies of the operands and their device counterparts.
    float hvalue{1.f}, *dvalue = nullptr;
    __half hresult{0.f}, *dresult = nullptr;

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    };

    // Allocate device memory and copy the inputs over; && short-circuits so
    // nothing runs after the first failure.
    bool success =
        ok(cudaMalloc(&dvalue, sizeof(float)), "cudaMalloc(dvalue)") &&
        ok(cudaMalloc(&dresult, sizeof(__half)), "cudaMalloc(dresult)") &&
        ok(cudaMemcpy(dvalue, &hvalue, sizeof(float), cudaMemcpyHostToDevice),
           "cudaMemcpy(dvalue)") &&
        ok(cudaMemcpy(dresult, &hresult, sizeof(__half), cudaMemcpyHostToDevice),
           "cudaMemcpy(dresult)");

    if (success) {
        // Call kernel
        example<<<1,3>>>(dvalue, dresult);

        // A launch returns no status itself: configuration errors are read via
        // cudaGetLastError(), and faults raised while the kernel runs surface
        // at the blocking device-to-host copy that follows.
        success =
            ok(cudaGetLastError(), "example kernel launch") &&
            ok(cudaMemcpy(&hresult, dresult, sizeof(__half), cudaMemcpyDeviceToHost),
               "cudaMemcpy(hresult)");
    }

    if (success) {
        // Print result
        printf("%f\n", __half2float(hresult));
    }

    // Release device memory on every path; cudaFree(nullptr) is a no-op.
    // Return codes are intentionally ignored here: any earlier error has
    // already been reported and determines the exit status.
    cudaFree(dresult);
    cudaFree(dvalue);

    return success ? 0 : 1;
}
- // FAILURE: nvcc -gencode=arch=compute_52,code=compute_52 -o example example.cu; ./example
- // SUCCESS: nvcc -gencode=arch=compute_70,code=compute_70 -o example example.cu; ./example
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement