Advertisement
Guest User

[CUDA Graph] cuBLAS routine produces incorrect result after calling cudaStreamBeginCapture

a guest
Feb 11th, 2022
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.04 KB | None | 0 0
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
  6.  
  7. #define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
  8. inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
  9. {
  10.     if (code != cudaSuccess)
  11.     {
  12.         fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  13.         if (abort) exit(code);
  14.     }
  15. }
  16.  
  17. static const char *cublasErrChk(cublasStatus_t error)
  18. {
  19.     switch (error)
  20.     {
  21.         case CUBLAS_STATUS_SUCCESS:
  22.             return "CUBLAS_STATUS_SUCCESS";
  23.  
  24.         case CUBLAS_STATUS_NOT_INITIALIZED:
  25.             return "CUBLAS_STATUS_NOT_INITIALIZED";
  26.  
  27.         case CUBLAS_STATUS_ALLOC_FAILED:
  28.             return "CUBLAS_STATUS_ALLOC_FAILED";
  29.  
  30.         case CUBLAS_STATUS_INVALID_VALUE:
  31.             return "CUBLAS_STATUS_INVALID_VALUE";
  32.  
  33.         case CUBLAS_STATUS_ARCH_MISMATCH:
  34.             return "CUBLAS_STATUS_ARCH_MISMATCH";
  35.  
  36.         case CUBLAS_STATUS_MAPPING_ERROR:
  37.             return "CUBLAS_STATUS_MAPPING_ERROR";
  38.  
  39.         case CUBLAS_STATUS_EXECUTION_FAILED:
  40.             return "CUBLAS_STATUS_EXECUTION_FAILED";
  41.  
  42.         case CUBLAS_STATUS_INTERNAL_ERROR:
  43.             return "CUBLAS_STATUS_INTERNAL_ERROR";
  44.     }
  45.  
  46.     return "<unknown>";
  47. }
  48.  
  49. int main() {
  50.  
  51.     size_t dims = 4;
  52.  
  53.     double *vec, *mat, *results;
  54.  
  55.     cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
  56.     cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
  57.     cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );
  58.  
  59.     printf("Vector:\n");
  60.     for (int i = 1; i < dims + 1; i++) {
  61.         vec[i] = 0.5 * i;
  62.         printf("%.2lf ", vec[i]);
  63.     }
  64.     printf("\n\nMatrix:\n");
  65.  
  66.     for (int i = 1; i < dims * dims + 1; i++) {
  67.         mat[i] = 1.0 * i;
  68.         printf("%.2lf ", mat[i]);
  69.  
  70.         if (i % dims == 0)
  71.             printf("\n");
  72.     }
  73.     printf("\n");
  74.  
  75. // CUDA graph creation
  76.  
  77.   cublasHandle_t handle;
  78.   cublasErrChk( cublasCreate(&handle) );
  79.   cudaGraph_t gemvGraph;
  80.   cudaStream_t stream1, streamForGraph;
  81.   cudaErrChk(cudaStreamCreate(&stream1));
  82.   cublasStatus_t stat = cublasSetStream(handle, stream1);
  83.   std::cout << "stat: " << stat << std::endl;
  84.   cudaErrChk(cudaGraphCreate(&gemvGraph, 0));
  85.   cudaErrChk(cudaStreamCreate(&streamForGraph));
  86.   double alpha = 1.f, beta = 1.f;
  87.  
  88.   cudaErrChk(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
  89.     // multiply mat by vec to get results
  90.   cublasErrChk(
  91.         cublasDgemv(
  92.             handle, CUBLAS_OP_N,
  93.             dims, dims,
  94.             &alpha,
  95.             mat, dims,
  96.             vec, 1,
  97.             &beta,
  98.             results, 1
  99.         )
  100.     );
  101.  
  102.   cudaErrChk(cudaStreamEndCapture(stream1, &gemvGraph));
  103.   cudaStreamSynchronize(stream1);
  104.   cudaDeviceSynchronize();
  105.  
  106.     for (int i = 0; i < dims; i++)
  107.         printf("%.2lf ", results[i]);
  108.     printf("\n");
  109.  
  110.     cudaErrChk( cudaFree(vec) );
  111.     cudaErrChk( cudaFree(mat) );
  112.     cudaErrChk( cudaFree(results) );
  113.  
  114.     return 0;
  115. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement