Jakowlew

minv

Dec 5th, 2020
873
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #pragma once
  2.  
  3. #include <cublas_v2.h>
  4. #include <type_traits>
  5.  
  6. #include "common.cuh"
  7.  
// Inverts the square `order` x `order` matrix `a` (row-major, host memory) into
// the caller-allocated host buffer `b`.  For floating-point T with a cuBLAS
// handle present in `params`, dispatches to the cuBLAS path; otherwise to the
// generic GPU Gauss-Jordan path.  Returns true on success.
template <typename T>
bool minv(T* a, T* b, int order, const launch_params& params);
  10.  
namespace detail
{
    // Inversion via cuBLAS (floating-point types only).
    // NOTE(review): the definition below is currently a stub that always
    // returns false.
    template <typename T>
    bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream = 0);

    // Inversion via hand-rolled GPU Gauss-Jordan elimination.
    // `threadsPerBlock` sizes the 1-D row kernels.
    // NOTE(review): the definition below is incomplete (elimination step missing).
    template <typename T>
    bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream = 0);
}
  19.  
namespace detail::kernel
{
    // Fills A (order x order, row-major) with the identity matrix.
    // Expects a 2-D launch covering the matrix.
    template <typename T>
    __global__ void get_idm(T* A, int order);

    // Scans column `col` for a row below the diagonal whose entry is non-zero
    // and writes its index into *idx.  NOTE(review): concurrent qualifying
    // threads race on *idx, so the surviving value is "a" non-null row, not
    // necessarily the first; *idx is untouched when no such row exists.
    template <typename T>
    __global__ void get_first_not_null_row(T* A, int* idx, int col, int order);

    // Swaps rows row1 and row2 of A (row-major); 1-D launch over columns.
    template <typename T>
    __global__ void swap_rows(T* A, int row1, int row2, int order);

    // Gauss-Jordan elimination step.  NOTE(review): defined below as an empty stub.
    template <typename T>
    __global__ void subtract_rows(T* A, int col, int order);
}
  34.  
  35. template <typename T>
  36. bool minv(T* a, T* b, int order, const launch_params& params)
  37. {
  38.     if constexpr (std::is_floating_point_v<T>)
  39.     {
  40.         if (nullptr != params.cublasHandle)
  41.         {
  42.             return detail::minv_cublas(a, b, order, params.cublasHandle, params.stream);
  43.         }
  44.         else
  45.         {
  46.             return detail::minv_generic(a, b, order, params.threadsPerBlock, params.stream);
  47.         }
  48.     }
  49.     else
  50.     {
  51.         return detail::minv_generic(a, b, order, params.threadsPerBlock, params.stream);
  52.     }
  53. }
  54.  
  55. namespace detail
  56. {
  57.     template <typename T>
  58.     bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream)
  59.     {
  60.         return false;
  61.     }
  62.  
  63.     template <typename T>
  64.     bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream)
  65.     {
  66.         constexpr int bs = 32;
  67.         int order_compl = order + bs - (order % bs);
  68.         T *d_a, *d_id;
  69.  
  70.         cudaMallocHost(&d_a, order_compl * order_compl * sizeof(T));
  71.         cudaMallocHost(&d_id, order_compl * order_compl * sizeof(T));
  72.  
  73.         cudaMemcpyAsync(d_a, a, order * order * sizeof(T), cudaMemcpyHostToDevice, stream);
  74.  
  75.         dim3 threads(bs, bs);
  76.         dim3 grid(order_compl / threads.x, order_compl / threads.y);
  77.         kernel::get_idm<<<grid, threads, 0, stream>>>(d_id, order_compl);
  78.  
  79.         for (int i = 0; i < order; ++i)
  80.         {
  81.             // find first not-null row
  82.             int h_not_null_row;
  83.             int* d_not_null_row;
  84.             cudaMalloc(&d_not_null_row, sizeof(int));
  85.  
  86.             threads = dim3(threadsPerBlock, 1);
  87.             grid = dim3((order + threadsPerBlock - 1) / threadsPerBlock , 1);
  88.             kernel::get_first_not_null_row<<<grid, threads, 0, stream>>>(d_a, d_not_null_row, i, order);
  89.            
  90.             cudaMemcpy(&h_not_null_row, d_not_null_row, sizeof(int), cudaMemcpyDeviceToHost);
  91.             cudaDeviceSynchronize();
  92.  
  93.             // swap with current line
  94.             printf("Swap row %d with row %d (col=%d)\n", i, h_not_null_row, i);
  95.            
  96.             threads = dim3(threadsPerBlock, 1);
  97.             grid = dim3((order + threadsPerBlock - 1) / threadsPerBlock , 1);
  98.             kernel::swap_rows<<<grid, threads, 0, stream>>>(d_a, i, h_not_null_row, order);
  99.            
  100.             cudaDeviceSynchronize();
  101.             print_matrix1(d_a, order, order);
  102.            
  103.             // nullify below and above
  104.            
  105.             // multiply row with a^(-1)
  106.         }
  107.  
  108.         cudaMemcpy2DAsync(b, order * sizeof(T), d_id, order_compl * sizeof(T), order * sizeof(T), order, cudaMemcpyDeviceToHost);
  109.         return true;
  110.     }
  111. }
  112.  
  113. namespace detail::kernel
  114. {
  115.     template <typename T>
  116.     __global__ void get_idm(T* A, int order)
  117.     {
  118.         int row = blockIdx.y * blockDim.y + threadIdx.y;
  119.         int col = blockIdx.x * blockDim.x + threadIdx.x;
  120.  
  121.         A[row * order + col] = row == col ? Id_v<T> : Zero_v<T>;
  122.     }
  123.  
  124.     template <typename T>
  125.     __global__ void get_first_not_null_row(T* A, int* idx, int col, int order)
  126.     {
  127.         int i = blockIdx.x * blockDim.x + threadIdx.x;
  128.         if (i >= order || i <= col) return;
  129.  
  130.         constexpr T zero = Zero_v<T>;
  131.         if (A[i * order + col] != zero)
  132.         {
  133.             *idx = i;
  134.         }
  135.     }
  136.  
  137.     template <typename T>
  138.     __global__ void swap_rows(T* A, int row1, int row2, int order)
  139.     {
  140.         int i = blockIdx.x * blockDim.x + threadIdx.x;
  141.         if (i >= order) return;
  142.  
  143.         T tmp = A[row1 * order + i];
  144.         A[row1 * order + i] = A[row2 * order + i];
  145.         A[row2 * order + i] = tmp;
  146.     }
  147.  
  148.     template <typename T>
  149.     __global__ void subtract_rows(T* A, int col, int order)
  150.     {
  151.         int idx = blockIdx.x * blockDim.x + threadIdx.x;
  152.         int idy = blockIdx.y * blockDim.y + threadIdx.y;
  153.         int offsetx = gridDim.x * blockDim.x;
  154.         int offsety = gridDim.y * blockDim.y;
  155.  
  156.        
  157.     }
  158. }
  159.  
RAW Paste Data