Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #pragma once
- #include <cublas_v2.h>
- #include <type_traits>
- #include "common.cuh"
// Inverts the order x order matrix `a` into `b`, choosing between the
// cuBLAS path and the generic kernel path based on `params`.
// Returns true on success, false on failure.
// NOTE(review): whether `a`/`b` are host or device pointers is decided by
// the selected backend — the generic path treats them as host memory.
template <typename T>
bool minv(T* a, T* b, int order, const launch_params& params);

namespace detail
{
// cuBLAS-backed inversion. Only meaningful for floating-point T
// (see the dispatch in minv). Currently an unimplemented stub.
template <typename T>
bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream = 0);

// Gauss-Jordan-style inversion built from the hand-written kernels in
// detail::kernel. `a` and `b` are host pointers; work happens on `stream`.
template <typename T>
bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream = 0);
}

namespace detail::kernel
{
// Writes an order x order identity matrix into A (2D element-per-thread launch).
template <typename T>
__global__ void get_idm(T* A, int order);

// Finds a row below `col` with a non-zero entry in column `col`; result in *idx.
template <typename T>
__global__ void get_first_not_null_row(T* A, int* idx, int col, int order);

// Swaps rows row1 and row2 of the order-column matrix A.
template <typename T>
__global__ void swap_rows(T* A, int row1, int row2, int order);

// Row-elimination step for column `col` (definition below is a stub).
template <typename T>
__global__ void subtract_rows(T* A, int col, int order);
}
// Dispatcher: prefer the cuBLAS backend when the caller supplied a handle
// (only possible for floating-point element types), otherwise fall back to
// the generic kernel implementation.
template <typename T>
bool minv(T* a, T* b, int order, const launch_params& params)
{
    // The cuBLAS path exists only for floating-point T; for all other
    // element types the handle (even if set) is ignored.
    if constexpr (std::is_floating_point_v<T>)
    {
        if (params.cublasHandle != nullptr)
        {
            return detail::minv_cublas(a, b, order, params.cublasHandle, params.stream);
        }
    }
    return detail::minv_generic(a, b, order, params.threadsPerBlock, params.stream);
}
namespace detail
{
// cuBLAS-backed inversion.
// TODO: unimplemented stub — always reports failure so the caller can
// fall back or surface the error.
template <typename T>
bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream)
{
    return false;
}

// Gauss-Jordan-style inversion on the GPU.
//   a      - host pointer, order x order row-major input matrix
//   b      - host pointer, order x order row-major output (receives result)
//   stream - all device work is issued on this stream; the function
//            synchronizes it before returning, so `b` is valid on return.
// Returns false if a device allocation fails, true otherwise.
// NOTE(review): the elimination steps ("nullify", "scale by pivot") are
// still TODO, so `b` currently receives the (partially updated) identity
// buffer rather than a finished inverse.
template <typename T>
bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream)
{
    constexpr int bs = 32;
    // Round order up to the next multiple of bs. (The previous formula
    // `order + bs - (order % bs)` added a whole extra tile whenever
    // order was already divisible by bs.)
    const int order_compl = ((order + bs - 1) / bs) * bs;

    // Device working buffers. These are passed to kernels, so they must
    // come from cudaMalloc — the previous cudaMallocHost call returned
    // pinned *host* memory.
    T* d_a = nullptr;
    T* d_id = nullptr;
    const size_t padded_bytes = size_t(order_compl) * order_compl * sizeof(T);
    if (cudaMalloc(&d_a, padded_bytes) != cudaSuccess)
    {
        return false;
    }
    if (cudaMalloc(&d_id, padded_bytes) != cudaSuccess)
    {
        cudaFree(d_a);
        return false;
    }

    // Upload the input; kernels index d_a with stride `order`, so a flat
    // contiguous copy of order*order elements is correct here.
    cudaMemcpyAsync(d_a, a, size_t(order) * order * sizeof(T),
                    cudaMemcpyHostToDevice, stream);

    // Build the padded identity (stride order_compl) with one thread per
    // element; the grid covers order_compl x order_compl exactly.
    dim3 threads(bs, bs);
    dim3 grid(order_compl / threads.x, order_compl / threads.y);
    kernel::get_idm<<<grid, threads, 0, stream>>>(d_id, order_compl);

    // Pivot-row index buffer, allocated once. (Previously a fresh
    // cudaMalloc happened every loop iteration and was never freed.)
    int* d_not_null_row = nullptr;
    if (cudaMalloc(&d_not_null_row, sizeof(int)) != cudaSuccess)
    {
        cudaFree(d_id);
        cudaFree(d_a);
        return false;
    }

    // 1D launch geometry shared by the per-column kernels.
    threads = dim3(threadsPerBlock, 1);
    grid = dim3((order + threadsPerBlock - 1) / threadsPerBlock, 1);

    for (int i = 0; i < order; ++i)
    {
        // Locate a usable pivot row for column i.
        // NOTE(review): the kernel lets every qualifying thread write the
        // index unsynchronized and writes nothing when no row below i
        // qualifies — in that case h_not_null_row keeps its previous
        // value. Confirm intended semantics before relying on this.
        kernel::get_first_not_null_row<<<grid, threads, 0, stream>>>(d_a, d_not_null_row, i, order);
        int h_not_null_row = i;
        cudaMemcpyAsync(&h_not_null_row, d_not_null_row, sizeof(int),
                        cudaMemcpyDeviceToHost, stream);
        // The host needs the pivot index before issuing the swap, so a
        // stream sync here is unavoidable (a device-wide
        // cudaDeviceSynchronize is not needed).
        cudaStreamSynchronize(stream);

        // Swap the pivot row into place.
        kernel::swap_rows<<<grid, threads, 0, stream>>>(d_a, i, h_not_null_row, order);

        // TODO: nullify below and above the pivot
        // TODO: scale the pivot row by the pivot's inverse
    }

    // Strided copy: extract the order x order top-left block out of the
    // order_compl-pitched device buffer into the caller's tightly-packed b.
    cudaMemcpy2DAsync(b, order * sizeof(T), d_id, order_compl * sizeof(T),
                      order * sizeof(T), order, cudaMemcpyDeviceToHost, stream);
    // `b` must be fully populated before we hand control back.
    cudaStreamSynchronize(stream);

    cudaFree(d_not_null_row);
    cudaFree(d_id);
    cudaFree(d_a);
    return true;
}
}
namespace detail::kernel
{
// Fills A (order x order, row-major) with the identity matrix:
// Id_v<T> on the diagonal, Zero_v<T> elsewhere.
// Expects a 2D launch whose grid*block extent equals `order` on each
// axis — there is no bounds check, so an over-sized launch writes out
// of bounds.
template <typename T>
__global__ void get_idm(T* A, int order)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    A[row * order + col] = row == col ? Id_v<T> : Zero_v<T>;
}

// Scans column `col` of A for a row strictly below `col` whose entry is
// non-zero and stores that row's index into *idx.
// NOTE(review): all qualifying threads write *idx with no ordering, so
// when several rows qualify the result is an arbitrary one of them — not
// necessarily the *first* despite the name. When no row below `col`
// qualifies (including the case where only A[col][col] itself is
// non-zero, since i <= col is skipped), *idx is left untouched — callers
// must initialize it or handle that case. TODO confirm intended
// semantics before fixing (an atomicMin plus host-side init would make
// it deterministic).
template <typename T>
__global__ void get_first_not_null_row(T* A, int* idx, int col, int order)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= order || i <= col) return;
    constexpr T zero = Zero_v<T>;
    if (A[i * order + col] != zero)
    {
        *idx = i;
    }
}

// Swaps rows row1 and row2 of A (order columns, row-major); one thread
// per column. The column index is bounds-checked, but row1/row2 are
// not — callers must pass valid row indices.
template <typename T>
__global__ void swap_rows(T* A, int row1, int row2, int order)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= order) return;
    T tmp = A[row1 * order + i];
    A[row1 * order + i] = A[row2 * order + i];
    A[row2 * order + i] = tmp;
}

// Intended to eliminate column `col` from the other rows during
// Gauss-Jordan reduction. Currently an empty stub: it computes global
// 2D indices and grid-wide strides but performs no writes.
template <typename T>
__global__ void subtract_rows(T* A, int col, int order)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int offsetx = gridDim.x * blockDim.x;
    int offsety = gridDim.y * blockDim.y;
}
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement