Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #pragma once
- #include <cublas_v2.h>
- #include <type_traits>
- #include "common.cuh"
// Inverts the order x order matrix `a` into `b`, choosing between the
// cuBLAS path and the generic kernel path based on `params`.
// Returns true on success, false on failure.
// NOTE(review): whether `a`/`b` are host or device pointers is decided by
// the selected backend — the generic path treats them as host memory.
template <typename T>
bool minv(T* a, T* b, int order, const launch_params& params);

namespace detail
{
// cuBLAS-backed inversion. Only meaningful for floating-point T
// (see the dispatch in minv). Currently an unimplemented stub.
template <typename T>
bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream = 0);

// Gauss-Jordan-style inversion built from the hand-written kernels in
// detail::kernel. `a` and `b` are host pointers; work happens on `stream`.
template <typename T>
bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream = 0);
}

namespace detail::kernel
{
// Writes an order x order identity matrix into A (2D element-per-thread launch).
template <typename T>
__global__ void get_idm(T* A, int order);

// Finds a row below `col` with a non-zero entry in column `col`; result in *idx.
template <typename T>
__global__ void get_first_not_null_row(T* A, int* idx, int col, int order);

// Swaps rows row1 and row2 of the order-column matrix A.
template <typename T>
__global__ void swap_rows(T* A, int row1, int row2, int order);

// Row-elimination step for column `col` (definition below is a stub).
template <typename T>
__global__ void subtract_rows(T* A, int col, int order);
}
// Dispatcher: prefer the cuBLAS backend when the caller supplied a handle
// (only possible for floating-point element types), otherwise fall back to
// the generic kernel implementation.
template <typename T>
bool minv(T* a, T* b, int order, const launch_params& params)
{
    // The cuBLAS path exists only for floating-point T; for all other
    // element types the handle (even if set) is ignored.
    if constexpr (std::is_floating_point_v<T>)
    {
        if (params.cublasHandle != nullptr)
        {
            return detail::minv_cublas(a, b, order, params.cublasHandle, params.stream);
        }
    }
    return detail::minv_generic(a, b, order, params.threadsPerBlock, params.stream);
}
namespace detail
{
// cuBLAS-backed inversion.
// TODO: unimplemented stub — always reports failure so the caller can
// fall back or surface the error.
template <typename T>
bool minv_cublas(T* a, T* b, int order, cublasHandle_t handle, cudaStream_t stream)
{
    return false;
}

// Gauss-Jordan-style inversion on the GPU.
//   a      - host pointer, order x order row-major input matrix
//   b      - host pointer, order x order row-major output (receives result)
//   stream - all device work is issued on this stream; the function
//            synchronizes it before returning, so `b` is valid on return.
// Returns false if a device allocation fails, true otherwise.
// NOTE(review): the elimination steps ("nullify", "scale by pivot") are
// still TODO, so `b` currently receives the (partially updated) identity
// buffer rather than a finished inverse.
template <typename T>
bool minv_generic(T* a, T* b, int order, int threadsPerBlock, cudaStream_t stream)
{
    constexpr int bs = 32;
    // Round order up to the next multiple of bs. (The previous formula
    // `order + bs - (order % bs)` added a whole extra tile whenever
    // order was already divisible by bs.)
    const int order_compl = ((order + bs - 1) / bs) * bs;

    // Device working buffers. These are passed to kernels, so they must
    // come from cudaMalloc — the previous cudaMallocHost call returned
    // pinned *host* memory.
    T* d_a = nullptr;
    T* d_id = nullptr;
    const size_t padded_bytes = size_t(order_compl) * order_compl * sizeof(T);
    if (cudaMalloc(&d_a, padded_bytes) != cudaSuccess)
    {
        return false;
    }
    if (cudaMalloc(&d_id, padded_bytes) != cudaSuccess)
    {
        cudaFree(d_a);
        return false;
    }

    // Upload the input; kernels index d_a with stride `order`, so a flat
    // contiguous copy of order*order elements is correct here.
    cudaMemcpyAsync(d_a, a, size_t(order) * order * sizeof(T),
                    cudaMemcpyHostToDevice, stream);

    // Build the padded identity (stride order_compl) with one thread per
    // element; the grid covers order_compl x order_compl exactly.
    dim3 threads(bs, bs);
    dim3 grid(order_compl / threads.x, order_compl / threads.y);
    kernel::get_idm<<<grid, threads, 0, stream>>>(d_id, order_compl);

    // Pivot-row index buffer, allocated once. (Previously a fresh
    // cudaMalloc happened every loop iteration and was never freed.)
    int* d_not_null_row = nullptr;
    if (cudaMalloc(&d_not_null_row, sizeof(int)) != cudaSuccess)
    {
        cudaFree(d_id);
        cudaFree(d_a);
        return false;
    }

    // 1D launch geometry shared by the per-column kernels.
    threads = dim3(threadsPerBlock, 1);
    grid = dim3((order + threadsPerBlock - 1) / threadsPerBlock, 1);

    for (int i = 0; i < order; ++i)
    {
        // Locate a usable pivot row for column i.
        // NOTE(review): the kernel lets every qualifying thread write the
        // index unsynchronized and writes nothing when no row below i
        // qualifies — in that case h_not_null_row keeps its previous
        // value. Confirm intended semantics before relying on this.
        kernel::get_first_not_null_row<<<grid, threads, 0, stream>>>(d_a, d_not_null_row, i, order);
        int h_not_null_row = i;
        cudaMemcpyAsync(&h_not_null_row, d_not_null_row, sizeof(int),
                        cudaMemcpyDeviceToHost, stream);
        // The host needs the pivot index before issuing the swap, so a
        // stream sync here is unavoidable (a device-wide
        // cudaDeviceSynchronize is not needed).
        cudaStreamSynchronize(stream);

        // Swap the pivot row into place.
        kernel::swap_rows<<<grid, threads, 0, stream>>>(d_a, i, h_not_null_row, order);

        // TODO: nullify below and above the pivot
        // TODO: scale the pivot row by the pivot's inverse
    }

    // Strided copy: extract the order x order top-left block out of the
    // order_compl-pitched device buffer into the caller's tightly-packed b.
    cudaMemcpy2DAsync(b, order * sizeof(T), d_id, order_compl * sizeof(T),
                      order * sizeof(T), order, cudaMemcpyDeviceToHost, stream);
    // `b` must be fully populated before we hand control back.
    cudaStreamSynchronize(stream);

    cudaFree(d_not_null_row);
    cudaFree(d_id);
    cudaFree(d_a);
    return true;
}
}
namespace detail::kernel
{
// Fills A (order x order, row-major) with the identity matrix:
// Id_v<T> on the diagonal, Zero_v<T> elsewhere.
// Expects a 2D launch whose grid*block extent equals `order` on each
// axis — there is no bounds check, so an over-sized launch writes out
// of bounds.
template <typename T>
__global__ void get_idm(T* A, int order)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    A[row * order + col] = row == col ? Id_v<T> : Zero_v<T>;
}

// Scans column `col` of A for a row strictly below `col` whose entry is
// non-zero and stores that row's index into *idx.
// NOTE(review): all qualifying threads write *idx with no ordering, so
// when several rows qualify the result is an arbitrary one of them — not
// necessarily the *first* despite the name. When no row below `col`
// qualifies (including the case where only A[col][col] itself is
// non-zero, since i <= col is skipped), *idx is left untouched — callers
// must initialize it or handle that case. TODO confirm intended
// semantics before fixing (an atomicMin plus host-side init would make
// it deterministic).
template <typename T>
__global__ void get_first_not_null_row(T* A, int* idx, int col, int order)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= order || i <= col) return;
    constexpr T zero = Zero_v<T>;
    if (A[i * order + col] != zero)
    {
        *idx = i;
    }
}

// Swaps rows row1 and row2 of A (order columns, row-major); one thread
// per column. The column index is bounds-checked, but row1/row2 are
// not — callers must pass valid row indices.
template <typename T>
__global__ void swap_rows(T* A, int row1, int row2, int order)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= order) return;
    T tmp = A[row1 * order + i];
    A[row1 * order + i] = A[row2 * order + i];
    A[row2 * order + i] = tmp;
}

// Intended to eliminate column `col` from the other rows during
// Gauss-Jordan reduction. Currently an empty stub: it computes global
// 2D indices and grid-wide strides but performs no writes.
template <typename T>
__global__ void subtract_rows(T* A, int col, int order)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int offsetx = gridDim.x * blockDim.x;
    int offsety = gridDim.y * blockDim.y;
}
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement