// cusparse_liblinear_problem

#define USE_NVCC

#include "cusparse_problem.h"
#include <algorithm>
#include <iostream>
#include <cuda_runtime.h>
#include "cusparse_v2.h"


// File-local switch: when true, check_return_code() also logs a
// "<message> succeeded" line to stderr for every successful CUDA call.
static bool verbose_debug = false;


static void check_return_code(std::string message, cudaError_t status) {
    if (status != cudaSuccess) {
        std::cerr << "\x1b[91mError performing operation: " << message
            << "; error: " << cudaGetErrorString(status)
            << "\x1b[0m" << std::endl;
    } else if (verbose_debug) {
        std::cerr << "\x1b[35m" << message + " succeeded\x1b[0m" << std::endl;
    }
}


static void check_cusparse_call(std::string message, cusparseStatus_t status) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cerr << "\x1b[91mError performing operation: " << message
            << "\x1b[0m" << std::endl;
    }
}


template<class T>
static void copy_to_device(std::string name, T *dest, T *src, long num) {
    check_return_code("Copying matrix " + name,
        cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyHostToDevice));
}


template<class T>
static void copy_to_host(std::string name, T *dest, T *src, long num) {
    check_return_code("Copying matrix " + name,
        cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyDeviceToHost));
}


template<class T>
static void typed_cumalloc(std::string name, T **dest, long num) {
    check_return_code(
        "Allocating " + name,
        cudaMalloc((void **)(dest), num * sizeof(T)));
}


CusparseCSRMatrix::CusparseCSRMatrix(const problem *prob_old)
        : width(prob_old->n), height(prob_old->l)
{
    std::cerr << "initializing cusparse csr" << std::endl;

    csr_matrix *result = new csr_matrix;

    this->nnz = 0;  // non-zero values
    for(int i=0; i < prob_old->l; i++) {
        feature_node *s = prob_old->x[i];
        while(s->index!=-1) {
            nnz += 1;
            s++;
        }
    }
    std::cerr << "\x1b[94mNum non-zero values: " << nnz << "\x1b[0m" << std::endl;

    int rows_n = prob_old->l + 1;
    host_matrix.csr_values = new double[nnz];
    host_matrix.row_pointers = new int[rows_n];
    host_matrix.column_indices = new int[nnz];

    // fill values
    int nnz_index = 0;
    for (int i = 0; i < prob_old->l; i++) {
        feature_node *s = prob_old->x[i];
        host_matrix.row_pointers[i] = nnz_index;
        while(s->index!=-1) {
            host_matrix.csr_values[nnz_index] = s->value;
            host_matrix.column_indices[nnz_index] = s->index - 1;
            nnz_index += 1;
            s++;
        }
    }
    host_matrix.row_pointers[prob_old->l] = nnz_index;

    // initialize cusparse
    check_cusparse_call("cusparse initialization", cusparseCreate(&cusparse_handle));

    // copy to cuda
    typed_cumalloc("values array", &(cuda_matrix.csr_values), nnz);
    typed_cumalloc("row pointer array", &(cuda_matrix.row_pointers), rows_n);
    typed_cumalloc("column indices", &(cuda_matrix.column_indices), nnz);

    copy_to_device("csr values", cuda_matrix.csr_values, host_matrix.csr_values, nnz);
    copy_to_device("row pointer", cuda_matrix.row_pointers, host_matrix.row_pointers, rows_n);
    copy_to_device(
        "column indices",
        cuda_matrix.column_indices,
        host_matrix.column_indices,
        nnz);

    // create a matrix description for the cusparse library
    check_cusparse_call("create descriptor", cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // pre-allocated some vectors for spmv
    typed_cumalloc("input vector", &cuda_csr_mv_in, std::max(width, height));
    // typed_cumalloc("dummy vector", &cuda_csr_mv_dummy, std::max(width, height));
    typed_cumalloc("output vector", &cuda_csr_mv_out, std::max(width, height));
}


// Releases everything the constructor acquired: the CUDA work vectors and CSR
// arrays, the cuSPARSE matrix descriptor and handle, and the host-side CSR
// arrays. Failures are logged by the check_* helpers and otherwise ignored so
// that the remaining resources are still released.
CusparseCSRMatrix::~CusparseCSRMatrix() {
    check_return_code("Freeing input vector", cudaFree(cuda_csr_mv_in));
    check_return_code("Freeing output vector", cudaFree(cuda_csr_mv_out));
    check_return_code("Freeing column indices", cudaFree(cuda_matrix.column_indices));
    check_return_code("Freeing row pointer array", cudaFree(cuda_matrix.row_pointers));
    check_return_code("Freeing values array", cudaFree(cuda_matrix.csr_values));

    // The descriptor created with cusparseCreateMatDescr must be destroyed
    // separately from the handle.
    check_cusparse_call("destroy descriptor", cusparseDestroyMatDescr(descr));
    check_cusparse_call("cusparse shutdown", cusparseDestroy(cusparse_handle));

    // Host copies were allocated with new[] in the constructor.
    delete[] host_matrix.column_indices;
    delete[] host_matrix.row_pointers;
    delete[] host_matrix.csr_values;

    // NOTE(review): cudaDeviceReset() affects the whole device state for this
    // process, not just this object; confirm no other CUDA resources are
    // expected to outlive a CusparseCSRMatrix.
    cudaDeviceReset();
}


void CusparseCSRMatrix::csr_XTv(double *vector, double *result) const {
    copy_to_device("input vector", cuda_csr_mv_in, vector, height);
    double d_one = 1.0;  // dummy value for alpha
    double d_zero = 0.0;  // dummy value for beta
    std::cerr << "parameters: "
        << "width: " << width
        << ", height: " << height
        << ", nnz: " << nnz
        << std::endl;
    check_return_code("synchronize after input copy", cudaDeviceSynchronize());
    check_cusparse_call(
        "csr_mv",
        cusparseDcsrmv(
            cusparse_handle,
            // CUSPARSE_OPERATION_NON_TRANSPOSE,
            CUSPARSE_OPERATION_TRANSPOSE,
            width,
            height,
            nnz,
            &d_one,
            descr,
            cuda_matrix.csr_values,
            cuda_matrix.row_pointers,
            cuda_matrix.column_indices,
            cuda_csr_mv_in,
            &d_zero,
            cuda_csr_mv_out));
    check_return_code("synchronize after calculation", cudaDeviceSynchronize());
    copy_to_host("output vector", result, cuda_csr_mv_out, width);
    check_return_code("synchronize after output memcopy", cudaDeviceSynchronize());
}