#define USE_NVCC

#include "cusparse_problem.h"
#include <algorithm>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include "cusparse_v2.h"


static bool verbose_debug = false;


static void check_return_code(const std::string &message, cudaError_t status) {
    if (status != cudaSuccess) {
        std::cerr << "\x1b[91mError performing operation: " << message
                  << "; error: " << cudaGetErrorString(status)
                  << "\x1b[0m" << std::endl;
    } else if (verbose_debug) {
        std::cerr << "\x1b[35m" << message << " succeeded\x1b[0m" << std::endl;
    }
}


static void check_cusparse_call(const std::string &message, cusparseStatus_t status) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cerr << "\x1b[91mError performing operation: " << message
                  << "; status code: " << status
                  << "\x1b[0m" << std::endl;
    }
}


template<class T>
static void copy_to_device(const std::string &name, T *dest, const T *src, long num) {
    check_return_code("Copying matrix " + name,
                      cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyHostToDevice));
}


template<class T>
static void copy_to_host(const std::string &name, T *dest, const T *src, long num) {
    check_return_code("Copying matrix " + name,
                      cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyDeviceToHost));
}


template<class T>
static void typed_cumalloc(const std::string &name, T **dest, long num) {
    check_return_code(
        "Allocating " + name,
        cudaMalloc((void **)(dest), num * sizeof(T)));
}


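// Typical round trip with the helpers above (a sketch; `n`, `host_buf`, and
// `dev` are placeholder names, not part of this file):
//
//     double *dev = nullptr;
//     typed_cumalloc("scratch", &dev, n);           // cudaMalloc of n doubles
//     copy_to_device("scratch", dev, host_buf, n);  // host -> device
//     copy_to_host("scratch", host_buf, dev, n);    // device -> host
//     cudaFree(dev);

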
CusparseCSRMatrix::CusparseCSRMatrix(const problem *prob_old)
    : width(prob_old->n), height(prob_old->l)
{
    std::cerr << "initializing cusparse csr" << std::endl;

    // count the non-zero values; each row of x is terminated by index == -1
    this->nnz = 0;
    for (int i = 0; i < prob_old->l; i++) {
        feature_node *s = prob_old->x[i];
        while (s->index != -1) {
            nnz += 1;
            s++;
        }
    }
    std::cerr << "\x1b[94mNum non-zero values: " << nnz << "\x1b[0m" << std::endl;

    int rows_n = prob_old->l + 1;
    host_matrix.csr_values = new double[nnz];
    host_matrix.row_pointers = new int[rows_n];
    host_matrix.column_indices = new int[nnz];

    // fill values; feature indices are 1-based, CSR column indices 0-based
    int nnz_index = 0;
    for (int i = 0; i < prob_old->l; i++) {
        feature_node *s = prob_old->x[i];
        host_matrix.row_pointers[i] = nnz_index;
        while (s->index != -1) {
            host_matrix.csr_values[nnz_index] = s->value;
            host_matrix.column_indices[nnz_index] = s->index - 1;
            nnz_index += 1;
            s++;
        }
    }
    host_matrix.row_pointers[prob_old->l] = nnz_index;

    // initialize cusparse
    check_cusparse_call("cusparse initialization", cusparseCreate(&cusparse_handle));

    // allocate device buffers and copy the three CSR arrays over
    typed_cumalloc("values array", &(cuda_matrix.csr_values), nnz);
    typed_cumalloc("row pointer array", &(cuda_matrix.row_pointers), rows_n);
    typed_cumalloc("column indices", &(cuda_matrix.column_indices), nnz);

    copy_to_device("csr values", cuda_matrix.csr_values, host_matrix.csr_values, nnz);
    copy_to_device("row pointer", cuda_matrix.row_pointers, host_matrix.row_pointers, rows_n);
    copy_to_device("column indices", cuda_matrix.column_indices, host_matrix.column_indices, nnz);

    // create a matrix description for the cusparse library
    check_cusparse_call("create descriptor", cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // pre-allocate some vectors for spmv
    typed_cumalloc("input vector", &cuda_csr_mv_in, std::max(width, height));
    // typed_cumalloc("dummy vector", &cuda_csr_mv_dummy, std::max(width, height));
    typed_cumalloc("output vector", &cuda_csr_mv_out, std::max(width, height));
}


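// For reference, a small worked example of the CSR layout produced above
// (assuming liblinear's 1-based feature indices): the 2 x 3 matrix
//
//     [ 1 0 2 ]
//     [ 0 3 0 ]
//
// is stored as
//
//     csr_values     = { 1.0, 2.0, 3.0 }
//     column_indices = { 0,   2,   1   }
//     row_pointers   = { 0, 2, 3 }   // length l + 1; last entry equals nnz

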
CusparseCSRMatrix::~CusparseCSRMatrix() {
    cudaFree(cuda_csr_mv_in);
    cudaFree(cuda_csr_mv_out);
    cudaFree(cuda_matrix.column_indices);
    cudaFree(cuda_matrix.row_pointers);
    cudaFree(cuda_matrix.csr_values);
    cusparseDestroy(cusparse_handle);
    // release the host-side copies allocated in the constructor
    delete[] host_matrix.csr_values;
    delete[] host_matrix.row_pointers;
    delete[] host_matrix.column_indices;
    // note: no cudaDeviceReset() here; resetting the device from a destructor
    // would invalidate every other live CUDA allocation and handle in the
    // process
}


void CusparseCSRMatrix::csr_XTv(double *vector, double *result) const {
    copy_to_device("input vector", cuda_csr_mv_in, vector, height);
    double d_one = 1.0;  // alpha: no scaling of the product
    double d_zero = 0.0; // beta: overwrite the output vector
    std::cerr << "parameters: "
              << "width: " << width
              << ", height: " << height
              << ", nnz: " << nnz
              << std::endl;
    check_return_code("synchronize after input copy", cudaDeviceSynchronize());
    // computes result = X^T * vector; m and n describe the matrix as stored
    // (height rows, width columns) regardless of the transpose flag
    check_cusparse_call(
        "csr_mv",
        cusparseDcsrmv(
            cusparse_handle,
            CUSPARSE_OPERATION_TRANSPOSE,
            height, // m: number of rows of X
            width,  // n: number of columns of X
            nnz,
            &d_one,
            descr,
            cuda_matrix.csr_values,
            cuda_matrix.row_pointers,
            cuda_matrix.column_indices,
            cuda_csr_mv_in,
            &d_zero,
            cuda_csr_mv_out));
    check_return_code("synchronize after calculation", cudaDeviceSynchronize());
    copy_to_host("output vector", result, cuda_csr_mv_out, width);
    check_return_code("synchronize after output memcopy", cudaDeviceSynchronize());
}
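

// A minimal usage sketch (hypothetical; `load_problem_somehow` is a
// placeholder for however the surrounding application fills in liblinear's
// `problem` struct):
//
//     #include <vector>
//
//     int main() {
//         problem prob = load_problem_somehow();
//         CusparseCSRMatrix X(&prob);
//         std::vector<double> v(prob.l, 1.0);   // one entry per row of X
//         std::vector<double> out(prob.n, 0.0); // one entry per column of X
//         X.csr_XTv(v.data(), out.data());      // out = X^T v
//         return 0;
//     }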