// cusparse_problem.cu
// Device-resident CSR sparse-matrix wrapper around cuSPARSE for
// LIBLINEAR-style `problem` structures (l samples x n features).
#define USE_NVCC

#include "cusparse_problem.h"

#include <algorithm>
#include <iostream>
#include <string>

#include <cuda_runtime.h>
#include "cusparse_v2.h"
8 | ||
9 | ||
// When true, the status-check helpers below also log successful CUDA
// operations to stderr (magenta); errors are always logged regardless.
static bool verbose_debug = false;
11 | ||
12 | ||
// Reports the outcome of a CUDA runtime call on stderr: errors in red with
// the driver's human-readable description, successes in magenta only when
// verbose_debug is set.  Note: this only reports — it does NOT abort, so
// callers continue after a failed CUDA call.
static void check_return_code(const std::string &message, cudaError_t status) {
    if (status != cudaSuccess) {
        std::cerr << "\x1b[91mError performing operation: " << message
                  << "; error: " << cudaGetErrorString(status)
                  << "\x1b[0m" << std::endl;
    } else if (verbose_debug) {
        std::cerr << "\x1b[35m" << message + " succeeded\x1b[0m" << std::endl;
    }
}
22 | ||
23 | ||
// Reports the outcome of a cuSPARSE call on stderr, mirroring
// check_return_code: errors in red (with the raw numeric status code, since
// this code path has no error-string helper), successes in magenta only when
// verbose_debug is set.  Reports only — does not abort.
static void check_cusparse_call(const std::string &message, cusparseStatus_t status) {
    if (status != CUSPARSE_STATUS_SUCCESS) {
        std::cerr << "\x1b[91mError performing operation: " << message
                  << "; status: " << status
                  << "\x1b[0m" << std::endl;
    } else if (verbose_debug) {
        std::cerr << "\x1b[35m" << message + " succeeded\x1b[0m" << std::endl;
    }
}
30 | ||
31 | ||
// Copies `num` elements from host buffer `src` into device buffer `dest`
// (synchronous cudaMemcpy), logging any failure via check_return_code.
// `name` is used only in the log message.
template<class T>
static void copy_to_device(const std::string &name, T *dest, const T *src, long num) {
    check_return_code("Copying matrix " + name,
        cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyHostToDevice));
}
37 | ||
38 | ||
// Copies `num` elements from device buffer `src` into host buffer `dest`
// (synchronous cudaMemcpy), logging any failure via check_return_code.
// `name` is used only in the log message.
template<class T>
static void copy_to_host(const std::string &name, T *dest, const T *src, long num) {
    check_return_code("Copying matrix " + name,
        cudaMemcpy(dest, src, num * sizeof(T), cudaMemcpyDeviceToHost));
}
44 | ||
45 | ||
// cudaMalloc wrapper: allocates device memory for `num` elements of T and
// stores the pointer in *dest, logging any failure via check_return_code.
// `name` is used only in the log message.
template<class T>
static void typed_cumalloc(const std::string &name, T **dest, long num) {
    check_return_code(
        "Allocating " + name,
        cudaMalloc((void **)(dest), num * sizeof(T)));
}
52 | ||
53 | ||
// Builds host- and device-resident CSR copies of the sparse problem
// `prob_old` (height = l rows/samples, width = n columns/features) and
// pre-allocates the work vectors used for SpMV.  The host arrays are owned
// by this object.  (A leaked, unused `new csr_matrix` was removed here.)
CusparseCSRMatrix::CusparseCSRMatrix(const problem *prob_old)
    : width(prob_old->n), height(prob_old->l)
{
    std::cerr << "initializing cusparse csr" << std::endl;

    // Count non-zero entries: each row of prob_old->x is a feature_node
    // array terminated by a sentinel node with index == -1.
    this->nnz = 0;
    for (int i = 0; i < prob_old->l; i++) {
        for (feature_node *s = prob_old->x[i]; s->index != -1; s++)
            nnz += 1;
    }
    std::cerr << "\x1b[94mNum non-zero values: " << nnz << "\x1b[0m" << std::endl;

    // Host-side CSR buffers; the row-pointer array has l + 1 entries.
    int rows_n = prob_old->l + 1;
    host_matrix.csr_values = new double[nnz];
    host_matrix.row_pointers = new int[rows_n];
    host_matrix.column_indices = new int[nnz];

    // Fill the CSR arrays.  feature_node indices are 1-based; CSR column
    // indices are converted to 0-based here.
    int nnz_index = 0;
    for (int i = 0; i < prob_old->l; i++) {
        host_matrix.row_pointers[i] = nnz_index;
        for (feature_node *s = prob_old->x[i]; s->index != -1; s++) {
            host_matrix.csr_values[nnz_index] = s->value;
            host_matrix.column_indices[nnz_index] = s->index - 1;
            nnz_index += 1;
        }
    }
    host_matrix.row_pointers[prob_old->l] = nnz_index;

    // Initialize the cuSPARSE library handle.
    check_cusparse_call("cusparse initialization", cusparseCreate(&cusparse_handle));

    // Allocate device buffers and upload the CSR arrays.
    typed_cumalloc("values array", &(cuda_matrix.csr_values), nnz);
    typed_cumalloc("row pointer array", &(cuda_matrix.row_pointers), rows_n);
    typed_cumalloc("column indices", &(cuda_matrix.column_indices), nnz);

    copy_to_device("csr values", cuda_matrix.csr_values, host_matrix.csr_values, nnz);
    copy_to_device("row pointer", cuda_matrix.row_pointers, host_matrix.row_pointers, rows_n);
    copy_to_device(
        "column indices",
        cuda_matrix.column_indices,
        host_matrix.column_indices,
        nnz);

    // Matrix descriptor for the cuSPARSE library: general matrix,
    // zero-based indexing (matching the conversion above).
    check_cusparse_call("create descriptor", cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // Pre-allocate the SpMV in/out vectors once, sized for either
    // orientation (max of width, height), so SpMV never allocates per call.
    typed_cumalloc("input vector", &cuda_csr_mv_in, std::max(width, height));
    typed_cumalloc("output vector", &cuda_csr_mv_out, std::max(width, height));
}
116 | ||
117 | ||
// Releases all resources owned by this object: device buffers, host-side
// CSR arrays, the matrix descriptor, and the cuSPARSE handle.
CusparseCSRMatrix::~CusparseCSRMatrix() {
    // Pre-allocated SpMV work vectors.
    cudaFree(cuda_csr_mv_in);
    cudaFree(cuda_csr_mv_out);
    // Device-side CSR arrays.
    cudaFree(cuda_matrix.column_indices);
    cudaFree(cuda_matrix.row_pointers);
    cudaFree(cuda_matrix.csr_values);
    // Host-side CSR arrays allocated with new[] in the constructor
    // (previously leaked).
    delete[] host_matrix.column_indices;
    delete[] host_matrix.row_pointers;
    delete[] host_matrix.csr_values;
    // Destroy the matrix descriptor (previously leaked), then the handle.
    cusparseDestroyMatDescr(descr);
    cusparseDestroy(cusparse_handle);
    // NOTE(review): the previous cudaDeviceReset() call was removed.  It
    // destroys the entire CUDA context for the process — invalidating every
    // other live object's device allocations and handles — which is wrong
    // in a per-object destructor.  Call it (if at all) once at process exit.
}
127 | ||
128 | ||
// Computes result = X^T * vector on the device, where X is the
// (height x width) CSR matrix held by this object.  `vector` must hold
// `height` doubles (one per row/sample); `result` must have room for
// `width` doubles (one per column/feature).
void CusparseCSRMatrix::csr_XTv(double *vector, double *result) const {
    copy_to_device("input vector", cuda_csr_mv_in, vector, height);
    double d_one = 1.0;   // alpha: take the product unscaled
    double d_zero = 0.0;  // beta: discard previous contents of the output
    if (verbose_debug) {
        std::cerr << "parameters: "
                  << "width: " << width
                  << ", height: " << height
                  << ", nnz: " << nnz
                  << std::endl;
    }
    check_return_code("synchronize after input copy", cudaDeviceSynchronize());
    // cusparseDcsrmv's m/n describe the STORED matrix: X has `height` rows
    // (row_pointers holds height + 1 entries) and `width` columns.  The
    // previous code passed them swapped (width, height), which disagrees
    // with the row-pointer array length whenever width != height.  With
    // OPERATION_TRANSPOSE this computes y = alpha * X^T * x, x of length m
    // = height, y of length n = width — matching the copies above/below.
    check_cusparse_call(
        "csr_mv",
        cusparseDcsrmv(
            cusparse_handle,
            CUSPARSE_OPERATION_TRANSPOSE,
            height,                    // m: rows of X
            width,                     // n: columns of X
            nnz,
            &d_one,
            descr,
            cuda_matrix.csr_values,
            cuda_matrix.row_pointers,
            cuda_matrix.column_indices,
            cuda_csr_mv_in,            // x: length height
            &d_zero,
            cuda_csr_mv_out));         // y: length width
    check_return_code("synchronize after calculation", cudaDeviceSynchronize());
    copy_to_host("output vector", result, cuda_csr_mv_out, width);
    check_return_code("synchronize after output memcopy", cudaDeviceSynchronize());
}