Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "JOR.h"
- #include <stdio.h>
- #include <assert.h>
- #include <cuda.h>
- #include <cutil.h>
- #include <cusparse.h>
- #define imin(a,b) ((a)<(b)?(a):(b))
- #define imax(a,b) ((a)>(b)?(a):(b))
- #define N 512
// JOR update step + per-block reduction of the absolute convergence metric.
// Applies sol2[i] = (1-omega)*sol[i] + omega*Diag[i]*sol2[i] for every i < n,
// then reduces max|sol2[i] - sol[i]| within each block and stores the block
// maximum in sol[blockIdx.x].  The host must reduce those gridDim.x partial
// maxima itself.  Requires blockDim.x == N (shared cache size) and blockDim.x
// a power of two (tree reduction).
// NOTE(review): writing block maxima into sol[] while other blocks may still
// be reading those entries is a pre-existing race inherited from the caller's
// buffer reuse; a separate output buffer would be needed to remove it.
__global__ void kernel_absConv(int n, double *Diag, double *sol, double *sol2, double omega) {
    __shared__ double cache[N];   // __device__ qualifier on shared vars is redundant; dropped
    int li = threadIdx.x;
    double tmp = 0.;
    // Grid-stride loop.  The original guarded with a plain `if`, which left the
    // trailing `i += blockDim.x * gridDim.x` dead and skipped every element past
    // gridDim.x * blockDim.x when the launch capped the grid at 65000 blocks.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        sol2[i] = (1. - omega) * sol[i] + omega * Diag[i] * sol2[i];
        tmp = fmax(tmp, fabs(sol2[i] - sol[i]));
    }
    cache[li] = tmp;              // out-of-range threads contribute 0
    __syncthreads();
    // Standard power-of-two tree reduction; the barrier stays outside the
    // divergent branch so all threads reach it.
    for (int j = blockDim.x / 2; j > 0; j /= 2) {
        if (li < j)
            cache[li] = fmax(cache[li], cache[li + j]);
        __syncthreads();
    }
    if (li == 0) sol[blockIdx.x] = cache[0];
}
// Jacobi update step + per-block reduction of the relative convergence metric.
// Applies sol2[i] = Diag[i]*sol2[i] for every i < n, then reduces
// max |sol2[i] - sol[i]| / sol2[i] within each block and stores the block
// maximum in sol[blockIdx.x]; the host reduces the gridDim.x partials.
// `omega` is accepted for signature parity with kernel_absConv but unused here.
// Requires blockDim.x == N and blockDim.x a power of two.
// NOTE(review): same pre-existing inter-block race on sol[] as kernel_absConv.
__global__ void kernel_relConv(int n, double *Diag, double *sol, double *sol2, double omega) {
    __shared__ double cache[N];
    int li = threadIdx.x;
    double tmp = 0.;
    // Grid-stride loop; the original's `if` left its stride increment dead and
    // missed elements beyond gridDim.x * blockDim.x on capped grids.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        double v = Diag[i] * sol2[i];
        sol2[i] = v;
        // Guard the denominator: the original computed 0/0 = NaN when a
        // component converged to zero, which poisoned the maximum and could
        // prevent the termination test from ever passing.
        if (v != 0.)
            tmp = fmax(tmp, fabs(v - sol[i]) / v);
    }
    cache[li] = tmp;
    __syncthreads();
    for (int j = blockDim.x / 2; j > 0; j /= 2) {
        if (li < j)
            cache[li] = fmax(cache[li], cache[li + j]);
        __syncthreads();
    }
    if (li == 0) sol[blockIdx.x] = cache[0];
}
// Solves A*x = b (or the JOR fixed point when b_vec is NULL) iteratively on
// the GPU.  The matrix is supplied in CSR form (non_zeros/row_counts/cols,
// 0-based indices) with its diagonal factored out into diags_vec; soln holds
// the initial guess on entry and the final iterate on exit.
//
// term_crit selects the convergence test: 1 = absolute (kernel_absConv, uses
// omega for a JOR step), 2 = relative (kernel_relConv).  Iterates until the
// metric drops below term_crit_param or max_iters is reached; *iters reports
// the iteration count.  Returns true iff the method converged.
// Uses the legacy cusparseDcsrmv API (alpha/beta by value) matching this
// file's cutil-era toolkit.
bool Jacobi_Method(
    int n, int nnz, double *non_zeros, int *row_counts, int *cols,
    double *diags_vec, double *b_vec, double *soln,
    double omega, double term_crit_param, int term_crit,
    int max_iters, int *iters) {
    assert(non_zeros != NULL);
    assert(row_counts != NULL);
    assert(cols != NULL);
    assert(diags_vec != NULL);
    assert(soln != NULL);

    cusparseHandle_t handle;
    cusparseStatus_t cuStat = cusparseCreate(&handle), cuStat1;
    if (cuStat != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "No se pudo inicializar Cusparse\n");
        return false;
    }
    cusparseMatDescr_t descrA;
    cuStat = cusparseCreateMatDescr(&descrA);
    if (cuStat != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Error al crear descripcion de matriz\n");
        cusparseDestroy(handle);           // original leaked the handle here
        return false;
    }
    cuStat  = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cuStat1 = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    if (cuStat != CUSPARSE_STATUS_SUCCESS || cuStat1 != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Error al setear descripcion de matriz\n");
        cusparseDestroyMatDescr(descrA);
        cusparseDestroy(handle);
        return false;
    }

    // Host scratch allocated only after the cusparse setup can no longer fail,
    // so the early returns above cannot leak it (original allocated it first).
    double *soln2 = new double[n];
    double *tmpsoln;

    double *cudaNonZero, *cudaSoln, *cudaSoln2, *cudaDiags, *cudaB;
    int *cudaRowStarts, *cudaColsIndex;
    cudaMalloc((void**)&cudaNonZero,  nnz * sizeof(double));
    cudaMalloc((void**)&cudaRowStarts, (n + 1) * sizeof(int));
    cudaMalloc((void**)&cudaColsIndex, nnz * sizeof(int));
    cudaMalloc((void**)&cudaDiags, n * sizeof(double));
    cudaMalloc((void**)&cudaB,     n * sizeof(double));
    cudaMalloc((void**)&cudaSoln,  n * sizeof(double));
    cudaMalloc((void**)&cudaSoln2, n * sizeof(double));

    cudaMemcpy(cudaNonZero, non_zeros, (size_t)(nnz * sizeof(double)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaRowStarts, row_counts, (size_t)((n + 1) * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaColsIndex, cols, (size_t)(nnz * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaDiags, diags_vec, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
    // FIX: the original tested `b_vec == NULL` and then copied *from* the NULL
    // pointer (and zeroed cudaB when b_vec was valid).  The branches were
    // swapped; L113's later check shows the intended polarity.
    if (b_vec != NULL)
        cudaMemcpy(cudaB, b_vec, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
    else
        cudaMemset(cudaB, 0, (size_t)(n * sizeof(double)));  // int fill value, not 0.
    cudaMemcpy(cudaSoln, soln, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);

    *iters = 0;
    bool done = false;
    double result;
    while (!done && (*iters) < max_iters) {
        (*iters)++;
        // Start each iteration with sol2 = b (or 0 for the homogeneous case).
        if (b_vec != NULL) {
            CUDA_SAFE_CALL(cudaMemcpy(cudaSoln2, cudaB, (size_t)(n * sizeof(double)), cudaMemcpyDeviceToDevice));
        } else {
            cudaMemset(cudaSoln2, 0, (size_t)(n * sizeof(double)));
        }
        // cudaSoln2 = -1.*A*cudaSoln + 1.*cudaSoln2
        cuStat = cusparseDcsrmv(
            handle,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            n, n, -1.,
            descrA,
            cudaNonZero,
            cudaRowStarts,
            cudaColsIndex,
            cudaSoln, 1., cudaSoln2);
        // Convergence check: each kernel finishes the update (divide by the
        // diagonal) and writes per-block maxima into cudaSoln[0..nblocks-1];
        // the host reduces those partials.  The blocking cudaMemcpy below also
        // serves as the post-launch synchronization point.
        const int nblocks = imin(65000, (n + (N - 1)) / N);
        switch (term_crit) {
        case 1:
            kernel_absConv<<<nblocks, N>>>(n, cudaDiags, cudaSoln, cudaSoln2, omega);
            cudaMemcpy(soln, cudaSoln, (size_t)(n * sizeof(double)), cudaMemcpyDeviceToHost);
            result = 0;
            for (int i = 0; i < nblocks; i++) result = imax(result, soln[i]);
            done = result < term_crit_param;
            break;
        case 2:
            kernel_relConv<<<nblocks, N>>>(n, cudaDiags, cudaSoln, cudaSoln2, omega);
            cudaMemcpy(soln, cudaSoln, (size_t)(n * sizeof(double)), cudaMemcpyDeviceToHost);
            result = 0;
            for (int i = 0; i < nblocks; i++) result = imax(result, soln[i]);
            done = result < term_crit_param;
            break;
        }
        // Prepare for the next iteration: the freshly computed iterate
        // (cudaSoln2) becomes the current solution; the clobbered old buffer
        // becomes scratch.
        tmpsoln = cudaSoln;
        cudaSoln = cudaSoln2;
        cudaSoln2 = tmpsoln;
    }
    cudaMemcpy(soln, cudaSoln, (size_t)(n * sizeof(double)), cudaMemcpyDeviceToHost);

    // Release everything; the original leaked all device buffers and the
    // cusparse handle/descriptor.
    cudaFree(cudaNonZero);
    cudaFree(cudaRowStarts);
    cudaFree(cudaColsIndex);
    cudaFree(cudaDiags);
    cudaFree(cudaB);
    cudaFree(cudaSoln);
    cudaFree(cudaSoln2);
    cusparseDestroyMatDescr(descrA);
    cusparseDestroy(handle);
    delete[] soln2;
    return done;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement