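// JOR (Jacobi over-relaxation) linear solver on the GPU: cuSPARSE performs the
// sparse matrix-vector products and two custom kernels fuse the relaxation step
// with a max-reduction for the convergence test.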
#include "JOR.h"
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cutil.h>
#include <cusparse.h>

#define imin(a,b) ((a)<(b)?(a):(b))
#define imax(a,b) ((a)>(b)?(a):(b))

#define N 512

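// kernel_absConv: finishes one JOR sweep and evaluates the absolute convergence
// test. On entry sol2 is assumed to hold b - R*sol (accumulated by cusparseDcsrmv
// below, with R the off-diagonal part of the matrix) and Diag the reciprocal
// diagonal entries. Each thread updates its elements of sol2 in place, then a
// shared-memory tree reduction writes the per-block maximum of |sol2[i] - sol[i]|
// into sol[blockIdx.x]; the caller recycles that buffer right afterwards.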
__global__ void kernel_absConv(int n, double *Diag, double *sol, double *sol2, double omega) {

    __shared__ double cache[N];
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int li = threadIdx.x;

    // Grid-stride loop: the launch below caps the grid at 65000 blocks, so a
    // thread may own more than one element.
    double tmp = 0;
    while (i < n) {
        sol2[i] = (1.-omega)*sol[i] + omega*Diag[i]*sol2[i];
        tmp = fmax(tmp, fabs(sol2[i] - sol[i]));

        i += blockDim.x * gridDim.x;
    }

    cache[li] = tmp;
    __syncthreads();

    // Tree reduction in shared memory; requires blockDim.x to be a power of two.
    for (int j = blockDim.x/2; j > 0; j /= 2) {
        if (li < j)
            cache[li] = fmax(cache[li], cache[li+j]);
        __syncthreads();
    }

    if (li == 0) sol[blockIdx.x] = cache[0];
}

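// kernel_relConv: the same sweep as kernel_absConv, but the reduction tracks the
// relative change |sol2[i] - sol[i]| / |sol2[i]| instead of the absolute one.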
__global__ void kernel_relConv(int n, double *Diag, double *sol, double *sol2, double omega) {

    __shared__ double cache[N];
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int li = threadIdx.x;

    double tmp = 0;
    while (i < n) {
        sol2[i] = (1.-omega)*sol[i] + omega*Diag[i]*sol2[i];
        tmp = fmax(tmp, fabs(sol2[i] - sol[i]) / fabs(sol2[i]));

        i += blockDim.x * gridDim.x;
    }

    cache[li] = tmp;
    __syncthreads();

    for (int j = blockDim.x/2; j > 0; j /= 2) {
        if (li < j)
            cache[li] = fmax(cache[li], cache[li+j]);
        __syncthreads();
    }

    if (li == 0) sol[blockIdx.x] = cache[0];
}

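// Jacobi_Method: runs JOR sweeps on the GPU until the termination criterion is
// met or max_iters is exhausted. The CSR arrays (non_zeros, row_counts, cols)
// are assumed to hold the off-diagonal part R of the system matrix, and
// diags_vec the reciprocal diagonal entries, so that one sweep computes
//     x_new = (1-omega)*x + omega * D^{-1} * (b - R*x).
// b_vec may be NULL (zero right-hand side). term_crit selects the convergence
// test (1 = absolute, 2 = relative) and term_crit_param its threshold. Returns
// true iff the iteration converged; *iters receives the number of sweeps.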
bool Jacobi_Method(
                    int n, int nnz, double *non_zeros, int *row_counts, int *cols,
                    double *diags_vec, double *b_vec, double *soln,
                    double omega, double term_crit_param, int term_crit,
                    int max_iters, int *iters) {

    assert(non_zeros != NULL);
    assert(row_counts != NULL);
    assert(cols != NULL);
    assert(diags_vec != NULL);
    assert(soln != NULL);

    bool done;
    double result;
    double *cudaNonZero;
    int *cudaRowStarts;
    int *cudaColsIndex;
    double *cudaSoln, *cudaSoln2, *cudaDiags, *cudaB;

    double *soln2 = new double[n], *tmpsoln;

    cusparseHandle_t handle;
    cusparseStatus_t cuStat = cusparseCreate(&handle), cuStat1;
    if (cuStat != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Failed to initialize cuSPARSE\n");
        delete[] soln2;
        return false;
    }

    cusparseMatDescr_t descrA;
    cuStat = cusparseCreateMatDescr(&descrA);
    if (cuStat != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Error creating matrix descriptor\n");
        delete[] soln2;
        return false;
    }

    cuStat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cuStat1 = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    if (cuStat != CUSPARSE_STATUS_SUCCESS || cuStat1 != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Error setting up matrix descriptor\n");
        delete[] soln2;
        return false;
    }

    cudaMalloc((void**)&cudaNonZero, nnz*sizeof(double));
    cudaMalloc((void**)&cudaRowStarts, (n+1)*sizeof(int));
    cudaMalloc((void**)&cudaColsIndex, nnz*sizeof(int));
    cudaMalloc((void**)&cudaDiags, n*sizeof(double));
    cudaMalloc((void**)&cudaB, n*sizeof(double));

    cudaMalloc((void**)&cudaSoln, n*sizeof(double));
    cudaMalloc((void**)&cudaSoln2, n*sizeof(double));

    cudaMemcpy(cudaNonZero, non_zeros, (size_t)(nnz*sizeof(double)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaRowStarts, row_counts, (size_t)((n+1)*sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaColsIndex, cols, (size_t)(nnz*sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(cudaDiags, diags_vec, (size_t)(n*sizeof(double)), cudaMemcpyHostToDevice);

    // b_vec may be NULL; in that case iterate with a zero right-hand side.
    if (b_vec != NULL)
        cudaMemcpy(cudaB, b_vec, (size_t)(n*sizeof(double)), cudaMemcpyHostToDevice);
    else
        cudaMemset(cudaB, 0, (size_t)(n*sizeof(double)));

    cudaMemcpy(cudaSoln, soln, (size_t)(n*sizeof(double)), cudaMemcpyHostToDevice);

    *iters = 0;
    done = false;

    fprintf(stderr, "debug: starting JOR iterations\n");

    while (!done && (*iters) < max_iters) {

        (*iters)++;

        // debug: dump the first entries of the current iterate on the first sweep
        if ((*iters) < 2) {
            cudaMemcpy(soln2, cudaSoln, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);
            for (int i = 0; i < imin(20, n); i++) fprintf(stderr, "%lf ", soln2[i]);
            fprintf(stderr, "\n");
        }

        // Start the sweep with sol2 = b (or 0 when no right-hand side was given).
        if (b_vec != NULL) {
            CUDA_SAFE_CALL(cudaMemcpy(cudaSoln2, cudaB, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToDevice));
        } else {
            cudaMemset(cudaSoln2, 0, (size_t)(n*sizeof(double)));
        }

        // cudaSoln2 = -1.*A*cudaSoln + 1.*cudaSoln2
        cuStat = cusparseDcsrmv(
            handle,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            n, n, -1.,
            descrA,
            cudaNonZero,
            cudaRowStarts,
            cudaColsIndex,
            cudaSoln, 1., cudaSoln2);

        // debug: print reciprocal diagonals against the residual entries
        if ((*iters) < 2) {
            cudaMemcpy(soln2, cudaSoln2, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);
            for (int i = 0; i < imin(20, n); i++) {
                fprintf(stderr, "%lf/%lf ", diags_vec[i], soln2[i]);
            }
            fprintf(stderr, "\n");
        }

        // Check convergence. The update kernels fuse the JOR sweep with the error
        // reduction, so the test costs one extra pass over only the first nblocks
        // entries of cudaSoln (which is about to be recycled anyway).
        const int nblocks = imin(65000, (n+(N-1))/N);
        switch (term_crit) {
        case 1: // absolute criterion
            fprintf(stderr, "A\n"); // debug marker

            kernel_absConv<<<nblocks,N>>>(n, cudaDiags, cudaSoln, cudaSoln2, omega);

            cudaMemcpy(soln, cudaSoln, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);

            result = 0;
            for (int i = 0; i < nblocks; i++) result = imax(result, soln[i]);

            done = result < term_crit_param;
            break;
        case 2: // relative criterion
            kernel_relConv<<<nblocks,N>>>(n, cudaDiags, cudaSoln, cudaSoln2, omega);
            cudaThreadSynchronize();

            // debug: dump the first entries of the new iterate on the first sweep
            if ((*iters) < 2) {
                cudaMemcpy(soln2, cudaSoln2, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);
                fprintf(stderr, "R\n");
                for (int i = 0; i < imin(20, n); i++) fprintf(stderr, "%lf ", soln2[i]);
                fprintf(stderr, "\n");
            }

            cudaMemcpy(soln, cudaSoln, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);

            result = 0;
            for (int i = 0; i < nblocks; i++) result = imax(result, soln[i]);

            done = result < term_crit_param;
            break;
        }

        // Prepare for the next iteration: the new iterate becomes the current
        // one; the old buffer is reused as scratch on the next sweep.
        tmpsoln = cudaSoln;
        cudaSoln = cudaSoln2;
        cudaSoln2 = tmpsoln;
    }

    cudaMemcpy(soln, cudaSoln, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);

    // Release device buffers and cuSPARSE resources.
    cudaFree(cudaNonZero);
    cudaFree(cudaRowStarts);
    cudaFree(cudaColsIndex);
    cudaFree(cudaDiags);
    cudaFree(cudaB);
    cudaFree(cudaSoln);
    cudaFree(cudaSoln2);
    cusparseDestroyMatDescr(descrA);
    cusparseDestroy(handle);

    delete[] soln2;

    return done;
}
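
// A minimal usage sketch (hypothetical, not part of the original source): it
// solves a small diagonally dominant 3x3 system A*x = b with the absolute
// criterion (term_crit = 1) under the conventions assumed above, i.e. the CSR
// arrays hold only the off-diagonal entries of A and diags_vec its reciprocal
// diagonal. The JOR_DEMO guard is likewise hypothetical.
#ifdef JOR_DEMO
int main() {
    // A = [ 4 -1  0 ]   off-diagonal part R in CSR:
    //     [-1  4 -1 ]     values {-1,-1,-1,-1}, cols {1,0,2,1}, rowptr {0,1,3,4}
    //     [ 0 -1  4 ]   exact solution of A*x = (3,2,3)^T is x = (1,1,1)^T
    double non_zeros[]  = { -1., -1., -1., -1. };
    int    cols[]       = { 1, 0, 2, 1 };
    int    row_counts[] = { 0, 1, 3, 4 };
    double diags_vec[]  = { 0.25, 0.25, 0.25 };  // 1/diagonal of A
    double b_vec[]      = { 3., 2., 3. };
    double soln[]       = { 0., 0., 0. };        // initial guess

    int iters = 0;
    bool ok = Jacobi_Method(3, 4, non_zeros, row_counts, cols,
                            diags_vec, b_vec, soln,
                            1.0 /* omega = 1: plain Jacobi */, 1e-9, 1,
                            10000, &iters);

    printf("converged=%d after %d iterations: x = %f %f %f\n",
           ok, iters, soln[0], soln[1], soln[2]);
    return ok ? 0 : 1;
}
#endif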