bolster

GPU-solver

May 31st, 2011
1,636
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #include <utility>
  2. #include <stdlib.h>
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <math.h>
  6.  
  7. #define CUDA_CHK(NAME, ARGS) { \
  8.   cudaError_t cuda_err_code = NAME ARGS; \
  9.   if (cuda_err_code != cudaSuccess) { \
  10.     printf("%s failed with code %d\n", #NAME, cuda_err_code); \
  11.     abort(); \
  12.   } \
  13. }
  14. #ifndef max
  15. #define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
  16. #endif
  17.  
  18. #ifndef min
  19. #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
  20. #endif
  21. #define MAT1 3
  22. #define MAT2 MAT1*MAT1
  23. #define TINY 1.0e-40
  24. #define a(i,j) a[(i)*MAT1+(j)]
  25.  
  26. #define GO 1
  27. #define NOGO 0
  28.  
  29. void Check_Kernel(const char *message){
  30.   cudaError_t error = cudaGetLastError();
  31.   if (error != cudaSuccess){
  32.     fprintf(stderr,"Error: %s:%s\n",message, cudaGetErrorString(error));
  33.   }
  34. }
  35.  
  36. __device__ void d_pivot_decomp(float *a, int *p, int *q){
  37.     int i,j,k;
  38.     int n=MAT1;
  39.     int pi,pj,tmp;
  40.     float max;
  41.     float ftmp;
  42.     for (k=0;k<n;k++){
  43.         pi=-1,pj=-1,max=0.0;
  44.         //find pivot in submatrix a(k:n,k:n)
  45.         for (i=k;i<n;i++) {
  46.             for (j=k;j<n;j++) {
  47.                 if (fabs(a(i,j))>max){
  48.                     max = fabs(a(i,j));
  49.                     pi=i;
  50.                     pj=j;
  51.                 }
  52.             }
  53.         }
  54.         //Swap Row
  55.         tmp=p[k];
  56.         p[k]=p[pi];
  57.         p[pi]=tmp;
  58.         for (j=0;j<n;j++){
  59.             ftmp=a(k,j);
  60.             a(k,j)=a(pi,j);
  61.             a(pi,j)=ftmp;
  62.         }
  63.         //Swap Col
  64.         tmp=q[k];
  65.         q[k]=q[pj];
  66.         q[pj]=tmp;
  67.         for (i=0;i<n;i++){
  68.             ftmp=a(i,k);
  69.             a(i,k)=a(i,pj);
  70.             a(i,pj)=ftmp;
  71.         }
  72.         //END PIVOT
  73.  
  74.         //check pivot size and decompose
  75.         if ((fabs(a(k,k))>TINY)){
  76.             for (i=k+1;i<n;i++){
  77.                 //Column normalisation
  78.                 ftmp=a(i,k)/=a(k,k);
  79.                 for (j=k+1;j<n;j++){
  80.                     //a(ik)*a(kj) subtracted from lower right submatrix elements
  81.                     a(i,j)-=(ftmp*a(k,j));
  82.                 }
  83.             }
  84.         }
  85.         //END DECOMPOSE
  86.     }
  87. }
  88.  
  89.  
  90. __device__ void d_solve(float *a, float *x, int *p, int *q){
  91.     //forward substitution; see  Golub, Van Loan 96
  92.     //And see http://www.cs.rutgers.edu/~richter/cs510/completePivoting.pdf
  93.     int i,ii=0,j;
  94.     float ftmp;
  95.     float xtmp[MAT1];
  96.     //Swap rows (x=Px)
  97.     for (i=0; i<MAT1; i++){
  98.         xtmp[i]=x[p[i]]; //value that should be here
  99.     }
  100.     //Lx=x
  101.     for (i=0;i<MAT1;i++){
  102.         ftmp=xtmp[i];
  103.         if (ii != 0)
  104.             for (j=ii-1;j<i;j++)
  105.                 ftmp-=a(i,j)*xtmp[j];
  106.         else
  107.             if (ftmp!=0.0)
  108.                 ii=i+1;
  109.         xtmp[i]=ftmp;
  110.     }
  111.     //backward substitution
  112.     //partially taken from Sourcebook on Parallel Computing p577
  113.     //solves Uy=z
  114.     xtmp[MAT1-1]/=a(MAT1-1,MAT1-1);
  115.     for (i=MAT1-2;i>=0;i--){
  116.         ftmp=xtmp[i];
  117.         for (j=i+1;j<MAT1;j++){
  118.             ftmp-=a(i,j)*xtmp[j];
  119.         }
  120.         xtmp[i]=(ftmp)/a(i,i);
  121.     }
  122.     for (i=0;i<MAT1;i++)
  123.  
  124.     //Last bit
  125.     //solves x=Qy
  126.     for (i=0;i<MAT1;i++){
  127.         x[i]=xtmp[q[i]];
  128.     }
  129. }
  130.  
  131. __global__ void solve(float *A, float *B, int max){
  132.   //Each thread solves the A[id]x[id]=b[id] problem
  133.   int id= blockDim.x*blockIdx.x + threadIdx.x;
  134.   int p_pivot[MAT1],q_pivot[MAT1];
  135.   if ((GO==1) && (id < max)){
  136.     for (int i=0;i<MAT1;i++) {
  137.         p_pivot[i]=q_pivot[i]=i;
  138.     }
  139.  
  140.     d_pivot_decomp(&A[id*MAT2],&p_pivot[0],&q_pivot[0]);
  141.     d_solve(&A[id*MAT2],&B[id*MAT1],&p_pivot[0],&q_pivot[0]);
  142.   }
  143. }
  144.  
  145. /*
  146. void notmain(){
  147.     //3x3 Matrix
  148.     //float a[]={1,-2,3,2,-5,12,0,2,-10};
  149.     float a[]={1,3,-2,3,5,6,2,4,3};
  150.     float b[]={5,7,8};
  151.     //float a[]={1,2,3,2,-1,1,3,4,-1};
  152.     //float b[]={14,3,8};
  153.     //float a[]={1,-2,1,0,2,2,-2,4,2};
  154.     //float b[]={1,4,2};
  155.     int sig;
  156.     puts("Declared Stuff");
  157.  
  158.     //pivot array (not used currently)
  159.     int* p_pivot = (int *)malloc(sizeof(int)*MAT1);
  160.     int* q_pivot = (int *)malloc(sizeof(int)*MAT1);
  161.     puts("Starting Stuff");
  162.     for (unsigned int i=0; i<MAT1; i++){
  163.         p_pivot[i]=i;
  164.         q_pivot[i]=i;
  165.         printf("%.1lf|",b[i]);
  166.         for (unsigned int j=0;j<MAT1; j++){
  167.             printf("%.1lf,",a(i,j));
  168.         }
  169.         printf("|%d,%d",p_pivot[i],q_pivot[i]);
  170.         puts("");
  171.     }
  172.  
  173.     h_pivot_decomp(&a[0],p_pivot,q_pivot);
  174.     puts("After Pivot");
  175.     for (unsigned int i=0; i<MAT1; i++){
  176.         printf("%.1lf|",b[i]);
  177.         for (unsigned int j=0;j<MAT1; j++){
  178.             printf("%.1lf,",a(i,j));
  179.         }
  180.         printf("|%d,%d",p_pivot[i],q_pivot[i]);
  181.         puts("");
  182.     }
  183.  
  184.     h_solve(&a[0],&b[0],p_pivot,q_pivot);
  185.     puts("Finished Solve");
  186.  
  187.     for (unsigned int i=0; i<MAT1; i++){
  188.         printf("%.1lf|",b[i]);
  189.         for (unsigned int j=0;j<MAT1; j++){
  190.             printf("%.1lf,",a(i,j));
  191.         }
  192.         puts("");
  193.     }
  194. }*/
  195.  
  196.  
  197.  
  198. int main(){
  199.   //What are you actually trying to do:
  200.   //  generate 2 input matrixes, (NxN,Nx1) and 1 output (1xN)
  201.   //  do this over matrixcount length for threadiding
  202.   cudaSetDevice(0);
  203.   const unsigned int matrixcount=1;
  204.   const unsigned int matsize=MAT2*matrixcount;
  205.   const unsigned int vecsize=MAT1*matrixcount;
  206.   float a[]={1,3,-2,3,5,6,2,4,3};
  207.   //const float exampleA[]={7,3,-11,-6,7,10,-11,2,-2};
  208.   //const float exampleA[]={4,3,6,3};
  209.   const float b[]={5,7,8};
  210.   //const float exampleB[]={4,5};
  211.  
  212.   //memory allocations
  213.   float* h_A = (float*)malloc(sizeof(float)*matsize);
  214.   float* h_b = (float*)malloc(sizeof(float)*vecsize);
  215.   float* h_x = (float*)malloc(sizeof(float)*vecsize);
  216.  
  217.   float* d_A;
  218.   float* d_b;
  219.   float* d_x;
  220.   CUDA_CHK(cudaMalloc, (&d_A, sizeof(float)*matsize));
  221.   CUDA_CHK(cudaMalloc, (&d_b, sizeof(float)*vecsize));
  222.   CUDA_CHK(cudaMalloc, (&d_x, sizeof(float)*vecsize));
  223.  
  224.   printf("Mallocd\n");
  225.   //fill matrix and vector with stuff
  226.   for (unsigned int i = 0;i<matrixcount;i++){
  227.     //printf("\n%d\n",i);
  228.     for (unsigned int j = 0; j < MAT1; j++){
  229.       h_b[(i*MAT1)+j]=b[j];
  230.       h_x[(i*MAT1)+j]=-1;
  231.       printf("%.0f,",h_b[(i*MAT1)+j]);
  232.       //printf("\n%d:",j);
  233.       for (unsigned int k=0; k < MAT1; k++){
  234.         //printf("%d,",k);
  235.         h_A[(i*MAT2)+(j*MAT1)+k]=a(j,k);
  236.       }
  237.     }
  238.     puts("\n");
  239.   }
  240.  
  241.   printf("Generated\n");
  242.     //copy values to device
  243.   CUDA_CHK(cudaMemcpy, (d_A, h_A, sizeof(float)*matsize, cudaMemcpyHostToDevice));
  244.   CUDA_CHK(cudaMemcpy, (d_b, h_b, sizeof(float)*vecsize, cudaMemcpyHostToDevice));
  245.   CUDA_CHK(cudaMemcpy, (d_x, h_x, sizeof(float)*vecsize, cudaMemcpyHostToDevice));
  246.  
  247.   printf("Copied\n");
  248.   for (unsigned int i=0; i<matrixcount; i++){
  249.     printf("\n%d:x:A|B",i);
  250.     //printf("%.3lf|",h_x[i*MAT1]);
  251.     for (unsigned int j=0; j<MAT1; j++){
  252.       printf("\n%.3lf:",h_x[i*MAT1+j]);
  253.       for (unsigned int k=0;k<MAT1; k++){
  254.         printf("%.1lf,",h_A[(i*MAT2)+(j*MAT1)+k]);
  255.       }
  256.       printf("|%.3lf",h_b[i*MAT1+j]);
  257.     }
  258.   }
  259.   puts("\n");
  260.  
  261.   //parameters
  262.   dim3 threadsPerBlock(1,1,1);
  263.   dim3 blocksPerGrid((matrixcount + threadsPerBlock.x -1)/threadsPerBlock.x,1,1);
  264.   printf("TPB:%d,BPG:%d\n",threadsPerBlock.x,blocksPerGrid.x);
  265.   //Execute
  266.   cudaEvent_t evt_start, evt_stop;
  267.   CUDA_CHK(cudaEventCreate, (&evt_start));
  268.   CUDA_CHK(cudaEventCreate, (&evt_stop));
  269.   CUDA_CHK(cudaEventRecord, (evt_start,0));
  270.  
  271.   solve<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_b,matrixcount);
  272.   cudaDeviceSynchronize();
  273.   Check_Kernel("Solve");
  274.  
  275.   printf("Ran solve\n");
  276.   CUDA_CHK(cudaEventRecord, (evt_stop, 0));
  277.   CUDA_CHK(cudaEventSynchronize, (evt_stop));
  278.   float total_time;
  279.   CUDA_CHK(cudaEventElapsedTime, (&total_time, evt_start, evt_stop));
  280.   CUDA_CHK(cudaMemcpy, (h_A,d_A, sizeof(float)*matsize, cudaMemcpyDeviceToHost));
  281.   CUDA_CHK(cudaMemcpy, (h_x,d_b, sizeof(float)*vecsize, cudaMemcpyDeviceToHost));
  282.  
  283.   // print timing results
  284.   float one_time = total_time * 1e-3;
  285.  
  286.   printf("time: %g s\n", one_time);
  287.   for (unsigned int i=0; i<matrixcount; i++){
  288.     printf("\n%d:x:A",i);
  289.     //printf("%.3lf|",h_x[i*MAT1]);
  290.     for (unsigned int j=0; j<MAT1; j++){
  291.       printf("\n%.3lf:",h_x[i*MAT1+j]);
  292.       for (unsigned int k=0;k<MAT1; k++){
  293.         printf("%.1lf,",h_A[(i*MAT2)+(j*MAT1)+k]);
  294.       }
  295.     }
  296.   }
  297.   puts("\n");
  298.  
  299.   cudaEventDestroy(evt_start);
  300.   cudaEventDestroy(evt_stop);
  301.   free(h_A);
  302.   free(h_b);
  303.   free(h_x);
  304.   CUDA_CHK(cudaFree, (d_A));
  305.   CUDA_CHK(cudaFree, (d_x));
  306.   //CUDA_CHK(cudaFree, (d_pivot));
  307. }
RAW Paste Data