Untitled

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cublas_v2.h>

//const int R=16; // количество частотных интервалов всего
//const float pi=3.1415926535897;
//const float sigma=pi/R;

void mMul(int m, int n, int k, const float * a, const float * b, float * c)
{
    for (int i=0;i<m;i++)
        for (int j = 0; j < k; ++j)
        {
            c[i*k+j]=0;
            for (int l=0;l<n;l++)
                c[i*k+j]+=a[i*n+l]*b[l*k+j];
        }
}


int main (int argc, char * argv[])
{
     //matrices A (m x n), B(n x k), C(m x k), D(k x l) and E (m x l)
     const int m=11;
     const int n=11;
     const int k=11;
     const int l=11;

     cudaError_t cudaStat;
     cublasStatus_t stat;
     cublasHandle_t handle;
     //initialization

     stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS)
     {
      printf ("CUBLAS initialization failed\n");
      return EXIT_FAILURE;
     }

     //GPU pointers
     float *dev_A, *dev_B, *dev_C, *dev_D, *dev_E;
     //CPU pointers
     float *host_A, *host_B, *host_C, *host_D, *host_E, *host_result;


     //CPU memory allocation
     //first matrix
     host_A=new float [m*n];

     for (int i=0; i<m; i++)
      for (int j=0; j<n; j++)
           if (i==j)
            host_A[i*n+j]=1;
           else
            host_A[i*n+j]=0;

     //the second
     host_B=new float [n*k];
     for (int i=0; i<n; i++)
      for (int j=0; j<k; j++)
           host_B[i*k+j]=i*k+j;

     //the third
     host_D=new float [k*l];
     for (int i=0; i<k; i++)
      for (int j=0; j<l; j++)
           host_D[i*l+j]=i*l+j;

     //result
     host_C=new float [m*k];
     host_E=new float [m*l];

     //GPU memory allocation
     cudaStat = cudaMalloc ((void**)&dev_A, m*n*sizeof(float));
     if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed\n");
      return EXIT_FAILURE;
     }
     cudaStat = cudaMalloc ((void**)&dev_B, n*k*sizeof(float));
     if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed\n");
      return EXIT_FAILURE;
     }
     cudaStat = cudaMalloc ((void**)&dev_C, m*k*sizeof(float));
     if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed\n");
      return EXIT_FAILURE;
     }
     cudaStat = cudaMalloc ((void**)&dev_D, k*l*sizeof(float));
     if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed\n");
      return EXIT_FAILURE;
     }
     cudaStat = cudaMalloc ((void**)&dev_E, m*l*sizeof(float));
     if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed\n");
      return EXIT_FAILURE;
     }

     //copying matriсes
     cudaMemcpy(dev_B, host_B, n*k*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(dev_A, host_A, m*n*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(dev_D, host_D, k*l*sizeof(float), cudaMemcpyHostToDevice);
     float result[2];

     //GPU multiplication
     //scalars
     float scal[2];
     scal[0]=1;
     scal[1]=0;

     stat = cublasSgemm(handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            k, n, m,
            scal,      // alpha (1)
            dev_B, k,  // ?
            dev_A, n,  // ?
            (scal+1),  // beta (0)
            dev_C, k); // ?

     if (stat != CUBLAS_STATUS_SUCCESS) {
      printf ("Multiplication failed. (1)\n");
      cudaFree (dev_A); cudaFree (dev_B);cudaFree (dev_C);
      delete[] host_A; delete[] host_B; delete[] host_C;
      cublasDestroy(handle);
      return EXIT_FAILURE;
     }
     //CPU multiplication
     mMul(m,n,k,host_A,host_B,host_C);


     cudaThreadSynchronize();

     //copying result from device to host
     host_result=new float[m*k];
     stat = cublasGetVector(m*k, sizeof(float), dev_C, 1, host_result, 1);
     if (stat != CUBLAS_STATUS_SUCCESS) {
      printf ("data upload failed\n");
      cudaFree (dev_A); cudaFree (dev_B); cudaFree (dev_C);
      delete[] host_A; delete[] host_B; delete[] host_C;
      delete[] host_result;
      cublasDestroy(handle);
      return EXIT_FAILURE;
     }

     //comparing

     result[0]=0;
     result[1]=0;
     bool flag = false;
     for (int i=0; i<m*k; i++)
     {
      result[0]+=host_result[i];
      result[1]+=host_C[i];
      if (abs(host_result[i] - host_C[i]) > 0.001)
      {
           flag=true;
           break;
      }
     }
     //if(result[0]!=result[1])
     if (flag)
      printf("Device: %f; Host: %f \n):\n", result[0], result[1]);
     else
      printf("(:\n");

     //The second multiplication
     //CPU
     stat = cublasSgemm(handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            l, k, m,
                scal,      // alpha (1)
            dev_D, l,  // ?
            dev_C, k,  // ?
            (scal+1),  // beta (0)
            dev_E, l); // ?

     if (stat != CUBLAS_STATUS_SUCCESS) {
      printf ("Multiplication failed (2)\n");
      cudaFree (dev_A);
      cudaFree (dev_B);
      cudaFree (dev_C);
      delete[] host_A;
      delete[] host_B;
      delete[] host_C;
      cublasDestroy(handle);
      return EXIT_FAILURE;
     }

     //CPU multiplication
      mMul(m,k,l,host_C,host_D,host_E);

      //getting result
      float *host_result1=new float[m*l];
      cudaThreadSynchronize();

      //copying result to host
      stat = cublasGetVector(m*l, sizeof(float), dev_E, 1, host_result1, 1);
      if (stat != CUBLAS_STATUS_SUCCESS) {
       printf ("data upload failed\n");
       cudaFree (dev_A); cudaFree (dev_B); cudaFree (dev_C);
       delete[] host_A; delete[] host_B; delete[] host_C;
       delete[] host_result;
       cublasDestroy(handle);
       return EXIT_FAILURE;
      }

      //comparing
      result[0]=0;
      result[1]=0;
      for (int i=0; i<l*m; i++)
      {
       result[1]+=host_E[i];
       result[0]+=host_result1[i];
       if (abs(host_E[i] - host_result1[i]) > 0.001)
       {
        flag=true;

       }
      }
      //if(result[0]!=result[1])
      if (flag)
       printf("Device: %f; Host: %f \n):\n", result[0], result[1]);
      else
       printf("(:\n");


      delete[] host_A;
      delete[] host_B;
      delete[] host_C;

      cudaFree (dev_A);
      cudaFree (dev_B);
      cudaFree (dev_C);

      return (EXIT_SUCCESS);
}