Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cuda.h>
- #include <cuda_runtime_api.h>
- #include <stdio.h>
- #include <cublas_v2.h>
- //const int R=16; // количество частотных интервалов всего
- //const float pi=3.1415926535897;
- //const float sigma=pi/R;
- void mMul(int m, int n, int k, const float * a, const float * b, float * c)
- {
- for (int i=0;i<m;i++)
- for (int j = 0; j < k; ++j)
- {
- c[i*k+j]=0;
- for (int l=0;l<n;l++)
- c[i*k+j]+=a[i*n+l]*b[l*k+j];
- }
- }
- int main (int argc, char * argv[])
- {
- //matrices A (m x n), B(n x k), C(m x k), D(k x l) and E (m x l)
- const int m=11;
- const int n=11;
- const int k=11;
- const int l=11;
- cudaError_t cudaStat;
- cublasStatus_t stat;
- cublasHandle_t handle;
- //initialization
- stat = cublasCreate(&handle);
- if (stat != CUBLAS_STATUS_SUCCESS)
- {
- printf ("CUBLAS initialization failed\n");
- return EXIT_FAILURE;
- }
- //GPU pointers
- float *dev_A, *dev_B, *dev_C, *dev_D, *dev_E;
- //CPU pointers
- float *host_A, *host_B, *host_C, *host_D, *host_E, *host_result;
- //CPU memory allocation
- //first matrix
- host_A=new float [m*n];
- for (int i=0; i<m; i++)
- for (int j=0; j<n; j++)
- if (i==j)
- host_A[i*n+j]=1;
- else
- host_A[i*n+j]=0;
- //the second
- host_B=new float [n*k];
- for (int i=0; i<n; i++)
- for (int j=0; j<k; j++)
- host_B[i*k+j]=i*k+j;
- //the third
- host_D=new float [k*l];
- for (int i=0; i<k; i++)
- for (int j=0; j<l; j++)
- host_D[i*l+j]=i*l+j;
- //result
- host_C=new float [m*k];
- host_E=new float [m*l];
- //GPU memory allocation
- cudaStat = cudaMalloc ((void**)&dev_A, m*n*sizeof(float));
- if (cudaStat != cudaSuccess) {
- printf ("device memory allocation failed\n");
- return EXIT_FAILURE;
- }
- cudaStat = cudaMalloc ((void**)&dev_B, n*k*sizeof(float));
- if (cudaStat != cudaSuccess) {
- printf ("device memory allocation failed\n");
- return EXIT_FAILURE;
- }
- cudaStat = cudaMalloc ((void**)&dev_C, m*k*sizeof(float));
- if (cudaStat != cudaSuccess) {
- printf ("device memory allocation failed\n");
- return EXIT_FAILURE;
- }
- cudaStat = cudaMalloc ((void**)&dev_D, k*l*sizeof(float));
- if (cudaStat != cudaSuccess) {
- printf ("device memory allocation failed\n");
- return EXIT_FAILURE;
- }
- cudaStat = cudaMalloc ((void**)&dev_E, m*l*sizeof(float));
- if (cudaStat != cudaSuccess) {
- printf ("device memory allocation failed\n");
- return EXIT_FAILURE;
- }
- //copying matriсes
- cudaMemcpy(dev_B, host_B, n*k*sizeof(float), cudaMemcpyHostToDevice);
- cudaMemcpy(dev_A, host_A, m*n*sizeof(float), cudaMemcpyHostToDevice);
- cudaMemcpy(dev_D, host_D, k*l*sizeof(float), cudaMemcpyHostToDevice);
- float result[2];
- //GPU multiplication
- //scalars
- float scal[2];
- scal[0]=1;
- scal[1]=0;
- stat = cublasSgemm(handle,
- CUBLAS_OP_N, CUBLAS_OP_N,
- k, n, m,
- scal, // alpha (1)
- dev_B, k, // ?
- dev_A, n, // ?
- (scal+1), // beta (0)
- dev_C, k); // ?
- if (stat != CUBLAS_STATUS_SUCCESS) {
- printf ("Multiplication failed. (1)\n");
- cudaFree (dev_A); cudaFree (dev_B);cudaFree (dev_C);
- delete[] host_A; delete[] host_B; delete[] host_C;
- cublasDestroy(handle);
- return EXIT_FAILURE;
- }
- //CPU multiplication
- mMul(m,n,k,host_A,host_B,host_C);
- cudaThreadSynchronize();
- //copying result from device to host
- host_result=new float[m*k];
- stat = cublasGetVector(m*k, sizeof(float), dev_C, 1, host_result, 1);
- if (stat != CUBLAS_STATUS_SUCCESS) {
- printf ("data upload failed\n");
- cudaFree (dev_A); cudaFree (dev_B); cudaFree (dev_C);
- delete[] host_A; delete[] host_B; delete[] host_C;
- delete[] host_result;
- cublasDestroy(handle);
- return EXIT_FAILURE;
- }
- //comparing
- result[0]=0;
- result[1]=0;
- bool flag = false;
- for (int i=0; i<m*k; i++)
- {
- result[0]+=host_result[i];
- result[1]+=host_C[i];
- if (abs(host_result[i] - host_C[i]) > 0.001)
- {
- flag=true;
- break;
- }
- }
- //if(result[0]!=result[1])
- if (flag)
- printf("Device: %f; Host: %f \n):\n", result[0], result[1]);
- else
- printf("(:\n");
- //The second multiplication
- //CPU
- stat = cublasSgemm(handle,
- CUBLAS_OP_N, CUBLAS_OP_N,
- l, k, m,
- scal, // alpha (1)
- dev_D, l, // ?
- dev_C, k, // ?
- (scal+1), // beta (0)
- dev_E, l); // ?
- if (stat != CUBLAS_STATUS_SUCCESS) {
- printf ("Multiplication failed (2)\n");
- cudaFree (dev_A);
- cudaFree (dev_B);
- cudaFree (dev_C);
- delete[] host_A;
- delete[] host_B;
- delete[] host_C;
- cublasDestroy(handle);
- return EXIT_FAILURE;
- }
- //CPU multiplication
- mMul(m,k,l,host_C,host_D,host_E);
- //getting result
- float *host_result1=new float[m*l];
- cudaThreadSynchronize();
- //copying result to host
- stat = cublasGetVector(m*l, sizeof(float), dev_E, 1, host_result1, 1);
- if (stat != CUBLAS_STATUS_SUCCESS) {
- printf ("data upload failed\n");
- cudaFree (dev_A); cudaFree (dev_B); cudaFree (dev_C);
- delete[] host_A; delete[] host_B; delete[] host_C;
- delete[] host_result;
- cublasDestroy(handle);
- return EXIT_FAILURE;
- }
- //comparing
- result[0]=0;
- result[1]=0;
- for (int i=0; i<l*m; i++)
- {
- result[1]+=host_E[i];
- result[0]+=host_result1[i];
- if (abs(host_E[i] - host_result1[i]) > 0.001)
- {
- flag=true;
- }
- }
- //if(result[0]!=result[1])
- if (flag)
- printf("Device: %f; Host: %f \n):\n", result[0], result[1]);
- else
- printf("(:\n");
- delete[] host_A;
- delete[] host_B;
- delete[] host_C;
- cudaFree (dev_A);
- cudaFree (dev_B);
- cudaFree (dev_C);
- return (EXIT_SUCCESS);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement