Cogli

supplementary code

Jan 4th, 2016
165
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 6.72 KB | None | 0 0
  1. ==========================      main.cpp              =============================
  2.  
  3. #include "cuda_runtime.h"
  4. #include "device_launch_parameters.h"
  5. #include <stdlib.h>
  6. #include <stdio.h>
  7. //#include <time.h>
  8.  
  9. // N must equal to M, and both must be even numbers
  10. #define N 256
  11. #define M 256
  12.  
  13. void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
  14. {
  15.     FILE *stream;
  16.     stream = fopen(name, "wb");
  17.  
  18.     float data = 202021.25f;
  19.     fwrite(&data, sizeof(float), 1, stream);
  20.     fwrite(&w,    sizeof(w), 1, stream);
  21.     fwrite(&h,    sizeof(h), 1, stream);
  22.  
  23.     // added by Robert
  24.     double din  = 0.0;
  25.     double dout = 0.0;
  26.  
  27.     for (int i = 0; i < h; ++i)
  28.         for (int j = 0; j < w; ++j)
  29.         {
  30.             const int pos = j + i * h;
  31.             fwrite(in  + pos, sizeof(float),  1, stream);
  32.             fwrite(out + pos, sizeof(float), 1, stream);
  33.  
  34.             din  += in[pos];
  35.             dout += out[pos];
  36.         }
  37.  
  38.     fclose(stream);
  39.     printf("din  sum = %f\n", din);
  40.     printf("dout sum = %f\n", dout);
  41. }
  42.  
  43. void cuIDCT(float *b, int n, int m, float *a);
  44.  
  45. int main()
  46. {
  47.     // host memory allocation
  48.     float *h_in   = new float [N * M];
  49.     float *h_out  = new float [N * M];
  50.     float *h_temp = new float [N * M];
  51.  
  52.     // input data initiliazation
  53.     // uncomment below line if you want a different initilization each time
  54.     //srand((unsigned)time(NULL));
  55.     for (int i = 0; i < N * M; i++)
  56.     {
  57.         h_in[i]   = (float)rand()/(float)RAND_MAX;
  58.         h_out[i]  = h_in[i];
  59.         h_temp[i] = h_in[i];
  60.     }
  61.  
  62.     // cuIDCT(h_in, N, M, h_out);
  63.  
  64.     // iteratively use cuIDCT
  65.     for (int i = 0; i < 4; i++)
  66.     {
  67.         if (i % 2 == 0)
  68.             cuIDCT(h_out, N, M, h_temp);
  69.         else
  70.             cuIDCT(h_temp, N, M, h_out);
  71.     }
  72.  
  73.     // write data, for further visualization using MATLAB
  74.     WriteDataFile("test.flo", N, M, h_in, h_out);
  75.  
  76.     // cleanup
  77.     delete [] h_in;
  78.     delete [] h_out;
  79.     delete [] h_temp;
  80.     cudaDeviceReset();
  81. }
  82.  
  83.  
  84. ==========================      cuIDCT.cu              =============================
  85. #include <stdio.h>
  86. #include <stdlib.h>
  87. #include <cuda.h>
  88. #include <cufft.h>
  89. #include <cuComplex.h>
  90. #include <helper_cuda.h>
  91. #include "assert.h"
  92.  
  93. // round up n/m
  94. inline int iDivUp(int n, int m)
  95. {
  96.     return (n + m - 1) / m;
  97. }
  98.  
  99. typedef cufftComplex complex;
  100.  
  101. #define PI 3.1415926535897932384626433832795028841971693993751
  102.  
  103. #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
  104. inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
  105. {
  106.     if( CUFFT_SUCCESS != err) {
  107.         fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
  108.             _cudaGetErrorEnum(err)); \
  109.             cudaDeviceReset(); assert(0); \
  110.     }
  111. }
  112.  
  113.  
  114. __global__
  115.     void idct_ComputeWeightsKernel(const int n, complex *ww)
  116. {
  117.     const int pos = threadIdx.x + blockIdx.x * blockDim.x;
  118.  
  119.     if (pos >= n) return;
  120.  
  121.     ww[pos].x =  sqrtf(2*n) * cosf(pos*PI/(2*n));
  122.     ww[pos].y =  sqrtf(2*n) * sinf(pos*PI/(2*n));
  123. }
  124.  
  125. __global__
  126.     void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
  127. {
  128.     const int ix = threadIdx.x + blockIdx.x * blockDim.x;
  129.     const int iy = threadIdx.y + blockIdx.y * blockDim.y;
  130.     if (ix >= n || iy >= m) return;
  131.  
  132.     const int pos = ix + iy*n;
  133.  
  134.     // Compute precorrection factor
  135.     ww[0].x = ww[0].x / sqrtf(2);
  136.     ww[0].y = ww[0].y / sqrtf(2);
  137.  
  138.     y[iy + ix*m].x = ww[iy].x * b[pos];
  139.     y[iy + ix*m].y = ww[iy].y * b[pos];
  140. }
  141.  
  142. __global__
  143.     void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
  144. {
  145.     const int ix = threadIdx.x + blockIdx.x * blockDim.x;
  146.     const int iy = threadIdx.y + blockIdx.y * blockDim.y;
  147.     if (ix >= n || iy >= m) return;
  148.  
  149.     const int pos = ix + iy*n;
  150.  
  151.     yy[iy + ix*n].x = y[pos].x / (float) n;
  152.     yy[iy + ix*n].y = y[pos].y / (float) n;
  153. }
  154.  
  155. __global__
  156.     void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
  157. {
  158.     const int ix = threadIdx.x + blockIdx.x * blockDim.x;
  159.     const int iy = threadIdx.y + blockIdx.y * blockDim.y;
  160.     if (ix >= n || iy >= m) return;
  161.  
  162.     const int pos = ix + iy*n;
  163.  
  164.     // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
  165.     if (iy < n/2)
  166.     {
  167.         a[ix + 2*iy*n]     = yy[pos].x;
  168.         a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
  169.     }
  170. }
  171.  
  172. /**
  173. * a = idct(b), where a is of size [n m].
  174. * @param b, input array
  175. * @param n, first dimension of a
  176. * @param m, second dimension of a
  177. * @param a, output array
  178. */
  179. void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
  180. {
  181.     const int data_size   = n * m * sizeof(float);
  182.  
  183.     // device memory allocation
  184.     float *d_in, *d_out;
  185.     checkCudaErrors(cudaMalloc(&d_in,  data_size));
  186.     checkCudaErrors(cudaMalloc(&d_out, data_size));
  187.  
  188.     // transfer data from host to device
  189.     checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));
  190.  
  191.     // compute IDCT using CUDA
  192.     // begin============================================
  193.     // Compute weights
  194.     complex *ww;
  195.     checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
  196.     dim3 threads(256);
  197.     dim3 blocks(iDivUp(n, threads.x));
  198.     idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
  199.  
  200.     complex *y;
  201.     complex *yy;
  202.     cufftHandle plan;
  203.  
  204.     dim3 threads1(32, 6);
  205.     dim3 blocks2(iDivUp(n,   threads1.x), iDivUp(m, threads1.y)); // for even case
  206.  
  207.     int Length[1] = {m}; // for each IFFT, the length is m
  208.  
  209.     checkCudaErrors(cudaMalloc(&y,  n*m*sizeof(complex)));
  210.  
  211.     idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
  212.  
  213.     cufftSafeCall(cufftPlanMany(&plan, 1, Length,
  214.                                 Length, 1, m,
  215.                                 Length, 1, m, CUFFT_C2C, n));
  216.     cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]
  217.  
  218.     checkCudaErrors(cudaMalloc(&yy,  n*m*sizeof(complex)));
  219.     Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
  220.     Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
  221.     // end============================================
  222.  
  223.     // transfer result from device to host
  224.     checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));
  225.  
  226.     // cleanup
  227.     cufftDestroy(plan);
  228.     checkCudaErrors(cudaFree(ww));
  229.     checkCudaErrors(cudaFree(y));
  230.     checkCudaErrors(cudaFree(yy));
  231.     checkCudaErrors(cudaFree(d_in));
  232.     checkCudaErrors(cudaFree(d_out));
  233. }
  234.  
  235.  
  236. ==========================      output result              =============================
  237. din  sum = 32924.227881
  238. dout sum = 21801.502256
  239.  
  240. din  sum = 32924.227881
  241. dout sum = 21801.502256
  242.  
  243. din  sum = 32924.227881
  244. dout sum = 21801.502256
  245.  
  246. din  sum = 32924.227881
  247. dout sum = 21801.502256
  248.  
  249. din  sum = 32924.227881
  250. dout sum = 16902.673393
  251.  
  252. din  sum = 32924.227881
  253. dout sum = 21794.061701
  254.  
  255. din  sum = 32924.227881
  256. dout sum = 21801.502256
  257.  
  258. din  sum = 32924.227881
  259. dout sum = 21801.502256
  260.  
  261. din  sum = 32924.227881
  262. dout sum = 21785.359646
Add Comment
Please, Sign In to add comment