Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ========================== main.cpp =============================
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdlib.h>
- #include <stdio.h>
- //#include <time.h>
- // N must equal to M, and both must be even numbers
- #define N 256
- #define M 256
- void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
- {
- FILE *stream;
- stream = fopen(name, "wb");
- float data = 202021.25f;
- fwrite(&data, sizeof(float), 1, stream);
- fwrite(&w, sizeof(w), 1, stream);
- fwrite(&h, sizeof(h), 1, stream);
- // added by Robert
- double din = 0.0;
- double dout = 0.0;
- for (int i = 0; i < h; ++i)
- for (int j = 0; j < w; ++j)
- {
- const int pos = j + i * h;
- fwrite(in + pos, sizeof(float), 1, stream);
- fwrite(out + pos, sizeof(float), 1, stream);
- din += in[pos];
- dout += out[pos];
- }
- fclose(stream);
- printf("din sum = %f\n", din);
- printf("dout sum = %f\n", dout);
- }
- void cuIDCT(float *b, int n, int m, float *a);
- int main()
- {
- // host memory allocation
- float *h_in = new float [N * M];
- float *h_out = new float [N * M];
- float *h_temp = new float [N * M];
- // input data initiliazation
- // uncomment below line if you want a different initilization each time
- //srand((unsigned)time(NULL));
- for (int i = 0; i < N * M; i++)
- {
- h_in[i] = (float)rand()/(float)RAND_MAX;
- h_out[i] = h_in[i];
- h_temp[i] = h_in[i];
- }
- // cuIDCT(h_in, N, M, h_out);
- // iteratively use cuIDCT
- for (int i = 0; i < 4; i++)
- {
- if (i % 2 == 0)
- cuIDCT(h_out, N, M, h_temp);
- else
- cuIDCT(h_temp, N, M, h_out);
- }
- // write data, for further visualization using MATLAB
- WriteDataFile("test.flo", N, M, h_in, h_out);
- // cleanup
- delete [] h_in;
- delete [] h_out;
- delete [] h_temp;
- cudaDeviceReset();
- }
- ========================== cuIDCT.cu =============================
- #include <stdio.h>
- #include <stdlib.h>
- #include <cuda.h>
- #include <cufft.h>
- #include <cuComplex.h>
- #include <helper_cuda.h>
- #include "assert.h"
- // round up n/m
- inline int iDivUp(int n, int m)
- {
- return (n + m - 1) / m;
- }
- typedef cufftComplex complex;
- #define PI 3.1415926535897932384626433832795028841971693993751
- #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
- inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
- {
- if( CUFFT_SUCCESS != err) {
- fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
- _cudaGetErrorEnum(err)); \
- cudaDeviceReset(); assert(0); \
- }
- }
- __global__
- void idct_ComputeWeightsKernel(const int n, complex *ww)
- {
- const int pos = threadIdx.x + blockIdx.x * blockDim.x;
- if (pos >= n) return;
- ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n));
- ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n));
- }
- __global__
- void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
- {
- const int ix = threadIdx.x + blockIdx.x * blockDim.x;
- const int iy = threadIdx.y + blockIdx.y * blockDim.y;
- if (ix >= n || iy >= m) return;
- const int pos = ix + iy*n;
- // Compute precorrection factor
- ww[0].x = ww[0].x / sqrtf(2);
- ww[0].y = ww[0].y / sqrtf(2);
- y[iy + ix*m].x = ww[iy].x * b[pos];
- y[iy + ix*m].y = ww[iy].y * b[pos];
- }
- __global__
- void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
- {
- const int ix = threadIdx.x + blockIdx.x * blockDim.x;
- const int iy = threadIdx.y + blockIdx.y * blockDim.y;
- if (ix >= n || iy >= m) return;
- const int pos = ix + iy*n;
- yy[iy + ix*n].x = y[pos].x / (float) n;
- yy[iy + ix*n].y = y[pos].y / (float) n;
- }
- __global__
- void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
- {
- const int ix = threadIdx.x + blockIdx.x * blockDim.x;
- const int iy = threadIdx.y + blockIdx.y * blockDim.y;
- if (ix >= n || iy >= m) return;
- const int pos = ix + iy*n;
- // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
- if (iy < n/2)
- {
- a[ix + 2*iy*n] = yy[pos].x;
- a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
- }
- }
- /**
- * a = idct(b), where a is of size [n m].
- * @param b, input array
- * @param n, first dimension of a
- * @param m, second dimension of a
- * @param a, output array
- */
- void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
- {
- const int data_size = n * m * sizeof(float);
- // device memory allocation
- float *d_in, *d_out;
- checkCudaErrors(cudaMalloc(&d_in, data_size));
- checkCudaErrors(cudaMalloc(&d_out, data_size));
- // transfer data from host to device
- checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));
- // compute IDCT using CUDA
- // begin============================================
- // Compute weights
- complex *ww;
- checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
- dim3 threads(256);
- dim3 blocks(iDivUp(n, threads.x));
- idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);
- complex *y;
- complex *yy;
- cufftHandle plan;
- dim3 threads1(32, 6);
- dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case
- int Length[1] = {m}; // for each IFFT, the length is m
- checkCudaErrors(cudaMalloc(&y, n*m*sizeof(complex)));
- idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);
- cufftSafeCall(cufftPlanMany(&plan, 1, Length,
- Length, 1, m,
- Length, 1, m, CUFFT_C2C, n));
- cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]
- checkCudaErrors(cudaMalloc(&yy, n*m*sizeof(complex)));
- Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
- Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
- // end============================================
- // transfer result from device to host
- checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));
- // cleanup
- cufftDestroy(plan);
- checkCudaErrors(cudaFree(ww));
- checkCudaErrors(cudaFree(y));
- checkCudaErrors(cudaFree(yy));
- checkCudaErrors(cudaFree(d_in));
- checkCudaErrors(cudaFree(d_out));
- }
- ========================== output result =============================
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 16902.673393
- din sum = 32924.227881
- dout sum = 21794.061701
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 21801.502256
- din sum = 32924.227881
- dout sum = 21785.359646
Add Comment
Please, Sign In to add comment