Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jul 15th, 2012  |  syntax: None  |  size: 5.12 KB  |  hits: 8  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. CUDA kernel is slower than CPU
  2. ##MEX file testCPU.c:##
  3.  
  4. #include "mex.h"
  5. void mexFunction( int nlhs, mxArray *plhs[],
  6.         int nrhs, const mxArray *prhs[] ) {
  7.  
  8.     int i, varLen;
  9.     unsigned char *vars, *output;
  10.  
  11.     vars = mxGetPr(prhs[0]);
  12.     plhs[0] = mxCreateLogicalMatrix(2048000, 1);
  13.     output = mxGetPr(plhs[0]);
  14.     for (i=0;i<2048000;i++){
  15.         output[i] = vars[i] & vars[2048000+i];
  16.     }
  17. }
  18.        
  19. mex testCPU.c
  20.        
  21. vars = ~~(randi(2,2048000,2)-1);
  22.        
  23. tic;testCPU(vars);toc;
  24.        
  25. #CUDA file testGPU.cu#
  26. #include "mex.h"
  27. #include "cuda.h"
  28.  
  29. __global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
  30.     int idx = blockIdx.x*blockDim.x+threadIdx.x;
  31.     out[idx] = in[idx] && in[idx+N];
  32. }
  33.  
  34.  
  35. void mexFunction( int nlhs, mxArray *plhs[],
  36.         int nrhs, const mxArray *prhs[] ) {
  37.  
  38.     int i;
  39.     unsigned char *vars, *output, *gpu, *gpures;
  40.  
  41.     vars = (unsigned char*)mxGetData(prhs[0]);
  42.  
  43.     plhs[0] = mxCreateLogicalMatrix(2048000, 1);
  44.     output = (unsigned char*)mxGetData(plhs[0]);      
  45.  
  46.     cudaEvent_t start, stop;
  47.     cudaEventCreate(&start);
  48.     cudaEventCreate(&stop);
  49.     float dt_ms;
  50.  
  51.     // input GPU malloc
  52.     cudaEventRecord(start, 0);
  53.     cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
  54.     cudaEventRecord(stop, 0);
  55.     cudaEventSynchronize(stop);
  56.     cudaEventElapsedTime(&dt_ms, start, stop);
  57.     printf("GPU input malloc: %f ms, %in", dt_ms, cudaGetLastError());
  58.  
  59.     // output GPU malloc
  60.     cudaEventRecord(start, 0);
  61.     cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
  62.     cudaEventRecord(stop, 0);
  63.     cudaEventSynchronize(stop);
  64.     cudaEventElapsedTime(&dt_ms, start, stop);
  65.     printf("GPU output malloc: %f ms, %in", dt_ms, cudaGetLastError());
  66.  
  67.     // copy from CPU to GPU
  68.     cudaEventRecord(start, 0);
  69.     cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
  70.     cudaEventRecord(stop, 0);
  71.     cudaEventSynchronize(stop);
  72.     cudaEventElapsedTime(&dt_ms, start, stop);
  73.     printf("copy input from CPU to GPU: %f ms, %in", dt_ms, cudaGetLastError());
  74.  
  75.     dim3 dimBlock(32);
  76.     printf("thread count: %in", dimBlock.x);
  77.     dim3 dimGrid(2048000/dimBlock.x);
  78.     printf("block count: %in", dimGrid.x);
  79.  
  80.     // --- KERNEL ---
  81.     cudaEventRecord(start, 0);
  82.     logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
  83.     cudaEventRecord(stop, 0);
  84.     cudaEventSynchronize(stop);
  85.     cudaEventElapsedTime(&dt_ms, start, stop);
  86.     printf("GPU kernel: %f ms, %in", dt_ms, cudaGetLastError());
  87.  
  88.     // result from GPU to CPU
  89.     cudaEventRecord(start, 0);
  90.     cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
  91.     cudaEventRecord(stop, 0);
  92.     cudaEventSynchronize(stop);
  93.     cudaEventElapsedTime(&dt_ms, start, stop);
  94.     printf("copy output from GPU to CPU: %f ms, %in", dt_ms, cudaGetLastError());
  95.  
  96.  
  97.     cudaFree(gpu);
  98.     cudaFree(gpures);
  99.  
  100. }
  101.        
  102. nvmex -f nvmexopts_9.bat testGPU.cu
  103. -I"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2include"
  104. -L"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2libx64" -lcudart -lcufft
  105.        
  106. GPU input malloc: 0.772160 ms, 0
  107. GPU output malloc: 0.041728 ms, 0
  108. copy input from CPU to GPU: 1.494784 ms, 0
  109. thread count: 32
  110. block count: 64000
  111. *** GPU kernel: 3.761216 ms, 0 ***
  112. copy output from GPU to CPU: 1.203488 ms, 0
  113.        
  114. __global__ void logical_and(uchar4* in, uchar4* out, int N) {
  115.     int idx = blockIdx.x*blockDim.x+threadIdx.x;
  116.     out[idx].x = in[idx].x & in[idx+N].x;
  117.     out[idx].y = in[idx].y & in[idx+N].y;
  118.     out[idx].z = in[idx].z & in[idx+N].z;
  119.     out[idx].w = in[idx].w & in[idx+N].w;
  120. }
  121.        
  122. uchar4 *gpu, *gpures;
  123.  
  124. // 32 was worst, 64,128,256,512 were similar
  125. dim3 dimBlock(128);
  126. // block count is now 4xtimes smaller
  127. dim3 dimGrid(512000/dimBlock.x);
  128.        
  129. GPU input malloc: 0.043360 ms, 0
  130. GPU output malloc: 0.038592 ms, 0
  131. copy input from CPU to GPU: 1.499584 ms, 0
  132. thread count: 128
  133. block count: 4000
  134. *** GPU kernel: 0.131296 ms, 0 ***
  135. copy output from GPU to CPU: 1.281120 ms, 0
  136.        
  137. __global__ void logical_and(uchar4* in, uchar4* out, int N) {
  138.     int idx = blockIdx.x*blockDim.x+threadIdx.x;
  139.  
  140.     out[idx].x = in[idx].x & in[idx+N].x;
  141.     out[idx].y = in[idx].y & in[idx+N].y;
  142.     out[idx].z = in[idx].z & in[idx+N].z;
  143.     out[idx].w = in[idx].w & in[idx+N].w;
  144. }
  145.        
  146. __global__ void logical_and(uchar4* in, uchar4* out, int N) {
  147.     int idx = blockIdx.x*blockDim.x+threadIdx.x;  
  148.     uchar4 buff;
  149.  
  150.     buff.x = in[idx].x;
  151.     buff.y = in[idx].y;
  152.     buff.z = in[idx].z;
  153.     buff.w = in[idx].w;
  154.  
  155.     buff.x &= in[idx+N].x;
  156.     buff.y &= in[idx+N].y;
  157.     buff.z &= in[idx+N].z;
  158.     buff.w &= in[idx+N].w;
  159.  
  160.     out[idx].x = buff.x;
  161.     out[idx].y = buff.y;
  162.     out[idx].z = buff.z;
  163.     out[idx].w = buff.w;
  164. }
  165.        
  166. __global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
  167.     int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
  168.     unsigned int buff[chunksize];
  169.     #pragma unroll
  170.     for ( int k = 0 ; k < chunksize ; k++ )
  171.         buff[k] = in[ blockDim.x*k + idx ];
  172.     #pragma unroll
  173.     for ( int k = 0 ; k < chunksize ; k++ )
  174.         buff[k] &= in[ blockDim.x*k + idx + N ];
  175.     #pragma unroll
  176.     for ( int k = 0 ; k < chunksize ; k++ )
  177.         out[ blockDim.x*k + idx ] = buff[k];
  178. }
  179.        
  180. #define chunksize 4