- CUDA kernel is slower than CPU
- ##MEX file testCPU.c:##
- #include "mex.h"
- void mexFunction( int nlhs, mxArray *plhs[],
- int nrhs, const mxArray *prhs[] ) {
- int i, varLen;
- unsigned char *vars, *output;
- vars = mxGetPr(prhs[0]);
- plhs[0] = mxCreateLogicalMatrix(2048000, 1);
- output = mxGetPr(plhs[0]);
- for (i=0;i<2048000;i++){
- output[i] = vars[i] & vars[2048000+i];
- }
- }
- mex testCPU.c
- vars = ~~(randi(2,2048000,2)-1);
- tic;testCPU(vars);toc;
- #CUDA file testGPU.cu#
- #include "mex.h"
- #include "cuda.h"
- __global__ void logical_and(unsigned char* in, unsigned char* out, int N) {
- int idx = blockIdx.x*blockDim.x+threadIdx.x;
- out[idx] = in[idx] && in[idx+N];
- }
- void mexFunction( int nlhs, mxArray *plhs[],
- int nrhs, const mxArray *prhs[] ) {
- int i;
- unsigned char *vars, *output, *gpu, *gpures;
- vars = (unsigned char*)mxGetData(prhs[0]);
- plhs[0] = mxCreateLogicalMatrix(2048000, 1);
- output = (unsigned char*)mxGetData(plhs[0]);
- cudaEvent_t start, stop;
- cudaEventCreate(&start);
- cudaEventCreate(&stop);
- float dt_ms;
- // input GPU malloc
- cudaEventRecord(start, 0);
- cudaMalloc( (void **) &gpu, sizeof(unsigned char)*4096000);
- cudaEventRecord(stop, 0);
- cudaEventSynchronize(stop);
- cudaEventElapsedTime(&dt_ms, start, stop);
- printf("GPU input malloc: %f ms, %in", dt_ms, cudaGetLastError());
- // output GPU malloc
- cudaEventRecord(start, 0);
- cudaMalloc( (void **) &gpures, sizeof(unsigned char)*2048000);
- cudaEventRecord(stop, 0);
- cudaEventSynchronize(stop);
- cudaEventElapsedTime(&dt_ms, start, stop);
- printf("GPU output malloc: %f ms, %in", dt_ms, cudaGetLastError());
- // copy from CPU to GPU
- cudaEventRecord(start, 0);
- cudaMemcpy( gpu, vars, sizeof(unsigned char)*4096000, cudaMemcpyHostToDevice);
- cudaEventRecord(stop, 0);
- cudaEventSynchronize(stop);
- cudaEventElapsedTime(&dt_ms, start, stop);
- printf("copy input from CPU to GPU: %f ms, %in", dt_ms, cudaGetLastError());
- dim3 dimBlock(32);
- printf("thread count: %in", dimBlock.x);
- dim3 dimGrid(2048000/dimBlock.x);
- printf("block count: %in", dimGrid.x);
- // --- KERNEL ---
- cudaEventRecord(start, 0);
- logical_and<<<dimGrid, dimBlock>>>(gpu, gpures, 2048000);
- cudaEventRecord(stop, 0);
- cudaEventSynchronize(stop);
- cudaEventElapsedTime(&dt_ms, start, stop);
- printf("GPU kernel: %f ms, %in", dt_ms, cudaGetLastError());
- // result from GPU to CPU
- cudaEventRecord(start, 0);
- cudaMemcpy( output, gpures, sizeof(unsigned char)*2048000, cudaMemcpyDeviceToHost );
- cudaEventRecord(stop, 0);
- cudaEventSynchronize(stop);
- cudaEventElapsedTime(&dt_ms, start, stop);
- printf("copy output from GPU to CPU: %f ms, %in", dt_ms, cudaGetLastError());
- cudaFree(gpu);
- cudaFree(gpures);
- }
- nvmex -f nvmexopts_9.bat testGPU.cu
- -I"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2include"
- -L"C:Program FilesNVIDIA GPU Computing ToolkitCUDAv4.2libx64" -lcudart -lcufft
- GPU input malloc: 0.772160 ms, 0
- GPU output malloc: 0.041728 ms, 0
- copy input from CPU to GPU: 1.494784 ms, 0
- thread count: 32
- block count: 64000
- *** GPU kernel: 3.761216 ms, 0 ***
- copy output from GPU to CPU: 1.203488 ms, 0
- __global__ void logical_and(uchar4* in, uchar4* out, int N) {
- int idx = blockIdx.x*blockDim.x+threadIdx.x;
- out[idx].x = in[idx].x & in[idx+N].x;
- out[idx].y = in[idx].y & in[idx+N].y;
- out[idx].z = in[idx].z & in[idx+N].z;
- out[idx].w = in[idx].w & in[idx+N].w;
- }
- uchar4 *gpu, *gpures;
- // 32 was worst, 64,128,256,512 were similar
- dim3 dimBlock(128);
- // block count is now 4xtimes smaller
- dim3 dimGrid(512000/dimBlock.x);
- GPU input malloc: 0.043360 ms, 0
- GPU output malloc: 0.038592 ms, 0
- copy input from CPU to GPU: 1.499584 ms, 0
- thread count: 128
- block count: 4000
- *** GPU kernel: 0.131296 ms, 0 ***
- copy output from GPU to CPU: 1.281120 ms, 0
- __global__ void logical_and(uchar4* in, uchar4* out, int N) {
- int idx = blockIdx.x*blockDim.x+threadIdx.x;
- out[idx].x = in[idx].x & in[idx+N].x;
- out[idx].y = in[idx].y & in[idx+N].y;
- out[idx].z = in[idx].z & in[idx+N].z;
- out[idx].w = in[idx].w & in[idx+N].w;
- }
- __global__ void logical_and(uchar4* in, uchar4* out, int N) {
- int idx = blockIdx.x*blockDim.x+threadIdx.x;
- uchar4 buff;
- buff.x = in[idx].x;
- buff.y = in[idx].y;
- buff.z = in[idx].z;
- buff.w = in[idx].w;
- buff.x &= in[idx+N].x;
- buff.y &= in[idx+N].y;
- buff.z &= in[idx+N].z;
- buff.w &= in[idx+N].w;
- out[idx].x = buff.x;
- out[idx].y = buff.y;
- out[idx].z = buff.z;
- out[idx].w = buff.w;
- }
- __global__ void logical_and(unsigned int* in, unsigned int* out, int N) {
- int idx = blockIdx.x*blockDim.x*chunksize+threadIdx.x;
- unsigned int buff[chunksize];
- #pragma unroll
- for ( int k = 0 ; k < chunksize ; k++ )
- buff[k] = in[ blockDim.x*k + idx ];
- #pragma unroll
- for ( int k = 0 ; k < chunksize ; k++ )
- buff[k] &= in[ blockDim.x*k + idx + N ];
- #pragma unroll
- for ( int k = 0 ; k < chunksize ; k++ )
- out[ blockDim.x*k + idx ] = buff[k];
- }
- #define chunksize 4