// gpuKernel: per-pixel CUDA kernel, followed by a fragment of the host code that launches it.
//
// NOTE(review): this text is damaged and will not compile as it stands:
//   * everything is collapsed onto one physical line, so the first `//` comment
//     swallows the remainder of the line;
//   * the kernel body breaks off inside `for(int js=jsB; js...` and the text resumes
//     in the middle of a host-side cudaMemcpy2D call (`...BufGPU[0],data->memPitch`);
//     the rest of the kernel and the header of the host function are not visible;
//   * `gpuKernel<<>>` carries no execution configuration - presumably it was
//     `<<<dimGrid,dimBlock>>>`; confirm against the original source;
//   * `[..]` appears to mark code that was elided on purpose.
//
// Kernel parameters (taken from the inline comments and the visible uses):
//   src, aux       - source and destination buffers
//   memPitch       - memory pitch; the host passes data->memPitch alongside
//                    data->BufGPU[0], so presumably the pitch of src - confirm
//   memPitchAux    - pitch of aux; divided by sizeof(float) to form the row stride
//                    of idx (the host passes the pitch returned by cudaMallocPitch)
//   w, h           - size of the picture
//   Rng, sizeRng   - ranges; Rng is read as a VEC2 whose x / y give rx / ry, the
//                    half-sizes of a (2*rx+1) x (2*ry+1) window
//   Dir, sizeDir   - directions (not used in the visible part of the kernel)
//
// Visible kernel logic: i / j are the thread's global column / row, idx is the
// element index of pixel (i, j) in aux, and the computation is entered only when
// the whole window [i-rx, i+rx] x [j-ry, j+ry] lies inside the w x h picture.
//
// Visible host logic: copy the image host -> device with cudaMemcpy2D, build
// dimBlock / dimGrid, allocate a pitched float buffer auxD on the device and a
// tightly packed float buffer auxH on the host, launch gpuKernel, then copy
// auxD back into auxH with cudaMemcpy2D.
//
// NOTE(review): dimGrid is built with truncating integer division and takes its
//   x extent from data->memPitch rather than the image width; if the sizes are not
//   multiples of the block dimensions some pixels get no thread - verify.
// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in favour
//   of cudaDeviceSynchronize(), and no cudaGetLastError() check is visible after
//   the kernel launch.
// NOTE(review): `g = 0.0` assigns a double literal to a float that was already
//   initialised with 0.0f.
__global__ void gpuKernel ( BYTE *src,float *aux, /* source, destination */ size_t memPitch, /* memory pitch */ size_t memPitchAux, int w,int h, /* size of picture */ BYTE *Rng,int sizeRng, /* ranges */ BYTE *Dir,int sizeDir /* directions */ ) { int rx = ((VEC2*)Rng)->x; // range of calculations (sizeXofwindow = 2*rx+1) int ry = ((VEC2*)Rng)->y; // range of calculations (sizeYofwindow = 2*ry+1) int i = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in devmem int j = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in devmem int idx = j * memPitchAux/sizeof(float) + i; int i0 = i-rx, i1 = i+rx; // the range of calculation for columns int j0 = j-ry, j1 = j+ry; // the range of calculation for rows float g=0.0f ,g0=0.0f; if((i0>=0) && (i1< w) && (j0>=0) && (j1< h)) { g = 0.0; for(int n=0; n<5; n++) { for(int js=jsB; jsBufGPU[0],data->memPitch, /* to GPU (device) */ data->BufCPU[0],data->imgPitch, /* from CPU (host) */ data->imgWidth, data->imgHeight, /* size of data (image) */ cudaMemcpyHostToDevice)); cudaThreadSynchronize(); [..] dim3 dimBlock(data->dimBLX,data->dimBLY,1); dim3 dimGrid(data->memPitch/dimBlock.x ,data->imgHeight/dimBlock.y,1); size_t memPitch = data->memPitch; float *auxD; size_t auxDPitch; size_t auxHPitch; checkCudaErrors(cudaMallocPitch((void**)&auxD,&auxDPitch,w*sizeof(float),h)); auxHPitch = w*sizeof(float); float *auxH = (float *) malloc(h*auxHPitch); gpuKernel<<>> (data->BufGPU[0],auxD, memPitch,auxDPitch,w, h, Rng,sRng, Dir,sDir); cudaThreadSynchronize(); checkCudaErrors(cudaMemcpy2D(auxH,auxHPitch, // to CPU (host) auxD,auxDPitch, // from GPU (device) auxHPitch, h, // size of data (image) cudaMemcpyDeviceToHost)); cudaThreadSynchronize(); [..] }