View difference between Paste ID: <a href="/E60Rvu9j">E60Rvu9j</a> and <a href="/post/view"></a>

View difference between Paste ID: E60Rvu9j and

SHOW: | | - or go back to the newest paste.


 /*
  * gpuKernel - one thread per pixel of a 2D launch (x -> column, y -> row).
  *
  * For every pixel whose (2*rx+1) x (2*ry+1) window lies completely inside
  * the w x h image, the kernel runs five passes that bin absolute differences
  * between a source sample and a displaced source sample into c[], folds
  * g0*s into a running maximum g, and stores g in aux. Pixels whose window
  * crosses the image border receive 0. Threads that fall outside the image
  * do nothing.
  *
  * Parameters:
  *   src, memPitch      source image and its row pitch; indexing below treats
  *                      the pitch as a count of BYTE elements
  *                      (assumes sizeof(BYTE) == 1 - TODO confirm)
  *   aux, memPitchAux   float destination and its row pitch in bytes
  *   w, h               image size in pixels
  *   Rng, sizeRng       reinterpreted as a VEC2 holding the half window sizes
  *   Dir, sizeDir       not referenced by the code shown here
  *
  * NOTE(review): jsB, jsE, isB, isE, swx, swy, jsD, isD, c and s are not
  * declared in the visible excerpt; they presumably come from code elided by
  * the author. g0 is never assigned in the visible code either.
  */
 __global__ void gpuKernel
(
  BYTE *src,float *aux, /* source, destination */
  size_t memPitch, /* memory pitch */
  size_t memPitchAux, 
  int w,int h, /* size of picture  */
  BYTE *Rng,int sizeRng, /* ranges */
  BYTE *Dir,int sizeDir /* directions */ 
)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in devmem
  int j = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in devmem

  // The grid may be larger than the image (e.g. sized from the pitch), so
  // threads outside the picture must not touch aux at all.
  if (i >= w || j >= h)
    return;

  int rx = ((VEC2*)Rng)->x; // range of calculations (sizeXofwindow = 2*rx+1)
  int ry = ((VEC2*)Rng)->y; // range of calculations (sizeYofwindow = 2*ry+1)

  // Pitched addressing: advance by whole rows in bytes, then index floats.
  float *auxRow = (float*)((char*)aux + (size_t)j * memPitchAux);

  int i0 = i-rx, i1 = i+rx; // the range of calculation for columns
  int j0 = j-ry, j1 = j+ry; // the range of calculation for rows

  // Border pixels: the window would leave the image, so the result is 0.
  if ((i0 < 0) || (i1 >= w) || (j0 < 0) || (j1 >= h))
  {
    auxRow[i] = 0.0f;
    return;
  }

  float g = 0.0f, g0 = 0.0f;

  for (int n = 0; n < 5; n++)
  {
    for (int js = jsB; js < swy-jsE; js++)
    {
      for (int is = isB; is < swx-isE; is++)
      {
        // Absolute difference between a sample and its displaced partner
        // selects the bin to increment.
        int ic = (int) fabsf(src[(j0+js)*memPitch+i0+is]-
                             src[(j0+js+jsD)*memPitch+i0+is+isD]);
        c[ic] += 1.0f;
      }
    }

    g = fmaxf(g, g0*s);
  }

  // No __syncthreads() here: it used to sit inside this divergent path, which
  // is undefined behaviour when only part of a block reaches it, and nothing
  // after the store depends on a block-wide barrier.
  auxRow[i] = g;
}

/*
 * Host driver (excerpt; the [..] markers are parts elided by the author).
 *
 * Uploads the source image, allocates a pitched float result buffer on the
 * device plus a tightly packed copy on the host, launches gpuKernel and copies
 * the result back. Every CUDA call, including the launch itself, is checked.
 *
 * NOTE(review): data, w, h, Rng, sRng, Dir and sDir are defined in the elided
 * parts. Rng and Dir are dereferenced inside the kernel, so they must be
 * device-accessible pointers - verify in the elided code.
 */
int main()
{
[..]
   checkCudaErrors(cudaMemcpy2D(data->BufGPU[0],data->memPitch, /* to GPU (device) */
		data->BufCPU[0],data->imgPitch, /* from CPU (host) */
		data->imgWidth, data->imgHeight, /* size of data (image) */
		cudaMemcpyHostToDevice));
  // cudaThreadSynchronize() is deprecated; use the device-wide sync and check it.
  checkCudaErrors(cudaDeviceSynchronize());
 
[..]
  dim3 dimBlock(data->dimBLX,data->dimBLY,1);
  // NOTE(review): both divisions truncate, so pixels in a partial last block
  // are not processed. Rounding up is only safe if the kernel rejects threads
  // with i >= w or j >= h before writing.
  dim3 dimGrid(data->memPitch/dimBlock.x ,data->imgHeight/dimBlock.y,1);

  size_t memPitch = data->memPitch;
 
  float *auxD;	
  size_t auxDPitch;
  size_t auxHPitch;
  checkCudaErrors(cudaMallocPitch((void**)&auxD,&auxDPitch,w*sizeof(float),h));
  // Zero the result buffer so pixels the grid does not reach read back as 0
  // instead of uninitialised device memory (all-zero bytes == 0.0f).
  checkCudaErrors(cudaMemset2D(auxD,auxDPitch,0,w*sizeof(float),h));

  auxHPitch = w*sizeof(float);
  float *auxH = (float *) malloc(h*auxHPitch);
  if (auxH == NULL)
  {
    // Host allocation failed: release the device buffer and bail out.
    cudaFree(auxD);
    return 1;
  }

  gpuKernel<<<dimGrid,dimBlock>>> (data->BufGPU[0],auxD,
		                   memPitch,auxDPitch,w, h,
		                   Rng,sRng,
		                   Dir,sDir);
  checkCudaErrors(cudaGetLastError());      // invalid launch configuration
  checkCudaErrors(cudaDeviceSynchronize()); // faults raised while the kernel ran

  checkCudaErrors(cudaMemcpy2D(auxH,auxHPitch,  // to CPU (host)
               auxD,auxDPitch,  // from GPU (device)
	       auxHPitch, h, // size of data (image)
               cudaMemcpyDeviceToHost));
  
  checkCudaErrors(cudaDeviceSynchronize());
[..]
}

1	-
1	+	__global__ void gpuKernel
2		(
3		BYTE *src,float *aux, /* source, destination */
4		size_t memPitch, /* memory pitch */
5		size_t memPitchAux,
6		int w,int h, /* size of picture */
7		BYTE *Rng,int sizeRng, /* ranges */
8		BYTE *Dir,int sizeDir /* directions */
9		)
10		{
11		int rx = ((VEC2*)Rng)->x; // range of calculations (sizeXofwindow = 2*rx+1)
12		int ry = ((VEC2*)Rng)->y; // range of calculations (sizeYofwindow = 2*ry+1)
13
14		int i = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in devmem
15		int j = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in devmem
16		int idx = j * memPitchAux/sizeof(float) + i;
17
18		int i0 = i-rx, i1 = i+rx; // the range of calculation for columns
19		int j0 = j-ry, j1 = j+ry; // the range of calculation for rows
20
21		float g=0.0f ,g0=0.0f;
22
23		if((i0>=0) && (i1< w) && (j0>=0) && (j1< h))
24		{
25		g = 0.0;
26
27		for(int n=0; n<5; n++)
28		{
29		for(int js=jsB; js<swy-jsE; js++)
30		{
31		for(int is=isB; is<swx-isE; is++)
32		{
33		int ic = (int) fabsf(src[(j0+js)*memPitch+i0+is]-
34		src[(j0+js+jsD)*memPitch+i0+is+isD]);
35		c[ic] += 1.0f;
36		}
37		}
38
39		g = fmaxf(g,g0*s);
40		}
41
42		aux[idx] = g;
43		__syncthreads();
44		}
45		else
46		{
47		aux[idx] = 0;
48		}
49		}
50
51		int main()
52		{
53		[..]
54		checkCudaErrors(cudaMemcpy2D(data->BufGPU[0],data->memPitch, /* to GPU (device) */
55		data->BufCPU[0],data->imgPitch, /* from CPU (host) */
56		data->imgWidth, data->imgHeight, /* size of data (image) */
57		cudaMemcpyHostToDevice));
58		cudaThreadSynchronize();
59
60		[..]
61		dim3 dimBlock(data->dimBLX,data->dimBLY,1);
62		dim3 dimGrid(data->memPitch/dimBlock.x ,data->imgHeight/dimBlock.y,1);
63
64		size_t memPitch = data->memPitch;
65
66		float *auxD;
67		size_t auxDPitch;
68		size_t auxHPitch;
69		checkCudaErrors(cudaMallocPitch((void**)&auxD,&auxDPitch,w*sizeof(float),h));
70
71		auxHPitch = w*sizeof(float);
72		float *auxH = (float *) malloc(h*auxHPitch);
73
74		gpuKernel<<<dimGrid,dimBlock>>> (data->BufGPU[0],auxD,
75		memPitch,auxDPitch,w, h,
76		Rng,sRng,
77		Dir,sDir);
78		cudaThreadSynchronize();
79
80		checkCudaErrors(cudaMemcpy2D(auxH,auxHPitch, // to CPU (host)
81		auxD,auxDPitch, // from GPU (device)
82		auxHPitch, h, // size of data (image)
83		cudaMemcpyDeviceToHost));
84
85		cudaThreadSynchronize();
86		[..]
87		}