View difference between Paste ID: E60Rvu9j and
SHOW: | | - or go back to the newest paste.
1-
1+
 /*
  * gpuKernel: one thread per pixel (i = column, j = row); each in-image
  * thread writes exactly one float to aux.
  *
  * A pixel whose (2*rx+1) x (2*ry+1) window lies completely inside the
  * w x h picture runs 5 passes over a sub-window, incrementing c[] at the
  * absolute difference of two source bytes, and stores the maximum of g0*s
  * seen over the passes. Any other in-image pixel stores 0.
  *
  *   src, memPitch     source bytes and the row stride used to index them
  *   aux, memPitchAux  float output and its row pitch in bytes
  *   w, h              picture size in pixels
  *   Rng, sizeRng      Rng is read as a VEC2 holding (rx, ry); sizeRng is
  *                     not used by the visible code
  *   Dir, sizeDir      not used by the visible code
  *
  * Launch layout: 2D grid of 2D blocks. The grid may be larger than the
  * picture; threads outside it exit without touching memory.
  *
  * NOTE(review): jsB, jsE, isB, isE, swx, swy, jsD, isD, c and s are not
  * declared in this snippet, and g0 is never updated here - presumably that
  * code was elided from the paste. Confirm against the full source.
  */
 __global__ void gpuKernel
 (
   BYTE *src,float *aux, /* source, destination */
   size_t memPitch, /* memory pitch */
   size_t memPitchAux,
   int w,int h, /* size of picture  */
   BYTE *Rng,int sizeRng, /* ranges */
   BYTE *Dir,int sizeDir /* directions */
 )
 {
   int rx = ((VEC2*)Rng)->x; // range of calculations (sizeXofwindow = 2*rx+1)
   int ry = ((VEC2*)Rng)->y; // range of calculations (sizeYofwindow = 2*ry+1)

   int i = blockIdx.x * blockDim.x + threadIdx.x; // x-coordinate of pixel = column in devmem
   int j = blockIdx.y * blockDim.y + threadIdx.y; // y-coordinate of pixel = row in devmem

   // Threads beyond the picture have no element in aux; writing there
   // would run past the end of a row (and of the allocation).
   if ((i >= w) || (j >= h))
     return;

   // Address the output row through the byte pitch.
   float *auxRow = (float*)((char*)aux + (size_t)j * memPitchAux);

   int i0 = i-rx, i1 = i+rx; // the range of calculation for columns
   int j0 = j-ry, j1 = j+ry; // the range of calculation for rows

   float g = 0.0f, g0 = 0.0f;

   // Only pixels whose whole window is inside the picture are evaluated;
   // border pixels keep g == 0.
   if ((i0 >= 0) && (i1 < w) && (j0 >= 0) && (j1 < h))
   {
     for (int n = 0; n < 5; n++)
     {
       for (int js = jsB; js < swy-jsE; js++)
       {
         for (int is = isB; is < swx-isE; is++)
         {
           // Absolute difference between a window pixel and its neighbour
           // displaced by (isD, jsD) selects the bin to increment.
           int ic = (int) fabsf(src[(j0+js)*memPitch+i0+is]-
                                src[(j0+js+jsD)*memPitch+i0+is+isD]);
           c[ic] += 1.0f;
         }
       }

       g = fmaxf(g, g0*s);
     }
   }

   // No __syncthreads() here: the former barrier sat inside a divergent
   // branch and nothing after it depended on it.
   auxRow[i] = g;
 }
50
51
/*
 * Host driver (fragment): copies the image to the device, launches
 * gpuKernel, and copies the float result back into auxH.
 *
 * The "[..]" lines mark code the paste's author left out; data, w, h, Rng,
 * sRng, Dir and sDir are presumably set up there.
 */
int main()
{
[..]
  checkCudaErrors(cudaMemcpy2D(data->BufGPU[0],data->memPitch, /* to GPU (device) */
		data->BufCPU[0],data->imgPitch, /* from CPU (host) */
		data->imgWidth, data->imgHeight, /* size of data (image) */
		cudaMemcpyHostToDevice));
  // cudaThreadSynchronize() is deprecated; use the device-wide sync and
  // check its result.
  checkCudaErrors(cudaDeviceSynchronize());

[..]
  dim3 dimBlock(data->dimBLX,data->dimBLY,1);
  // NOTE(review): the x extent is derived from the byte pitch rather than
  // the picture width, and both divisions truncate, so a partial block of
  // rows at the bottom is never launched - confirm this is intended.
  dim3 dimGrid(data->memPitch/dimBlock.x ,data->imgHeight/dimBlock.y,1);

  size_t memPitch = data->memPitch;

  float *auxD;
  size_t auxDPitch;
  size_t auxHPitch;
  checkCudaErrors(cudaMallocPitch((void**)&auxD,&auxDPitch,w*sizeof(float),h));

  // The host copy is tightly packed: one row is exactly w floats.
  auxHPitch = w*sizeof(float);
  float *auxH = (float *) malloc(h*auxHPitch);

  gpuKernel<<<dimGrid,dimBlock>>> (data->BufGPU[0],auxD,
		                   memPitch,auxDPitch,w, h,
		                   Rng,sRng,
		                   Dir,sDir);
  // A launch returns no status itself: fetch launch-configuration errors
  // first, then synchronize to surface faults raised while the kernel ran.
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaMemcpy2D(auxH,auxHPitch,  // to CPU (host)
               auxD,auxDPitch,  // from GPU (device)
	       auxHPitch, h, // size of data (image)
               cudaMemcpyDeviceToHost));

  checkCudaErrors(cudaDeviceSynchronize());
[..]
}