Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * gpuPdist: pairwise Euclidean distances between the rows of `in`.
 *
 *   in   n rows of m floats, row-major: in[row * m + feature].
 *   out  n x n row-major matrix; on return
 *        out[i * n + j] = sqrt( sum_k (in[i*m + k] - in[j*m + k])^2 ).
 *   n    number of rows (points).
 *   m    number of floats per row (features).
 *
 * Launch layout: the 16x16 shared tiles are indexed directly with threadIdx,
 * so blockDim must be (16, 16). Block (bx, by) produces the output tile whose
 * rows start at by*16 and whose columns start at bx*16, so the grid should
 * span ceil(n / 16) blocks in both x and y.
 * Shared memory: two static float tiles (16x16 and 16x17).
 *
 * n and m do not have to be multiples of 16: tile slots that fall outside the
 * input are filled with 0 in BOTH tiles (so they add nothing to the sum), and
 * threads whose output element lies outside the n x n matrix do not store.
 */
__global__ void
gpuPdist(float *out, float *in, int n, int m){
    const int TILE = 16;

    __shared__ float Ys[TILE][TILE];
    // +1 padding on the inner dimension reduces shared-memory bank conflicts
    // caused by the transposed store Xs[tx][ty] below.
    __shared__ float Xs[TILE][TILE + 1];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Output element owned by this thread.
    const int outRow = (int)blockIdx.y * TILE + ty;
    const int outCol = (int)blockIdx.x * TILE + tx;

    // Input rows this thread helps stage into shared memory.
    const int yLoadRow = outRow;                        // row for Ys[ty][*]
    const int xLoadRow = (int)blockIdx.x * TILE + ty;   // row for Xs[*][ty]

    float s = 0.0f;

    // Walk the feature dimension one 16-wide tile at a time.
    for (int kBase = 0; kBase < m; kBase += TILE) {
        const int  feat   = kBase + tx;
        const bool featOk = feat < m;

        // size_t offsets: row * m can exceed the range of int for large inputs.
        Ys[ty][tx] = (featOk && yLoadRow < n)
                         ? in[(size_t)yLoadRow * m + feat] : 0.0f;
        // Stored transposed so the inner loop reads Xs[k][tx].
        Xs[tx][ty] = (featOk && xLoadRow < n)
                         ? in[(size_t)xLoadRow * m + feat] : 0.0f;

        // Every thread reaches both barriers: the guards above only select
        // the value stored, they never skip the synchronisation.
        __syncthreads();

        for (int k = 0; k < TILE; k++) {
            const float d = Ys[ty][k] - Xs[k][tx];
            s += d * d;
        }

        // Do not overwrite the tiles while other threads are still reading.
        __syncthreads();
    }

    // Only threads that map to a real (row, col) pair write a result.
    if (outRow < n && outCol < n) {
        out[(size_t)outRow * n + outCol] = sqrtf(s);
    }
}
Add Comment
Please, Sign In to add comment