Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.78 KB | None | 0 0
  // Host-side driver fragment: allocate device buffers, upload A, run the
  // LaplaceDifference kernel on a 2-D grid, and download the result into B.
  // d_A, d_out, A, B, h, BLOCK_SIZE and checkCudaErrors are declared outside
  // this fragment.

  // Size in bytes of one h-by-h float matrix. Widen to size_t before
  // multiplying so that h * h is not evaluated in a narrower integer type.
  const size_t matrixBytes = static_cast<size_t>(h) * h * sizeof(float);

  checkCudaErrors(cudaMalloc((void**)&d_A, matrixBytes));
  checkCudaErrors(cudaMalloc((void**)&d_out, matrixBytes));
  checkCudaErrors(cudaMemcpy(d_A, A, matrixBytes, cudaMemcpyHostToDevice));

  // Use constructor syntax. `dim3 v = (a, b);` is the comma operator: it
  // discards `a` and builds dim3(b, 1, 1), i.e. a 1-D configuration, so the
  // kernel's y index would never leave row 0.
  dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
  // Ceil-divide so a partial tile at the right/bottom edge is still covered.
  dim3 gridSize((h + BLOCK_SIZE - 1) / BLOCK_SIZE,
                (h + BLOCK_SIZE - 1) / BLOCK_SIZE);

  LaplaceDifference<<<gridSize, blockSize>>>(d_A, h, d_out);
  // A kernel launch returns nothing; an invalid launch configuration is only
  // reported through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());

  // Blocking copy on the default stream: it waits for the kernel to finish,
  // and any execution fault inside the kernel surfaces through this call.
  checkCudaErrors(cudaMemcpy(B, d_out, matrixBytes, cudaMemcpyDeviceToHost));
  9.  
  // Body fragment of the 4-neighbour averaging stencil. A (input), P (output)
  // and h are not declared here; presumably they are the kernel's parameters.
  // Both arrays are indexed as row * h + col. Cells outside the h-by-h domain
  // are treated as 0.
  // NOTE(review): A_ds is sized by BLOCK_SIZE, so this code assumes the launch
  // uses blockDim.x == blockDim.y == BLOCK_SIZE -- confirm at the call site.

  // Keep thread/block sizes in signed ints so the halo arithmetic below can go
  // negative instead of relying on unsigned wraparound.
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int bw = blockDim.x;
  const int bh = blockDim.y;

  // Global column (idx) and row (idy) handled by this thread.
  const int idx = blockIdx.x * bw + tx;
  const int idy = blockIdx.y * bh + ty;

  // Tile plus a one-cell border on every side.
  __shared__ float A_ds[BLOCK_SIZE + 2][BLOCK_SIZE + 2];

  const int n = 1;  // halo width

  // The grid is ceil-divided, so threads in the last tile may fall outside the
  // matrix. They must not touch global memory but still have to reach the
  // barrier below, hence guards rather than an early return.
  const bool col_ok = idx < h;
  const bool row_ok = idy < h;

  // Global coordinates of the cell one full tile to the left/right/up/down of
  // this thread; only the threads selected below actually use them.
  const int halo_index_left = idx - bw;
  const int halo_index_right = idx + bw;
  const int halo_index_up = idy - bh;
  const int halo_index_down = idy + bh;

  // Load data in shared memory: tile interior.
  A_ds[n + ty][n + tx] = (row_ok && col_ok) ? A[idy * h + idx] : 0.0f;

  // Left border: the last n columns of threads fetch the rightmost n columns
  // of the neighbouring tile on the left.
  if (tx >= bw - n) {
    const bool ok = row_ok && halo_index_left >= 0 && halo_index_left < h;
    A_ds[ty + n][tx - (bw - n)] = ok ? A[idy * h + halo_index_left] : 0.0f;
  }
  // Right border: the first n columns of threads fetch the leftmost n columns
  // of the neighbouring tile on the right.
  if (tx < n) {
    const bool ok = row_ok && halo_index_right < h;
    A_ds[ty + n][bw + n + tx] = ok ? A[idy * h + halo_index_right] : 0.0f;
  }
  // Top border: the last n rows of threads fetch the bottom n rows of the tile
  // above.
  if (ty >= bh - n) {
    const bool ok = col_ok && halo_index_up >= 0 && halo_index_up < h;
    A_ds[ty - (bh - n)][tx + n] = ok ? A[halo_index_up * h + idx] : 0.0f;
  }
  // Bottom border: the first n rows of threads fetch the top n rows of the
  // tile below.
  if (ty < n) {
    const bool ok = col_ok && halo_index_down < h;
    A_ds[bh + n + ty][tx + n] = ok ? A[halo_index_down * h + idx] : 0.0f;
  }

  // Every thread of the block reaches this barrier; after it the whole tile
  // and its border are visible to all threads.
  __syncthreads();

  // Average of the four direct neighbours. 0.25f keeps the arithmetic in
  // single precision (a bare 0.25 would promote the expression to double).
  if (row_ok && col_ok) {
    P[idy * h + idx] = 0.25f * (A_ds[ty + n - 1][tx + n] +
                                A_ds[ty + n + 1][tx + n] +
                                A_ds[ty + n][tx + n - 1] +
                                A_ds[ty + n][tx + n + 1]);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement