Advertisement
Guest User

Untitled

a guest
Dec 25th, 2011
373
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Fortran 46.73 KB | None | 0 0
  1. !  coursework_2011.cuf
  2. !
  3. !  FUNCTIONS:
  4. !  COURSEW      - Entry point of console application.
  5. !
  6. !
  7. !
  8. !****************************************************************************
  9. !
  10. !  PROGRAM:  GPU_cSPH_TVD
  11. !
  12. !  PURPOSE:  Use GPU computing for cSPH-TVD method
  13. !
  14. !****************************************************************************
  15.  
  16. module thrust
  17.  
  18. interface thrustsort
  19. subroutine sort_int( input,N) bind(C,name="sort_int_wrapper")
  20. use iso_c_binding
  21. integer(c_int),device:: input(*)
  22. integer(c_int),value:: N
  23. end subroutine
  24.  
  25. subroutine sort_float( input,N) bind(C,name="sort_float_wrapper")
  26. use iso_c_binding
  27. real(c_float),device:: input(*)
  28. integer(c_int),value:: N
  29. end subroutine
  30.  
  31. subroutine sort_double( input,N) bind(C,name="sort_double_wrapper")
  32. use iso_c_binding
  33. real(c_double),device:: input(*)
  34. integer(c_int),value:: N
  35. end subroutine
  36.  
  37. end interface
  38.  
  39. end module thrust
  40.  
  41. module fluid
  42.   use iso_c_binding
  43.   implicit none
  44.   integer, parameter :: dimen = 1
  45.   integer, parameter :: grid  = 512
  46.   integer, parameter :: prk   = 4
  47.   type Fluids
  48.     ! density = P, energy = e
  49.     ! U = (U_x, U_y, U_z)
  50.     ! r = (x, y, z)
  51.     real(kind=prk) :: density(grid**dimen), energy(grid**dimen)
  52.     real(kind=prk) :: U(grid**dimen,dimen) !U_x(:), U_y(:), U_z(:) ! speed components
  53.     real(kind=prk) :: r(grid**dimen,dimen) !x(:), y(:), z(:)
  54.   end type
  55.  
  56.   type test
  57.     real, allocatable :: p(:)
  58.   end type
  59.  
  60.   type Fluidstemp
  61.     sequence
  62.     real(kind=prk) :: z, energy, density
  63.     real(kind=prk) :: U(dimen)
  64.     real(kind=prk) :: r(dimen)
  65.   end type
  66.  
  67.   type Parameters
  68.     real(kind=prk) dx ! center of cells = dx/2 + i*dx
  69.     integer Nx, Ny, Nz ! dimension number of cells, example: Nx = 10, Ny = 15, Nz = 20 -> [10x15x20]
  70.     integer N !total number of cells: [10x15x20] = 3000
  71.     real(kind=prk) monaghan_multiplier, smoothing_length
  72.     real(kind=prk) gamma, rho_0, c_0, p_0, cnst, eps, courant
  73.     real(kind=prk) :: nablaW0(3 ** dimen, dimen)
  74.     real(kind=prk) :: ta, tb, tc, minus
  75.     integer totalcells, neighbourscount
  76.     integer dimensions, dim2d, dim3d !1D, 2D or 3D
  77.     integer WNx, WNy
  78.   end type
  79.  
  80. end module fluid
  81.  
  82. module CUDASphTvd
  83.   use cudafor
  84.   use cudadevice
  85.   use thrust
  86.   use fluid
  87.   integer, parameter :: BLOCK_SIZE=16
  88.   type(Fluids), allocatable, device :: allfluid, allfluidnext, allfluidhalf, allforces !allfluidhalf = (allfluid+allfluidnext) / 2
  89.   type(Fluids), allocatable, device :: allfluxes(:)
  90.   real(kind=prk), allocatable, device :: time_steps(:)
  91.   logical, allocatable, device :: boolflags(:)
  92.   type(Parameters), constant :: params
  93.   real(kind=prk), allocatable, device :: tau
  94.   real(kind=prk), allocatable, device :: tautemp(:)
  95.   contains
  96.  
  97.   attributes(global) subroutine cuda_calculate_forces_sph_predictor(in, outforce)
  98.     type(Fluids), device :: in, outforce
  99.     integer :: i, j, k, idx, idx_k
  100.     !temporary variables for speed-up:
  101.     real(kind=prk) :: H_i, H_k, sqrU_i
  102.     !temporary 'fluid' variables:
  103.     real(kind=prk) :: i_density, i_energy, theta_k, tmp
  104.     real(kind=prk) :: Ui(dimen), i_impulse(dimen)
  105.     real(kind=prk) :: Uk(dimen), tmpW(dimen)
  106.  
  107.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  108.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  109.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  110.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  111.  
  112.     sqrU_i = 0.0
  113.     !calculate Fi:
  114.     !get fluid_i characteristics (save in temp):
  115.     if (in%density(idx) > params%eps) then
  116.       Ui(1:params%dimensions) = in%U(idx,:) / in%density(idx)
  117.     else
  118.       Ui = 0.0
  119.     endif
  120.    
  121.     i_density = in%density(idx)
  122.     i_energy  = in%energy(idx)
  123.     i_impulse = 0.0
  124.  
  125.     !calculate H_i
  126.     sqrU_i  = dot_product(Ui, Ui)
  127.     tmp     = 2.0 * ( (params%gamma - 1.0) * (i_energy - (i_density * sqrU_i) / 2.0) )
  128.     if (tmp > params%eps) then
  129.       H_i   = sqrt(tmp)
  130.     else
  131.       H_i   = 0.0
  132.     endif
  133. !print *, "H_i", tmp
  134.     !calculate sum(W_{ik}) and H_k and sum(Ui+Uk):
  135.     H_k     = 0.0
  136.     theta_k = 0.0
  137.     i_energy= 0.0
  138.  
  139.     !sum_{k} {Ui +Uk} = sum_{k} {Ui} + sum_{k} {Uk} = k * Ui + sum_{k} {Uk} :
  140.     do i_k = i - 1, i + 1
  141.       if ( (i_k >= 0) .AND. (i_k < params%Nz) ) then
  142.     do j_k = j - 1, j + 1
  143.       if ( (j_k >= 0) .AND. (j_k < params%Ny) ) then
  144.         do k_k = k - 1, k + 1
  145.           if ( (k_k >= 0) .AND. (k_k < params%Nx) ) then
  146.         idx_k     = params%Ny * params%Nx * i_k + params%Nx * j_k + k_k + 1
  147.  
  148.         if (in%density(idx_k) > params%eps) then
  149.           Uk(1:params%dimensions) = in%U(idx_k,:) / in%density(idx_k)
  150.         else
  151.           Uk                      = 0.0
  152.         endif
  153.         sqrU_i    = dot_product(Uk, Uk)
  154.         tmp       = 2.0 * ( (params%gamma - 1.0) * (in%energy(idx_k) - (in%density(idx_k) * sqrU_i) / 2.0))
  155.         if (tmp > params%eps) then
  156.           H_k     = sqrt(tmp)
  157.         else
  158.           H_k     = 0.0
  159.         endif
  160.         tmpW(1:params%dimensions) = params%nablaW0(params%WNx * params%WNy * (i_k - i + 1) + params%WNx * (j_k - j + 1) + k_k - k + 2,:)
  161.  
  162.         if (H_i > params%eps) then
  163.           i_impulse = i_impulse - tmpW * H_k - tmpW * i_density / H_i * ftheta_k(idx_k)
  164.           i_energy  = i_energy - H_k * dot_product(Ui + Uk, tmpW) / 2.0 - ftheta_k(idx_k) * dot_product(in%U(idx,:), tmpW) / H_i
  165.         else
  166.           i_impulse = i_impulse - tmpW * H_k
  167.           i_energy  = i_energy - H_k * dot_product(Ui + Uk, tmpW) / 2.0
  168.         endif
  169.  
  170.           endif
  171.         enddo
  172.       endif
  173.     enddo
  174.       endif
  175.     enddo
  176.  
  177.  
  178.     !calculate predictor forces:
  179.     i_impulse          = H_i * i_impulse
  180.     i_energy           = H_i * i_energy
  181.     outforce%density(idx) = 0.0
  182.     if (i_density > params%eps) then
  183. !print *, "i_impulse, i_energy", i_impulse, i_energy
  184.       outforce%U(idx,:)     = i_impulse(1:params%dimensions)
  185.       outforce%energy(idx)  = i_energy
  186.     else
  187.       outforce%U(idx,:)     = 0.0
  188.       outforce%energy(idx)  = 0.0
  189.     endif
  190.  
  191.   end subroutine
  192.  
  193.   attributes(device) function ftheta_k(indx)
  194.     real(kind=prk) :: ftheta_k, coord
  195.     integer :: indx
  196.  
  197.     coord = params%smoothing_length / 2.0 + real(indx, kind=prk) * params%smoothing_length
  198.  
  199.     if (indx < params%totalcells/3 .AND. indx > params%totalcells/6) then
  200. !      ftheta_k = -4.0 / ( Exp(2.0 * coord) + 2.0 + Exp(-2.0 * coord) )
  201.       ftheta_k = params%ta * coord ** 2 + params%tb * coord + params%tc
  202.     else
  203.       ftheta_k = 0.0
  204.     endif
  205.       ftheta_k = 0.0
  206.   end function ftheta_k
  207.  
  208.   attributes(global) subroutine cuda_calculate_dt(in, outdt)
  209.     type(Fluids), device :: in
  210.     real(kind=prk), dimension(:), device :: outdt
  211.     real(kind=prk) :: Cs
  212.     integer :: idx, i, j, k
  213.  
  214.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  215.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  216.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  217.  
  218.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  219.     outdt(idx) = 10000000.0
  220.     if (in%density(idx) > params%eps) then
  221.       Cs = params%gamma * (params%gamma - 1.0) * (in%energy(idx) - in%density(idx)*dot_product(in%U(idx,:) / in%density(idx),in%U(idx,:) / in%density(idx)) / 2.0) / in%density(idx)
  222.       Cs = (abs(Cs > params%eps) * sqrt(abs(Cs)))
  223.       outdt(idx) = params%courant * min(params%smoothing_length / (2.0 * maxval(abs(in%U(idx,:) / in%density(idx)))), params%smoothing_length / (maxval(abs(in%U(idx,:) / in%density(idx))) + Cs))
  224.     endif
  225.  
  226.   end subroutine cuda_calculate_dt
  227.  
  228.   attributes(global) subroutine cuda_calculate_predictor_sph(in, inforce, outhalf, tau)
  229.     integer idx, i, j, k
  230.     !temporary 'fluid' variables:
  231.     real(kind=prk), device :: tau
  232.     type(Fluids), device :: in, inforce, outhalf
  233.     !formula 16:
  234.  
  235.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  236.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  237.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  238.  
  239.     !calculate H_i:
  240.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  241.     !calculate predictor based on calculated forces inb4:
  242.  
  243.     outhalf%density(idx) = in%density(idx) + (tau / 2.0) * inforce%density(idx)
  244.     if (in%density(idx) > params%eps) then
  245.       outhalf%U(idx,:)     = in%U(idx,:) + tau * inforce%U(idx,:) / 2.0
  246.       outhalf%energy(idx)  = in%energy(idx) + tau * inforce%energy(idx) / 2.0
  247.       outhalf%r(idx,:)     = in%r(idx,:) + tau * ( in%U(idx,:) / in%density(idx) + outhalf%U(idx,:)  / outhalf%density(idx) ) / 4.0
  248.     else
  249.       outhalf%U(idx,:)     = 0.0
  250.       outhalf%energy(idx)  = 0.0
  251.       outhalf%r(idx,:)     = in%r(idx,:)
  252.     endif
  253.  
  254.   end subroutine cuda_calculate_predictor_sph
  255.  
  256.   attributes(global) subroutine cuda_calculate_forces_sph_corrector(in, inhalf, outnext)
  257.     integer idx, idx_k, i, j, k, i_k, j_k, k_k
  258.     !temporary variables for speed-up:
  259.     real(kind=prk) H_i, H_k, sqrU_i
  260.     !temporary 'fluid' variables:
  261.     real(kind=prk) i_density, i_energy, theta_k, tmp
  262.     real(kind=prk) :: Ui(dimen), ri(dimen), i_impulse(dimen)
  263.     real(kind=prk) :: Uk(dimen), rk(dimen), Wr(dimen), tmpW(dimen)
  264.     type(Fluids), device :: in, inhalf, outnext
  265.     !formula 16:
  266.  
  267.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  268.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  269.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  270.  
  271.     !calculate H_i:
  272.     sqrU_i = 0.0
  273.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  274.     !calculate Fi:
  275.     !get fluid_i characteristics (save in temp):
  276.     if (inhalf%density(idx) > params%eps) then
  277.       Ui(1:params%dimensions) = inhalf%U(idx,:) / inhalf%density(idx)
  278.     else
  279.       Ui = 0.0
  280.     endif
  281.     ri(1:params%dimensions) = inhalf%r(idx,:)
  282.     i_density = inhalf%density(idx)
  283.     i_energy  = inhalf%energy(idx)
  284.     i_impulse = 0.0
  285.  
  286.     !calculate H_i
  287.     sqrU_i  = dot_product(Ui, Ui)
  288.     tmp     = 2.0 * ( (params%gamma - 1.0) * (i_energy - (i_density * sqrU_i) / 2.0) )
  289.     if (tmp > params%eps) then
  290.       H_i     = sqrt(tmp)
  291.     else
  292.       H_i     = 0.0
  293.     endif
  294.  
  295.     !calculate sum(W_{ik}) and H_k and sum(Ui+Uk):
  296.     H_k     = 0.0
  297.     theta_k = 0.0
  298.     i_energy= 0.0
  299.  
  300.     do i_k = i - 1, i + 1
  301.       if ( (i_k >= 0) .AND. (i_k < params%Nz) ) then
  302.     do j_k = j - 1, j + 1
  303.       if ( (j_k >= 0) .AND. (j_k < params%Ny) ) then
  304.         do k_k = k - 1, k + 1
  305.           if ( (k_k >= 0) .AND. (k_k < params%Nx) ) then
  306.         idx_k     = params%Ny * params%Nx * i_k + params%Nx * j_k + k_k + 1
  307.         if (inhalf%density(idx_k) > params%eps) then
  308.           Uk(1:params%dimensions) = inhalf%U(idx_k,:) / inhalf%density(idx_k)
  309.         else
  310.           Uk                      = 0.0
  311.         endif
  312.         sqrU_i    = dot_product(Uk, Uk)
  313.         tmp       = 2.0 * ( (params%gamma - 1.0) * (inhalf%energy(idx_k) - (inhalf%density(idx_k) * sqrU_i) / 2.0) )
  314.         if (tmp > params%eps) then
  315.           H_k     = sqrt(tmp)
  316.         else
  317.           H_k     = 0.0
  318.         endif
  319.         rk(1:params%dimensions) = inhalf%r(idx_k,:)
  320.         call calculateSmoothingGradientW(ri(1:params%dimensions), rk(1:params%dimensions), Wr(1:params%dimensions))
  321.         tmpW      = params%nablaW0(params%WNx * params%WNy * (i_k - i + 1) + params%WNx * (j_k - j + 1) + k_k - k + 2,:)
  322.  
  323.         if (H_i > params%eps) then
  324.           i_impulse(1:params%dimensions) = i_impulse(1:params%dimensions) - Wr(1:params%dimensions) * H_k - tmpW * i_density / H_i * ftheta_k(idx_k)
  325.           i_energy  = i_energy - H_k * dot_product(Ui + Uk, Wr) / 2.0 - ftheta_k(idx_k) * dot_product(inhalf%U(idx,:), tmpW) / H_i
  326.         else
  327.           i_impulse(1:params%dimensions) = i_impulse(1:params%dimensions) - Wr(1:params%dimensions) * H_k
  328.           i_energy  = i_energy - H_k * dot_product(Ui + Uk, Wr) / 2.0
  329.         endif
  330.  
  331.           endif
  332.         enddo
  333.       endif
  334.     enddo
  335.       endif
  336.     enddo
  337.  
  338.     !calculate corrector forces:
  339.     i_impulse(1:params%dimensions) = H_i * i_impulse(1:params%dimensions)
  340.     i_energy                       = H_i * i_energy
  341.  
  342. !    call syncthreads()
  343.  
  344.     outnext%density(idx)   = in%density(idx) + 0.0 ! + 0.0 * (tau / 2.0)
  345.     if (i_density > params%eps) then
  346.       outnext%U(idx,:)     = in%U(idx,:) + tau * i_impulse(1:params%dimensions)
  347.       outnext%energy(idx)  = in%energy(idx) + tau * i_energy
  348.       outnext%r(idx,:)     = in%r(idx,:) + tau * ( in%U(idx,:) / i_density + outnext%U(idx,:) / i_density ) / 2.0
  349.     else
  350.       outnext%U(idx,:)     = 0.0
  351.       outnext%energy(idx)  = 0.0
  352.       outnext%r(idx,:)     = in%r(idx,:)
  353.     endif
  354.  
  355.  
  356. !calculate q(n+1/2):
  357.  
  358. !     inhalf%density(idx) = ( in%density(idx) + outnext%density(idx) ) / 2.0
  359. !     inhalf%energy(idx)  = ( in%energy(idx) + outnext%energy(idx) ) / 2.0
  360. !     inhalf%r(idx,:)     = ( in%r(idx,:) + outnext%r(idx,:) ) / 2.0
  361. !     inhalf%U(idx,:)     = ( in%U(idx,:) + outnext%U(idx,:) ) / 2.0
  362.  
  363.   end subroutine cuda_calculate_forces_sph_corrector
  364.  
  365.   attributes(global) subroutine cuda_calculate_fluidhalf(in, innext, outhalf)
  366.     type(Fluids), device :: in, outhalf, innext
  367.     integer :: i, j, k, idx
  368. !calculate q(n+1/2):
  369.  
  370.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  371.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  372.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  373.  
  374.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  375.     outhalf%density(idx) = ( in%density(idx) + innext%density(idx) ) / 2.0
  376.     outhalf%energy(idx)  = ( in%energy(idx) + innext%energy(idx) ) / 2.0
  377.     outhalf%r(idx,:)     = ( in%r(idx,:) + innext%r(idx,:) ) / 2.0
  378.     outhalf%U(idx,:)     = ( in%U(idx,:) + innext%U(idx,:) ) / 2.0
  379.  
  380.   end subroutine cuda_calculate_fluidhalf
  381.  
  382.   attributes(global) subroutine cuda_calculate_flux_tvd(in, inhalf, outq0, outfluxes, outflags)
  383.     integer idx, idx_k, i, j, k, i_k, j_k, k_k, dims, dim2dim, dim3dim
  384.     integer, dimension(:) :: offset(3), ijk(3)
  385.     logical, dimension(:), device :: outflags
  386.     type(Fluids), device :: in, inhalf, outq0 !in = n+1 from corrector !outq0 = n
  387.     type(Fluids), device :: outfluxes(dimen)
  388.     type(Fluidstemp) :: neighbrs(3 + (dimen - 1) * 2), fluidhalf(4), ql, qr, Fl, Fr
  389.     real(kind=prk) :: dr1, dr2, dr3, L, pl, Cl
  390.     real(kind=prk) :: Vrl(dimen), Vrr(dimen)
  391.     real(kind=prk) :: aa
  392. !    print *, "calculate_flux_tvd"
  393. !    neighbours = 3 + (params%dimensions - 1) * 2
  394. !    print *, "calculate_flux_tvd debug"
  395.     neighbrs%density  = 0.0
  396.     neighbrs%energy   = 0.0
  397.     fluidhalf%density = 0.0
  398.     fluidhalf%energy  = 0.0
  399.  
  400.     do i = 1, params%dimensions
  401.       neighbrs(:)%r(i)  = 0.0
  402.       neighbrs(:)%U(i)  = 0.0
  403.       fluidhalf(:)%r(i) = 0.0
  404.       fluidhalf(:)%U(i) = 0.0
  405.     enddo
  406.     offset(1)         = 1
  407.     offset(2)         = params%Nx
  408.     offset(3)         = params%Nx * params%Ny
  409.     dim2dim           = abs(params%dimensions == 1) * 1
  410.     dim3dim           = abs(params%dimensions <  3) * 1
  411.  
  412.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  413.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  414.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  415.  
  416.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  417.  
  418.  do dims = 1, params%dimensions
  419.           outfluxes(dims)%density(idx) = 0.0
  420.           outfluxes(dims)%energy(idx)  = 0.0
  421.           outfluxes(dims)%U(idx,:)     = 0.0
  422.  enddo
  423.  
  424.     !for each particle do:
  425.     if ( (i >= 2 - params%dim3d) .AND. (i <= params%Nz-2 + dim3dim) ) then
  426.       if ( (j >= 2 - params%dim2d) .AND. (j <= params%Ny-2 + dim2dim) ) then
  427.     if ( (k >= 2) .AND. (k <= params%Nx-2) ) then
  428.  
  429.       ijk(1) = k; ijk(2) = j; ijk(3) = i;
  430.  
  431.       !get center cell (we will calculate fluxes relatively this cell):
  432. !    print *, "idx, ", idx
  433. !    print *, "ijk, ", ijk
  434.  
  435.       !take all neighbours (1d: 3, 2d: 5(cross), 3d: 7(3d-cross)):
  436.       if ( (k - 1 >= 0) .AND. (k - 1 < params%Nx) ) then
  437.         neighbrs(1)%density = in%density(idx-1)
  438.         neighbrs(1)%energy  = in%energy(idx-1)
  439.         neighbrs(1)%r(:)    = in%r(idx-1,:)
  440.         neighbrs(1)%U(:)    = in%U(idx-1,:)
  441.       endif
  442.       neighbrs(2)%density = in%density(idx)
  443.       neighbrs(2)%energy  = in%energy(idx)
  444.       neighbrs(2)%r(:)     = in%r(idx,:)
  445.       neighbrs(2)%U(:)     = in%U(idx,:)
  446.       if ( (k + 1 >= 0) .AND. (k + 1 < params%Nx) ) then
  447.         neighbrs(3)%density = in%density(idx+1)
  448.         neighbrs(3)%energy  = in%energy(idx+1)
  449.         neighbrs(3)%r(:)     = in%r(idx+1,:)
  450.         neighbrs(3)%U(:)     = in%U(idx+1,:)
  451.       endif
  452.  
  453.       !add neighbours if dim > 1:
  454.       if ( params%dimensions > 1 ) then
  455.         if ( (j - 1 >= 0) .AND. (j - 1 < params%Ny) ) then
  456.           idx_k = idx - params%Nx !params%Ny * params%Nx * i_k + params%Nx * j_k + k_k + 1
  457.           neighbrs(4)%density = in%density(idx_k)
  458.           neighbrs(4)%energy  = in%energy(idx_k)
  459.           neighbrs(4)%r(:)     = in%r(idx_k,:)
  460.           neighbrs(4)%U(:)     = in%U(idx_k,:)
  461.         endif
  462.         if ( (j + 1 >= 0) .AND. (j + 1 < params%Ny) ) then
  463.           idx_k = idx + params%Nx
  464.           neighbrs(5)%density = in%density(idx_k)
  465.           neighbrs(5)%energy  = in%energy(idx_k)
  466.           neighbrs(5)%r(:)     = in%r(idx_k,:)
  467.           neighbrs(5)%U(:)     = in%U(idx_k,:)
  468.         endif
  469.       endif
  470.  
  471.       !add neighbours if dim > 2:
  472.       if ( params%dimensions > 2 ) then
  473.         if ( (i - 1 >= 0) .AND. (i - 1 < params%Nz) ) then
  474.           idx_k = idx - params%Nx * params%Ny !params%Ny * params%Nx * i_k + params%Nx * j_k + k_k + 1
  475.           neighbrs(6)%density = in%density(idx_k)
  476.           neighbrs(6)%energy  = in%energy(idx_k)
  477.           neighbrs(6)%r(:)     = in%r(idx_k,:)
  478.           neighbrs(6)%U(:)     = in%U(idx_k,:)
  479.         endif
  480.         if ( (i + 1 >= 0) .AND. (i + 1 < params%Nz) ) then
  481.           idx_k = idx + params%Nx * params%Ny
  482.           neighbrs(7)%density = in%density(idx_k)
  483.           neighbrs(7)%energy  = in%energy(idx_k)
  484.           neighbrs(7)%r(:)     = in%r(idx_k,:)
  485.           neighbrs(7)%U(:)     = in%U(idx_k,:)
  486.         endif
  487.       endif
  488.       !Okay, now lets decide if we need to calculate fluxes (check density):
  489. !take components based on current dimension:
  490.       if (maxval(neighbrs%density) > params%eps) then
  491.         outflags(idx) = .TRUE.
  492.         do dims = 1, params%dimensions
  493.  
  494.           do i_k = 0, 3
  495.         fluidhalf(i_k+1)%density = inhalf%density(idx + offset(dims) * (i_k - 2))
  496.         fluidhalf(i_k+1)%energy  = inhalf%energy(idx + offset(dims) * (i_k - 2))
  497.         fluidhalf(i_k+1)%U(:)    = inhalf%U(idx + offset(dims) * (i_k - 2),:)
  498.         fluidhalf(i_k+1)%r(:)    = inhalf%r(idx + offset(dims) * (i_k - 2),:)
  499.           enddo
  500.  
  501.   !calc ql:
  502.  
  503.           dr1 = fluidhalf(2)%r(dims) - fluidhalf(1)%r(dims)
  504.           dr2 = fluidhalf(3)%r(dims) - fluidhalf(2)%r(dims)
  505.           dr3 = fluidhalf(4)%r(dims) - fluidhalf(3)%r(dims)
  506.  
  507.           if (fluidhalf(2)%density > params%eps) then
  508.         drx = (params%smoothing_length / 2.0 - (fluidhalf(2)%r(dims) - (params%dx/2 + (ijk(dims) - 1) * params%dx)) / 2.0)
  509.  
  510.         call minmod((fluidhalf(3)%density - fluidhalf(2)%density) / dr2, (fluidhalf(2)%density - fluidhalf(1)%density) / dr1, L)
  511.         ql%density = fluidhalf(2)%density + drx * L
  512.  
  513.         call minmod((fluidhalf(3)%energy - fluidhalf(2)%energy) / dr2, (fluidhalf(2)%energy - fluidhalf(1)%energy) / dr1, L)
  514.         ql%energy = fluidhalf(2)%energy + drx * L
  515.         do j_k = 1, params%dimensions
  516.           call minmod((fluidhalf(3)%U(j_k) - fluidhalf(2)%U(j_k)) / dr2, (fluidhalf(2)%U(j_k) - fluidhalf(1)%U(j_k)) / dr1, L)
  517.           ql%U(j_k) = fluidhalf(2)%U(j_k) + drx * L
  518.         enddo
  519.  
  520. !       Vrl = ql%U(1,:) !Vrl(1) = Vxl, Vrl(2) = Vyl, Vrl(3) = Vzl
  521.         if (ql%density > params%eps) then
  522.           Vrl = ql%U(:) / ql%density
  523.           pl  = abs((params%gamma - 1.0) * (ql%energy - ql%density*dot_product(Vrl,Vrl) / 2.0) > params%eps) * (params%gamma - 1.0) * (ql%energy - ql%density*dot_product(Vrl,Vrl) / 2.0)
  524.           Cl  = sqrt(params%gamma * pl / ql%density)
  525.         else
  526.           Vrl = 0.0
  527.           pl  = abs((params%gamma - 1.0) * (ql%energy - ql%density*dot_product(Vrl,Vrl) / 2.0) > params%eps) * (params%gamma - 1.0) * (ql%energy - ql%density*dot_product(Vrl,Vrl) / 2.0)
  528.           Cl  = 0.0
  529.         endif
  530.         Fl%density = ql%U(dims)
  531.         Fl%U(:)    = ql%U(:) * Vrl(dims)
  532.         Fl%energy  = ql%energy * Vrl(dims)
  533.           else
  534.         ql%density = 0.0
  535.         ql%energy  = 0.0
  536.         ql%U(:)    = 0.0
  537.         Fl%density = 0.0
  538.         Fl%energy  = 0.0
  539.         Fl%U(:)    = 0.0
  540.         Cl         = 0.0
  541.         Vrl        = 0.0
  542.           endif
  543. !    print *, "t_debug4"
  544.  
  545.   !calc qr:
  546.           if (fluidhalf(3)%density > params%eps) then
  547.  
  548.         drx = (params%smoothing_length / 2.0 + (fluidhalf(3)%r(dims) - (params%dx / 2.0 + ijk(dims) * params%dx)) / 2.0)
  549.  
  550.         call minmod((fluidhalf(3)%density - fluidhalf(2)%density) / dr2, (fluidhalf(4)%density - fluidhalf(3)%density) / dr3, L)
  551.         qr%density = fluidhalf(3)%density - drx * L
  552.         call minmod((fluidhalf(3)%energy - fluidhalf(2)%energy) / dr2, (fluidhalf(4)%energy - fluidhalf(3)%energy) / dr3, L)
  553.         qr%energy = fluidhalf(3)%energy - drx * L
  554.  
  555.         do j_k = 1, params%dimensions
  556.           call minmod((fluidhalf(3)%U(j_k) - fluidhalf(2)%U(j_k)) / dr2, (fluidhalf(4)%U(j_k) - fluidhalf(3)%U(j_k)) / dr3, L)
  557.           qr%U(j_k) = fluidhalf(3)%U(j_k) - drx * L
  558.         enddo
  559.  
  560. !       Vrr = qr%U(1,:) !Vrr(1) = Vxr, Vrr(2) = Vyr, Vrr(3) = Vzr
  561.         if (qr%density > params%eps) then
  562.           Vrr = qr%U(:) / qr%density
  563.           pr  = abs((params%gamma - 1.0) * (qr%energy - qr%density*dot_product(Vrr,Vrr) / 2.0) > params%eps) * (params%gamma - 1.0) * (qr%energy - qr%density*dot_product(Vrr,Vrr) / 2.0)
  564.           Cr  = sqrt(params%gamma * pr / qr%density)
  565.         else
  566.           Cr  = 0.0
  567.           Vrr = 0.0
  568.           pr  = abs((params%gamma - 1.0) * (qr%energy - qr%density*dot_product(Vrr,Vrr) / 2.0) > params%eps) * (params%gamma - 1.0) * (qr%energy - qr%density*dot_product(Vrr,Vrr) / 2.0)
  569.         endif
  570.  
  571.         Fr%density = qr%U(dims)
  572.         Fr%U(:)    = qr%U(:) * Vrr(dims)
  573.         Fr%energy  = qr%energy * Vrr(dims)
  574.           else
  575.         qr%density = 0.0
  576.         qr%energy  = 0.0
  577.         qr%U(:)    = 0.0
  578.         Fr%density = 0.0
  579.         Fr%energy  = 0.0
  580.         Fr%U(:)    = 0.0
  581.         Cr         = 0.0
  582.         Vrr        = 0.0
  583.           endif
  584.   !calc Fx:
  585.  
  586.           if ((Cl > params%eps) .OR. (Cr > params%eps)) then
  587.         Ll = min(Vrl(dims) - Cl, Vrr(dims) - Cr)
  588.         Lr = max(Vrr(dims) + Cr, Vrl(dims) + Cl)
  589.         bb = max(abs(Ll), abs(Lr))
  590.         outfluxes(dims)%density(idx) = ( Fr%density + Fl%density + bb * (ql%density - qr%density) ) / 2.0
  591.         outfluxes(dims)%energy(idx)  = ( Fr%energy  + Fl%energy  + bb * (ql%energy  - qr%energy)  ) / 2.0
  592.         outfluxes(dims)%U(idx,:)     = ( Fr%U(:)    + Fl%U(:)    + bb * (ql%U(:)    - qr%U(:))    ) / 2.0
  593.           else
  594.         outfluxes(dims)%density(idx) = 0.0
  595.         outfluxes(dims)%energy(idx)  = 0.0
  596.         outfluxes(dims)%U(idx,:)     = 0.0
  597.           endif
  598.  
  599.         enddo
  600.       else
  601.         outflags(idx) = .FALSE.
  602.         do dims = 1, params%dimensions
  603.           outfluxes(dims)%density(idx) = 0.0
  604.           outfluxes(dims)%energy(idx)  = 0.0
  605.           outfluxes(dims)%U(idx,:)     = 0.0
  606.         enddo
  607.       endif
  608. !       outfluxes(:)%density(idx) = 0.0
  609.  
  610.     endif
  611.       endif
  612.     endif
  613.  
  614.   end subroutine cuda_calculate_flux_tvd
  615.  
  616.   attributes(global) subroutine cuda_finalize(outq0, in, influxes, inflags)
  617.     type(Fluids), device :: in, outq0 !in = n+1 from corrector !outq0 = n
  618.     type(Fluids), device :: influxes(dimen)
  619.     type(Fluidstemp) :: fluidhalf(dimen)
  620.     integer, dimension(:) :: offset(3)
  621.     logical, dimension(:), device :: inflags
  622.     integer :: dims, i, j, k, idx
  623.  
  624.     offset(1)         = 1
  625.     offset(2)         = params%Nx
  626.     offset(3)         = params%Nx * params%Ny
  627.  
  628.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  629.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  630.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  631.  
  632.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  633.  
  634.     if ( (i >= 2 - params%dim3d) .AND. (i <= params%Nz-3 + params%dim3d) ) then
  635.       if ( (j >= 2 - params%dim2d) .AND. (j <= params%Ny-3 + params%dim2d) ) then
  636.     if ( (k >= 2) .AND. (k <= params%Nx-3) ) then
  637.  
  638.       if (inflags(idx)) then
  639. !calculate all neighbour fluxes (idx + offset(dims)):
  640.         do dims = 1, params%dimensions
  641.           fluidhalf(dims)%density = influxes(dims)%density(idx + offset(dims))
  642.           fluidhalf(dims)%energy  = influxes(dims)%energy(idx + offset(dims))
  643.           fluidhalf(dims)%U(:)    = influxes(dims)%U(idx + offset(dims),:)
  644.         enddo
  645. !calculate final values:
  646.         outq0%density(idx) = in%density(idx) + tau / params%smoothing_length * (sum(influxes(:)%density(idx)) - sum(fluidhalf(1:params%dimensions)%density))
  647.         outq0%energy(idx)  = in%energy(idx)  + tau / params%smoothing_length * (sum(influxes(:)%energy(idx))  - sum(fluidhalf(1:params%dimensions)%energy))
  648.         do dims = 1, params%dimensions
  649.           outq0%U(idx,dims)  = in%U(idx,dims) + tau / params%smoothing_length * (sum(influxes(:)%U(idx,dims)) - sum(fluidhalf(1:params%dimensions)%U(dims)))
  650.         enddo
  651.  
  652.       else
  653.         outq0%density(idx) = in%density(idx)
  654.         outq0%energy(idx)  = in%energy(idx)
  655.         outq0%U(idx,:)     = in%U(idx,:)
  656.       endif
  657.  
  658.     endif
  659.       endif
  660.     endif
  661.   end subroutine cuda_finalize
  662.  
  663.   attributes(device) subroutine minmod(a, b, out)
  664.     real(kind=prk), device ::  a, b, out
  665.     out = ( 1.0 / 2.0) * (sign(1.0, a) + sign(1.0, b)) * min(abs(a), abs(b))
  666.   end subroutine minmod
  667.  
  668.   subroutine cuda_setupSystem(inparams)
  669.  
  670.   end subroutine
  671.  
  672.   attributes(global) subroutine cuda_boundary_conditions(ptr)
  673.     integer j, k, idx, idx_j, idx_k
  674.     type(Fluids), device :: ptr
  675.     integer ti, tj, tk, tidx
  676.     ti   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  677.     tj   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  678.     tk   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  679.  
  680. !    idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  681.  
  682.     if (params%dimensions == 1) then
  683.       ptr%density(1) = ptr%density(3)
  684.       ptr%density(2) = ptr%density(1)
  685.       ptr%density(params%Nx) = ptr%density(params%Nx - 2)
  686.       ptr%density(params%Nx-1) = ptr%density(params%Nx)
  687.       ptr%energy(1) = ptr%energy(3)
  688.       ptr%energy(2) = ptr%energy(1)
  689.       ptr%energy(params%Nx) = ptr%energy(params%Nx - 2)
  690.       ptr%energy(params%Nx-1) = ptr%energy(params%Nx)
  691.       ptr%U(1,1) = ptr%U(3,1)
  692.       ptr%U(2,1) = ptr%U(1,1)
  693.       ptr%U(params%Nx,1) = ptr%U(params%Nx - 2,1)
  694.       ptr%U(params%Nx-1,1) = ptr%U(params%Nx,1)
  695.     endif
  696.  
  697.     if (params%dimensions == 2) then
  698. ! copy x:
  699.       if ( (k >= 0) .AND. (k <= params%Nx-1) ) then
  700. !      do k = 0, params%Nx-1
  701.     idx   = k + 1
  702.     idx_j = params%Nx + idx
  703.     idx_k = 2 * params%Nx + idx
  704.     ptr%density(idx) = ptr%density(idx_k)
  705.     ptr%density(idx_j) = ptr%density(idx)
  706.     ptr%density((params%Ny - 1) * params%Nx + idx) = ptr%density((params%Ny - 3) * params%Nx + idx)
  707.     ptr%density((params%Ny - 2) * params%Nx + idx) = ptr%density((params%Ny - 1) * params%Nx + idx)
  708.     ptr%energy(idx) = ptr%energy(idx_k)
  709.     ptr%energy(idx_j) = ptr%energy(idx)
  710.     ptr%energy((params%Ny - 1) * params%Nx + idx) = ptr%energy((params%Ny - 3) * params%Nx + idx)
  711.     ptr%energy((params%Ny - 2) * params%Nx + idx) = ptr%energy((params%Ny - 1) * params%Nx + idx)
  712.     ptr%U(idx,1) = ptr%U(idx_k,1)
  713.     ptr%U(idx_j,1) = ptr%U(idx,1)
  714.     ptr%U(idx,2) = ptr%U(idx_k,2)
  715.     ptr%U(idx_j,2) = ptr%U(idx,2)
  716.     ptr%U((params%Ny - 1) * params%Nx + idx,1) = ptr%U((params%Ny - 3) * params%Nx + idx,1)
  717.     ptr%U((params%Ny - 2) * params%Nx + idx,1) = ptr%U((params%Ny - 1) * params%Nx + idx,1)
  718.     ptr%U((params%Ny - 1) * params%Nx + idx,2) = ptr%U((params%Ny - 3) * params%Nx + idx,2)
  719.     ptr%U((params%Ny - 2) * params%Nx + idx,2) = ptr%U((params%Ny - 1) * params%Nx + idx,2)
  720.       endif
  721. ! copy y:
  722.       if ( (j >= 0) .AND. (j <= params%Ny-1) ) then
  723. !      do k = 0, params%Ny-1
  724.     idx   = params%Nx * j + 1
  725.     idx_j = idx + 1
  726.     idx_k = idx + 2
  727.     ptr%density(idx) = ptr%density(idx_k)
  728.     ptr%density(idx_j) = ptr%density(idx)
  729.     ptr%density(params%Nx + idx - 1) = ptr%density(params%Nx + idx - 3)
  730.     ptr%density(params%Nx + idx - 2) = ptr%density(params%Nx + idx - 1)
  731.     ptr%energy(idx) = ptr%energy(idx_k)
  732.     ptr%energy(idx_j) = ptr%energy(idx)
  733.     ptr%energy(params%Nx + idx - 1) = ptr%energy(params%Nx + idx - 3)
  734.     ptr%energy(params%Nx + idx - 2) = ptr%energy(params%Nx + idx - 1)
  735.     ptr%U(idx,1) = ptr%U(idx_k,1)
  736.     ptr%U(idx_j,1) = ptr%U(idx,1)
  737.     ptr%U(idx,2) = ptr%U(idx_k,2)
  738.     ptr%U(idx_j,2) = ptr%U(idx,2)
  739.     ptr%U(params%Nx + idx - 1,1) = ptr%U(params%Nx + idx - 3,1)
  740.     ptr%U(params%Nx + idx - 2,1) = ptr%U(params%Nx + idx - 1,1)
  741.     ptr%U(params%Nx + idx - 1,2) = ptr%U(params%Nx + idx - 3,2)
  742.     ptr%U(params%Nx + idx - 2,2) = ptr%U(params%Nx + idx - 1,2)
  743.       endif
  744.     endif
  745.  
  746.     if (params%dimensions == 3) then
  747. !copy (for [y x z])
  748.  
  749.       if ( (j >= 0) .AND. (j <= params%Nz-1) ) then
  750.     if ( (k >= 0) .AND. (k <= params%Ny-1) ) then
  751.       idx   = params%Nx * params%Ny * j + params%Nx * k + 1
  752.       idx_j = idx + 1
  753.       idx_k = idx + 2
  754.       ptr%density(idx) = ptr%density(idx_k)
  755.       ptr%density(idx_j) = ptr%density(idx)
  756.       ptr%density(params%Nx + idx - 1) = ptr%density(params%Nx + idx - 3)
  757.       ptr%density(params%Nx + idx - 2) = ptr%density(params%Nx + idx - 1)
  758.       ptr%energy(idx) = ptr%energy(idx_k)
  759.       ptr%energy(idx_j) = ptr%energy(idx)
  760.       ptr%energy(params%Nx + idx - 1) = ptr%energy(params%Nx + idx - 3)
  761.       ptr%energy(params%Nx + idx - 2) = ptr%energy(params%Nx + idx - 1)
  762.       ptr%U(idx,1) = ptr%U(idx_k,1)
  763.       ptr%U(idx_j,1) = ptr%U(idx,1)
  764.       ptr%U(idx,2) = ptr%U(idx_k,2)
  765.       ptr%U(idx_j,2) = ptr%U(idx,2)
  766.       ptr%U(idx,3) = ptr%U(idx_k,3)
  767.       ptr%U(idx_j,3) = ptr%U(idx,3)
  768.       ptr%U(params%Nx + idx - 1,1) = ptr%U(params%Nx + idx - 3,1)
  769.       ptr%U(params%Nx + idx - 2,1) = ptr%U(params%Nx + idx - 1,1)
  770.       ptr%U(params%Nx + idx - 1,2) = ptr%U(params%Nx + idx - 3,2)
  771.       ptr%U(params%Nx + idx - 2,2) = ptr%U(params%Nx + idx - 1,2)
  772.       ptr%U(params%Nx + idx - 1,3) = ptr%U(params%Nx + idx - 3,3)
  773.       ptr%U(params%Nx + idx - 2,3) = ptr%U(params%Nx + idx - 1,3)
  774.     endif
  775.       endif
  776.  
  777. !copy (for [x x z])
  778.       if ( (j >= 0) .AND. (j <= params%Nz-1) ) then
  779.     if ( (k >= 0) .AND. (k <= params%Nx-1) ) then
  780.       idx   = params%Nx * params%Ny * j + k + 1
  781.       idx_j = idx + params%Nx
  782.       idx_k = idx + 2 * params%Nx
  783.       ptr%density(idx) = ptr%density(idx_k)
  784.       ptr%density(idx_j) = ptr%density(idx)
  785.       ptr%density((params%Ny - 1) * params%Nx + idx) = ptr%density((params%Ny - 3) * params%Nx + idx)
  786.       ptr%density((params%Ny - 2) * params%Nx + idx) = ptr%density((params%Ny - 1) * params%Nx + idx)
  787.       ptr%energy(idx) = ptr%energy(idx_k)
  788.       ptr%energy(idx_j) = ptr%energy(idx)
  789.       ptr%energy((params%Ny - 1) * params%Nx + idx) = ptr%energy((params%Ny - 3) * params%Nx + idx)
  790.       ptr%energy((params%Ny - 2) * params%Nx + idx) = ptr%energy((params%Ny - 1) * params%Nx + idx)
  791.       ptr%U(idx,1) = ptr%U(idx_k,1)
  792.       ptr%U(idx_j,1) = ptr%U(idx,1)
  793.       ptr%U(idx,2) = ptr%U(idx_k,2)
  794.       ptr%U(idx_j,2) = ptr%U(idx,2)
  795.       ptr%U(idx,3) = ptr%U(idx_k,3)
  796.       ptr%U(idx_j,3) = ptr%U(idx,3)
  797.       ptr%U((params%Ny - 1) * params%Nx + idx,1) = ptr%U((params%Ny - 3) * params%Nx + idx,1)
  798.       ptr%U((params%Ny - 2) * params%Nx + idx,1) = ptr%U((params%Ny - 1) * params%Nx + idx,1)
  799.       ptr%U((params%Ny - 1) * params%Nx + idx,2) = ptr%U((params%Ny - 3) * params%Nx + idx,2)
  800.       ptr%U((params%Ny - 2) * params%Nx + idx,2) = ptr%U((params%Ny - 1) * params%Nx + idx,2)
  801.       ptr%U((params%Ny - 1) * params%Nx + idx,3) = ptr%U((params%Ny - 3) * params%Nx + idx,3)
  802.       ptr%U((params%Ny - 2) * params%Nx + idx,3) = ptr%U((params%Ny - 1) * params%Nx + idx,3)
  803.     endif
  804.       endif
  805.  
  806. !copy (for [x x y])
  807.       if ( (j >= 0) .AND. (j <= params%Ny-1) ) then
  808.     if ( (k >= 0) .AND. (k <= params%Nx-1) ) then
  809.       idx   = params%Nx * j + k + 1
  810.       idx_j = idx + params%Nx * params%Ny
  811.       idx_k = idx + params%Nx * params%Ny * 2
  812.       ptr%density(idx) = ptr%density(idx_k)
  813.       ptr%density(idx_j) = ptr%density(idx)
  814.       ptr%density(params%Nx * params%Ny * (params%Nz - 1) + idx) = ptr%density(params%Nx * params%Ny * (params%Nz - 3) + idx)
  815.       ptr%density(params%Nx * params%Ny * (params%Nz - 2) + idx) = ptr%density(params%Nx * params%Ny * (params%Nz - 1) + idx)
  816.       ptr%energy(idx) = ptr%energy(idx_k)
  817.       ptr%energy(idx_j) = ptr%energy(idx)
  818.       ptr%energy(params%Nx * params%Ny * (params%Nz - 1) + idx) = ptr%energy(params%Nx * params%Ny * (params%Nz - 3) + idx)
  819.       ptr%energy(params%Nx * params%Ny * (params%Nz - 2) + idx) = ptr%energy(params%Nx * params%Ny * (params%Nz - 1) + idx)
  820.       ptr%U(idx,1) = ptr%U(idx_k,1)
  821.       ptr%U(idx_j,1) = ptr%U(idx,1)
  822.       ptr%U(idx,2) = ptr%U(idx_k,2)
  823.       ptr%U(idx_j,2) = ptr%U(idx,2)
  824.       ptr%U(idx,3) = ptr%U(idx_k,3)
  825.       ptr%U(idx_j,3) = ptr%U(idx,3)
  826.       ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,1) = ptr%U(params%Nx * params%Ny * (params%Nz - 3) + idx,1)
  827.       ptr%U(params%Nx * params%Ny * (params%Nz - 2) + idx,1) = ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,1)
  828.       ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,2) = ptr%U(params%Nx * params%Ny * (params%Nz - 3) + idx,2)
  829.       ptr%U(params%Nx * params%Ny * (params%Nz - 2) + idx,2) = ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,2)
  830.       ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,3) = ptr%U(params%Nx * params%Ny * (params%Nz - 3) + idx,3)
  831.       ptr%U(params%Nx * params%Ny * (params%Nz - 2) + idx,3) = ptr%U(params%Nx * params%Ny * (params%Nz - 1) + idx,3)
  832.     endif
  833.       endif
  834.  
  835.     endif
  836.  
  837.   end subroutine
  838.  
  839.   attributes(global) subroutine initial_conditions(allf)
  840.     type(Fluids), device :: allf
  841.     integer :: i, j, k, dims, ijk(3), N
  842.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  843.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  844.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  845.     N   = params%Nx
  846.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  847.     ijk(1) = k
  848.     ijk(2) = j
  849.     ijk(3) = i
  850. !    print *, "idx", idx
  851. !    print *, "ijk", i, j, k
  852.  
  853.    if (idx >  (params%totalcells * 0.51) .AND. idx < (params%totalcells * 0.61)) then ! wave
  854.       allfluid%density(idx) = 11.3
  855.       allfluid%U(idx,:)     = -22.2
  856.       allfluid%energy(idx)  = 45.0
  857. !   if (idx <  (params%totalcells * 0.4)) then
  858.    else if (idx >  (params%totalcells * 0.39) .AND. idx < (params%totalcells * 0.49)) then ! wave
  859.       allfluid%density(idx) = 11.3
  860.       allfluid%U(idx,:)     = 22.2
  861.       allfluid%energy(idx)  = 45.0
  862.    else
  863.      allfluid%density(idx) = 11.3
  864.      allfluid%U(idx,:)     = 0.0
  865.      allfluid%energy(idx)  = 0.0
  866.    endif
  867.  
  868. !    else
  869. !     allfluid%density(idx) = 20.0
  870. !     allfluid%U(idx,:)     = 0.0
  871. !     allfluid%energy(idx)  = 0.0
  872. !    endif
  873.  
  874.     do dims = 1, params%dimensions
  875.       allf%r(idx,dims) = params%dx / 2.0 + ijk(dims) * params%dx
  876.     enddo
  877.   end subroutine
  878.  
  879.   subroutine calculateSmoothingGradientW0(inp)
  880.     type(Parameters) :: inp
  881.     real(kind=prk) dW, s
  882.     real(kind=prk) dr_ik(inp%dimensions)
  883.     integer i, j, k, i_max, j_max, k_max, dims, ijk(inp%dimensions)
  884.     if ( inp%dimensions == 3 ) then
  885.       i_max = 3; j_max = 3; k_max = 3;
  886.     else if ( inp%dimensions == 2 ) then
  887.       i_max = 1; j_max = 3; k_max = 3;
  888.     else
  889.       i_max = 1; j_max = 1; k_max = 3;
  890.     endif
  891.  
  892.    do i = 0, i_max-1
  893.       do j = 0, j_max-1
  894.     do k = 0, k_max-1
  895.  
  896.       if (inp%dimensions == 1) then
  897.         ijk(1) = k;
  898.       else if (inp%dimensions == 2) then
  899.         ijk(1) = j; ijk(2) = k;
  900.       else
  901.         ijk(1) = i; ijk(2) = j; ijk(3) = k;
  902.       endif
  903.  
  904.       do dims = 1, inp%dimensions
  905.         dr_ik(dims) = inp%smoothing_length*(1-ijk(dims))
  906.       enddo
  907.  
  908.       s  = dot_product(dr_ik, dr_ik)
  909.       s  = sqrt(s);
  910.       dW = gradW(s, inp);
  911.       inp%nablaW0(inp%WNx * inp%WNx * i + inp%WNx * j + k + 1, :) = dW * dr_ik
  912.  
  913.     enddo
  914.       enddo
  915.     enddo
  916.  
  917.   end subroutine
  918.  
  919.   function gradW(s, inp)
  920.     type(Parameters), intent(in) :: inp
  921.     real(kind=prk) :: s, ss, dW, gradW
  922.     ss = s / inp%smoothing_length
  923.     if ((ss > 0) .AND. (ss < 1.0)) then
  924.       dW = (-3.0 + 2.25 * ss ) * ss
  925.     else if((ss < 2.0) .AND. (ss >= 1.0)) then
  926.       dW = (2.0 - ss); dW = -0.75 * dW * dW
  927.     else
  928.       dW=0.0;
  929.     endif
  930.     if(s > inp%eps) then
  931.       gradW = inp%cnst * dW / s / inp%smoothing_length
  932.     else
  933.       gradW = 0.0
  934.     endif
  935.   end function gradW
  936.  
  937.   attributes(device) subroutine calculateSmoothingGradientW(r_i, r_k, W)
  938.     real(kind=prk) q, s
  939.     real(kind=prk), device :: r_i(dimen), r_k(dimen), W(dimen)
  940.     s = sqrt(SUM((r_i - r_k) ** 2))
  941.     q = s / params%smoothing_length
  942.     if (s > params%eps) then
  943.       !if s (0;1]
  944.       if ( (q > 0) .AND. (q <= 1.0) ) then
  945.     W = (r_i - r_k) * params%cnst * ( -3.0 + 2.25 * q ) * q / s / params%smoothing_length
  946.       !      W = params%cnst * (r_k - r_i) * (4.0 * params%smoothing_length - 3.0 * s)
  947.       !if s (1;2] :
  948.       else if ( (q > 1.0) .AND. (q <= 2.0) ) then
  949.     W = (r_i - r_k) * params%cnst * ( -0.75 * (2.0 - q) ** 2 ) / s / params%smoothing_length
  950.       !      W = params%cnst * (r_k - r_i) * ( (s - 2.0 * params%smoothing_length) ** 2 ) / s
  951.       else
  952.     W = 0.0
  953.       endif
  954.     else
  955.     W = 0.0
  956.     endif
  957.   end subroutine
  958.  
  959.  
  960.   subroutine cuda_run(t_save, t_end, t_step)
  961.     real(kind=prk) :: t_save, t_end, t_step
  962.     real(kind=prk) :: total_time, current_time, end_time, save_time
  963.     real(kind=prk) :: iter, h_tau
  964.     type(dim3) :: blocks, threads
  965.     character(len=70) :: fn, tmp
  966.     type(Fluids) :: h_all
  967.     type(Fluidstemp), device :: minz, maxz
  968.     type(Fluidstemp) :: h_minz, h_maxz
  969.     type(Parameters) :: outp
  970.  type ( cudaEvent ) :: startEvent , stopEvent
  971.  real :: time, random, tmpa
  972.  integer :: istat
  973.     integer, parameter :: outunit=44
  974.     integer :: filenum, i
  975.     logical :: flag
  976.     outp = params
  977.     if (outp%dimensions == 1) then
  978.     blocks = dim3(outp%Nx / BLOCK_SIZE, 1, 1)
  979.     threads = dim3(BLOCK_SIZE, 1, 1)
  980.     else if (outp%dimensions == 2) then
  981.     blocks = dim3(outp%Nx / BLOCK_SIZE, outp%Ny / BLOCK_SIZE, 1)
  982.     threads = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
  983.     else
  984.     blocks = dim3(outp%Nx / BLOCK_SIZE, outp%Ny / BLOCK_SIZE, outp%Nz / BLOCK_SIZE)
  985.     threads = dim3(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
  986.     endif
  987.  
  988. !print *, "run!"
  989.  
  990.     flag         = .FALSE.
  991.     filenum      = 1
  992.     current_time = 0.0
  993.     save_time    = 0.0    
  994.     iter         = 1.0
  995.     end_time     = t_end
  996.  
  997.     time_steps = 1000000.0
  998.     call initial_conditions<<<blocks, threads>>>(allfluid)
  999.  
  1000. !FTheta plot:
  1001.     h_all = allfluid
  1002.     ! build filename -- i.grd
  1003.     write(fn,fmt='(a)') 'ftheta.txt'
  1004.  
  1005.     ! open it with a fixed unit number
  1006.     open(unit=outunit,file=fn, form='formatted')
  1007.  
  1008.     ! write something
  1009.     do i = 1, outp%totalcells
  1010.       write(outunit, '(F0.7,x,F0.7)') h_all%r(i,1), host_ftheta_k(i, outp)
  1011.     enddo
  1012.     ! close it
  1013.     close(outunit)
  1014.  
  1015. !Calculate:
  1016.  
  1017.     do while (current_time < end_time)
  1018.  
  1019.     write(tmp,fmt='(a,i0,a)') '(',outp%Nx, 'F)'
  1020.  
  1021. !      do while (iter < 6)
  1022.  
  1023.       call cuda_boundary_conditions<<<blocks, threads>>>(allfluid)
  1024.  
  1025.       call cuda_calculate_dt<<<blocks, threads>>>(allfluid, time_steps)
  1026.  
  1027.       call thrustsort(time_steps,size(time_steps))
  1028.  
  1029.       istat = cudaMemcpy(tau, time_steps(1), 1, cudaMemcpyDeviceToDevice)
  1030.       h_tau = tau
  1031.  
  1032.       call cuda_calculate_forces_sph_predictor<<<blocks, threads>>>(allfluid, allforces)
  1033.  
  1034.       call cuda_calculate_predictor_sph<<<blocks, threads>>>(allfluid, allforces, allfluidhalf, tau)
  1035.  
  1036.       call cuda_boundary_conditions<<<blocks, threads>>>(allfluidhalf)
  1037.  
  1038.       call cuda_calculate_forces_sph_corrector<<<blocks, threads>>>(allfluid, allfluidhalf, allfluidnext)
  1039.  
  1040.       call cuda_boundary_conditions<<<blocks, threads>>>(allfluidnext)
  1041.  
  1042.       call cuda_calculate_fluidhalf<<<blocks, threads>>>(allfluid, allfluidnext, allfluidhalf)
  1043.  
  1044.       call cuda_calculate_flux_tvd<<<blocks, threads>>>(allfluidnext, allfluidhalf, allfluid, allfluxes, boolflags)
  1045.  
  1046. !   h_all  = allfluxes(1)
  1047. ! print *, "Debug: density sum:", sum(abs(h_all%density))
  1048. ! print *, "Debug: energy  sum:", sum(abs(h_all%energy))
  1049. ! print *, "Debug: impulse sum:", sum(abs(h_all%U))
  1050.  
  1051.       call cuda_finalize<<<blocks, threads>>>(allfluid, allfluidnext, allfluxes, boolflags)
  1052.  
  1053.       if (current_time >= save_time) then
  1054.  
  1055.     call integral_values<<<blocks,threads>>>(allfluidnext, allfluid)
  1056.  
  1057.     istat = cudaThreadSynchronize()
  1058.    
  1059.     h_tau  = tau
  1060.     h_all  = allfluidnext
  1061. print *, "density sum:", sum(abs(h_all%density))
  1062. print *, "energy  sum:", sum(abs(h_all%energy))
  1063. print *, "impulse sum:", sum(abs(h_all%U))
  1064.     print *, "current time:", current_time
  1065.     print *, "save time:", save_time
  1066.     print *, "dt:", h_tau
  1067.     print *, "iteration:", iter
  1068. random = 0.0
  1069. do i = 1, outp%totalcells
  1070.   if (h_all%density(i) > outp%eps) then
  1071.       tmpa = outp%gamma * h_all%energy(i) / h_all%density(i)
  1072.       if (tmpa > outp%eps) then
  1073.     tmpa = sqrt(tmpa)
  1074.     tmpa = sqrt(dot_product(h_all%U(i,:),h_all%U(i,:))) / tmpa
  1075.       endif
  1076.     if (tmpa > random) then
  1077.       random = tmpa
  1078.     endif
  1079.   endif
  1080. enddo
  1081. print *, 'Mach: ', random
  1082.  
  1083. !   call minmax<<<1,1>>>(allfluidnext, minz, maxz)
  1084.     istat = cudaMemcpy(time_steps, allfluidnext%density, grid**dimen, cudaMemcpyDeviceToDevice)
  1085.     call thrustsort(time_steps,size(time_steps))
  1086.     h_minz%density = time_steps(1)
  1087.     h_maxz%density = time_steps(grid**dimen)
  1088.  
  1089.     istat = cudaMemcpy(time_steps, allfluidnext%energy, grid**dimen, cudaMemcpyDeviceToDevice)
  1090.     call thrustsort(time_steps,size(time_steps))
  1091.     h_minz%energy  = time_steps(1)
  1092.     h_maxz%energy  = time_steps(grid**dimen)
  1093.  
  1094.     do i = 1, dimen
  1095.       istat = cudaMemcpy(time_steps, allfluidnext%U(:,i), grid**dimen, cudaMemcpyDeviceToDevice)
  1096.       call thrustsort(time_steps,size(time_steps))
  1097.       h_minz%U(i)    = time_steps(1)
  1098.       h_maxz%U(i)    = time_steps(grid**dimen)
  1099.     enddo
  1100.  
  1101.     h_all  = allfluidnext
  1102.  
  1103. !save data to hard drive:
  1104.     ! build filename -- i.grd
  1105.     write(fn,fmt='(a,i4,a)') '1d_cuda_rho_', filenum, '.txt'
  1106.  
  1107.     ! open it with a fixed unit number
  1108.     open(unit=outunit,file=fn, form='formatted')
  1109.  
  1110.     ! write something
  1111.     do i = 1, outp%totalcells
  1112.       write(outunit, '(F0.7,x,F0.7)') h_all%r(i,1), h_all%density(i)
  1113.     enddo
  1114.     ! close it
  1115.     close(outunit)
  1116.  
  1117.     ! build filename -- i.grd
  1118.     write(fn,fmt='(a,i4,a)') '1d_cuda_p_', filenum, '.txt'
  1119.  
  1120.     ! open it with a fixed unit number
  1121.     open(unit=outunit,file=fn, form='formatted')
  1122.  
  1123.     ! write something
  1124.     do i = 1, outp%totalcells
  1125.       write(outunit, '(F0.7,x,F0.7)') h_all%r(i,1), h_all%energy(i)
  1126.     enddo
  1127.  
  1128.     ! close it
  1129.     close(outunit)
  1130.  
  1131.     ! build filename -- i.grd
  1132.     write(fn,fmt='(a,i4,a)') '1d_cuda_Vx_', filenum, '.txt'
  1133.  
  1134.     ! open it with a fixed unit number
  1135.     open(unit=outunit,file=fn, form='formatted')
  1136.  
  1137.     ! write something
  1138.     do i = 1, outp%totalcells
  1139.       write(outunit, '(F0.7,x,F0.7)') h_all%r(i,1), h_all%U(i,1)
  1140.     enddo
  1141.     ! close it
  1142.     close(outunit)
  1143.  
  1144.  
  1145.     filenum   = filenum + 1
  1146.     save_time = save_time + t_save
  1147.  
  1148.       endif
  1149.  
  1150. !print *, 'current_time, h_tau', current_time, h_tau
  1151. !print *, 'iter', iter
  1152.  
  1153.       current_time = current_time + h_tau
  1154.  
  1155.       iter = iter + 1.0
  1156.  
  1157.     enddo
  1158.  
  1159.  
  1160.  
  1161.   end subroutine
  1162.  
  1163.   function host_ftheta_k(indx, inp)
  1164.     real(kind=prk) :: host_ftheta_k, coord
  1165.     type(Parameters) :: inp
  1166.     integer :: indx
  1167.  
  1168.     coord = inp%smoothing_length / 2.0 + real(indx, kind=prk) * inp%smoothing_length
  1169.  
  1170.     if (indx < inp%totalcells/3 .AND. indx > inp%totalcells/6) then
  1171. !      ftheta_k = -4.0 / ( Exp(2.0 * coord) + 2.0 + Exp(-2.0 * coord) )
  1172.       host_ftheta_k = inp%ta * coord ** 2 + inp%tb * coord + inp%tc
  1173.     else
  1174.       host_ftheta_k = 0.0
  1175.     endif
  1176. !      ftheta_k = 0.0
  1177.   end function host_ftheta_k
  1178.  
  1179.   attributes(global) subroutine minmax(in, outmin, outmax)
  1180.     type(Fluids), device :: in
  1181.     type(Fluidstemp), device :: outmax, outmin
  1182.     integer i
  1183.     !find min and max values
  1184.     outmin%density = minval(in%density(:))
  1185.     outmin%energy  = minval(in%energy(:))
  1186.     outmax%density = maxval(in%density(:))
  1187.     outmax%energy  = maxval(in%energy(:))
  1188.     do i = 1, params%dimensions
  1189.       outmin%U(i)   = minval(in%U(:,i))
  1190.       outmax%U(i)   = maxval(in%U(:,i))
  1191.     enddo
  1192.   end subroutine
  1193.  
  1194.   attributes(global) subroutine integral_values(out, in)
  1195.     type(Fluids), device :: out, in
  1196.     real(kind=prk) :: tmp(dimen)
  1197.     integer :: i, j, k, idx
  1198.     tmp = 0.0
  1199.  
  1200.     i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
  1201.     j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
  1202.     k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
  1203.  
  1204.     idx = params%Ny * params%Nx * i + params%Nx * j + k + 1
  1205.  
  1206. !density:
  1207.     out%density(idx) = in%density(idx)
  1208. !pressure:
  1209.     if (in%density(idx) > params%eps) then
  1210. !velocity:
  1211.       tmp(1:params%dimensions) = in%U(idx,:) / in%density(idx)
  1212.       out%U(idx,:) = tmp(1:params%dimensions)
  1213.     else
  1214.       tmp = 0.0
  1215.       out%U(idx,:) = 0.0
  1216.     endif      
  1217.     out%energy(idx) = (params%gamma - 1.0) * (in%energy(idx) - 0.5 * in%density(idx) * dot_product(tmp, tmp))
  1218.     if (out%energy(idx) < params%eps) then
  1219.       out%energy(idx) = 0.0
  1220.     endif
  1221.   end subroutine
  1222.  
  1223.   subroutine fthetacoef(inp)
  1224.     real(kind=prk) :: minus
  1225.     real(kind=prk) :: x1, x2, x3, y1, y2, y3
  1226.     type(Parameters) :: inp
  1227.     minus = 0.0
  1228.  
  1229. !calc x1-x3,y1-y3:
  1230.  
  1231.     x1 = inp%smoothing_length / 2.0 + (inp%totalcells / 6.0) * inp%smoothing_length
  1232.     x3 = inp%smoothing_length / 2.0 + (inp%totalcells / 3.0) * inp%smoothing_length
  1233.     x2 = (x3 + x1) / 2.0
  1234.     y1 = 0.0
  1235.     y2 = minus
  1236.     y3 = 0.0
  1237.  
  1238. print *, x1, x2, x3
  1239. print *, y1, y2, y3
  1240. !calc coef:
  1241.  
  1242.     inp%ta = (y3 - (x3 * (y2 - y1) + x2 * y1 - x1 * y2) / (x2 - x1) ) / (x3 * (x3 - x1 - x2) + x1 * x2)
  1243.     inp%tb = (y2 - y1) / (x2 - x1) - inp%ta * (x1 + x2)
  1244.     inp%tc = (x2 * y1 - x1 * y2) / (x2 - x1) + inp%ta * x1 * x2
  1245.  
  1246.   end subroutine
  1247.  
  1248.  
  1249. end module CUDASphTvd
  1250.  
  1251. program GPU_cSPH_TVD
  1252.   use CUDASphTvd
  1253.   use cudafor
  1254.   use thrust
  1255.   implicit none
  1256.   type(Parameters) :: inparams
  1257.   type(test), allocatable, device :: z(:)
  1258.   real, allocatable, device :: zz(:)
  1259.   type(test) :: h_z
  1260.   type(test), device :: d_z
  1261.   integer :: istat
  1262.  
  1263. ! ! cuda events for elapsing
  1264. ! type ( cudaEvent ) :: startEvent , stopEvent
  1265. ! real :: time, random
  1266. ! integer :: istat
  1267. !
  1268. ! ! Create events
  1269. ! istat = cudaEventCreate ( startEvent )
  1270. ! istat = cudaEventCreate ( stopEvent )
  1271. !
  1272. !
  1273. ! istat = cudaEventRecord ( startEvent , 0)
  1274. ! call thrustsort(gpuData,size(gpuData))
  1275. !
  1276. ! istat = cudaEventRecord ( stopEvent , 0)
  1277. ! istat = cudaEventSynchronize ( stopEvent )
  1278. ! istat = cudaEventElapsedTime ( time , startEvent , stopEvent )
  1279. !
  1280. ! print *," Sorted array in:",time," (ms)"
  1281.  
  1282.   inparams%smoothing_length = 0.5
  1283.   inparams%dx               = 0.5
  1284.   inparams%courant          = 0.0001
  1285.   inparams%eps              = 0.0000001
  1286. !  host%gamma            = 1.5
  1287.   inparams%gamma            = 1.5
  1288.   inparams%rho_0            = 1
  1289.   inparams%c_0              = 1
  1290.   inparams%p_0              = 1
  1291.   inparams%dimensions       = dimen
  1292.   inparams%Nx               = grid
  1293.   inparams%Ny               = abs(dimen > 1) * grid + abs(dimen /= 2)
  1294.   inparams%Nz               = abs(dimen > 2) * grid + abs(dimen /= 3)
  1295.   inparams%cnst             = 10.0 / 7.0 / 3.1415926535897932384626433832795
  1296.   inparams%totalcells       = inparams%Nx * inparams%Ny * inparams%Nz
  1297.   inparams%neighbourscount  = 3 ** inparams%dimensions
  1298.  
  1299.   if (inparams%dimensions == 1) then
  1300.     inparams%dim3d = 2; inparams%dim2d = 2
  1301.     inparams%Wnx   = 0; inparams%Wny   = 0
  1302.   else if (inparams%dimensions == 2) then
  1303.     inparams%dim3d = 2; inparams%dim2d = 0
  1304.     inparams%Wnx   = 3; inparams%Wny   = 0
  1305.   else if (inparams%dimensions == 3) then
  1306.     inparams%dim3d = 0; inparams%dim2d = 0
  1307.     inparams%Wnx   = 3; inparams%Wny   = 3
  1308.   endif
  1309. !  print *, "dbg!"
  1310.  
  1311.   call calculateSmoothingGradientW0(inparams)
  1312. !  print *, "dbg!"
  1313.  
  1314.   call fthetacoef(inparams)
  1315. print *, inparams%ta, inparams%tb, inparams%tc
  1316.  
  1317.   params = inparams
  1318.  
  1319.   allocate(allfluid, allfluidhalf, allfluidnext, allforces)
  1320.   allocate(allfluxes(dimen))
  1321.   allocate(time_steps(inparams%totalcells))
  1322.   allocate(boolflags(inparams%totalcells))
  1323.   allocate(tau)
  1324.  
  1325.   print *, "START!"
  1326.   call cuda_run(0.01, 30.0, 0.0)
  1327.   print *, "DONE CUDA!"
  1328. !Mach : |u|/Cs
  1329. end program GPU_cSPH_TVD
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement