Untitled

module functions

    integer, parameter :: LUL=128
    integer, parameter :: N=LUL, M=LUL, K=LUL
    integer, parameter :: grid=N  !assume we have "cube" array
    integer, parameter :: dimen=3
    integer, parameter :: prk=4   !float is 4, double is 8
    integer, parameter :: BLOCK_SIZE=16
    integer, parameter :: ITERATIONS=10000000

    type Fluids3D
        real(kind=prk) :: density(N, M, K), energy(N, M, K)
        real(kind=prk) :: U(N, M, K, dimen)
        real(kind=prk) :: r(N, M, K, dimen)
    end type

    type Fluids
        real(kind=prk) :: density(grid**dimen), energy(grid**dimen)
        real(kind=prk) :: U(grid**dimen,dimen)
        real(kind=prk) :: r(grid**dimen,dimen)
    end type

    contains

    attributes(global) subroutine calculate3D(in, innext, outhalf)
        type(Fluids3D), device :: in, outhalf, innext
        integer :: i, j, k

        i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z
        j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y
        k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x

        outhalf%density(i, j, k) = ( in%density(i, j, k) + innext%density(i, j, k) ) / 2.0
        outhalf%energy(i, j, k)  = ( in%energy(i, j, k)  + innext%energy(i, j, k) )  / 2.0
        outhalf%r(i, j, k, :)    = ( in%r(i, j, k, :)    + innext%r(i, j, k, :) )    / 2.0
        outhalf%U(i, j, k, :)    = ( in%U(i, j, k, :)    + innext%U(i, j, k, :) )    / 2.0
    end subroutine calculate3D

    attributes(global) subroutine calculate1D(in, innext, outhalf)
        type(Fluids), device :: in, outhalf, innext
        integer :: i, j, k, idx

        i   = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
        j   = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
        k   = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1

        idx = M * K * i + K * j + k + 1
        outhalf%density(idx) = ( in%density(idx) + innext%density(idx) ) / 2.0
        outhalf%energy(idx)  = ( in%energy(idx)  + innext%energy(idx) )  / 2.0
        outhalf%r(idx,:)     = ( in%r(idx,:)     + innext%r(idx,:) )     / 2.0
        outhalf%U(idx,:)     = ( in%U(idx,:)     + innext%U(idx,:) )     / 2.0
    end subroutine calculate1D
end module functions

program GPU
    use cudafor
    use cudadevice
    use functions
    implicit none
    integer :: istat, counter=1
    type(Fluids3D), allocatable, device :: DIM3_dev1, DIM3_dev2, DIM3_dev3
    type(Fluids), allocatable, device   :: DIM1_dev1, DIM1_dev2, DIM1_dev3
    type(Fluids3D), allocatable         :: DIM3_host1, DIM3_host2, DIM3_host3
    type(Fluids), allocatable           :: DIM1_host1, DIM1_host2, DIM1_host3
    type(dim3) :: blocks, threads
    type (cudaEvent) :: startEvent, stopEvent, dummyEvent
    real(kind=4) :: time

    allocate(DIM3_dev1, DIM3_dev2, DIM3_dev3)
    allocate(DIM3_host1, DIM3_host2, DIM3_host3)
    allocate(DIM1_dev1, DIM1_dev2, DIM1_dev3)
    allocate(DIM1_host1, DIM1_host2, DIM1_host3)

    call random_seed
    call random_number(DIM1_host1%density)
    call random_number(DIM1_host1%energy)
    call random_number(DIM1_host1%U)
    call random_number(DIM1_host1%r)
    call random_number(DIM1_host2%density)
    call random_number(DIM1_host2%energy)
    call random_number(DIM1_host2%U)
    call random_number(DIM1_host2%r)
    call random_number(DIM1_host3%density)
    call random_number(DIM1_host3%energy)
    call random_number(DIM1_host3%U)
    call random_number(DIM1_host3%r)
    call random_number(DIM3_host1%density)
    call random_number(DIM3_host1%energy)
    call random_number(DIM3_host1%U)
    call random_number(DIM3_host1%r)
    call random_number(DIM3_host2%density)
    call random_number(DIM3_host2%energy)
    call random_number(DIM3_host2%U)
    call random_number(DIM3_host2%r)
    call random_number(DIM3_host3%density)
    call random_number(DIM3_host3%energy)
    call random_number(DIM3_host3%U)
    call random_number(DIM3_host3%r)

    DIM3_dev1 = DIM3_host1
    DIM3_dev2 = DIM3_host2
    DIM3_dev3 = DIM3_host3
    DIM1_dev1 = DIM1_host1
    DIM1_dev2 = DIM1_host2
    DIM1_dev3 = DIM1_host3

    if (dimen == 1) then
        if ( mod(N,BLOCK_SIZE) /= 0) then
            blocks = dim3(N / BLOCK_SIZE + 1, 1, 1)
        else
            blocks = dim3(N / BLOCK_SIZE, 1, 1)
        endif
        threads = dim3(BLOCK_SIZE, 1, 1)
    else if (dimen == 2) then
        if ( (mod(N,BLOCK_SIZE) /= 0) .OR. (mod(K,BLOCK_SIZE) /= 0)) then
            blocks = dim3(N / BLOCK_SIZE + 1, K / BLOCK_SIZE + 1, 1)
        else
            blocks = dim3(N / BLOCK_SIZE, K / BLOCK_SIZE, 1)
        endif
        threads = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    else
        if ( (mod(N,BLOCK_SIZE) /= 0) .OR. (mod(K,BLOCK_SIZE) /= 0) .OR. (mod(M,BLOCK_SIZE) /= 0)) then
            blocks = dim3(N / BLOCK_SIZE + 1, K / BLOCK_SIZE + 1, BLOCK_SIZE + 1)
        else
            blocks = dim3(N / BLOCK_SIZE, K / BLOCK_SIZE, M / BLOCK_SIZE)
        endif
        threads = dim3(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    endif

    print *, "START CUDA 3D!"
    ! run with time step 0.01,

    istat = cudaEventCreate ( startEvent )
    istat = cudaEventCreate ( stopEvent )

    istat = cudaEventRecord ( startEvent , 0)

    do while (counter < ITERATIONS)
        if (mod(counter, ITERATIONS/10) == 0) then
            print *, int(real(counter)/real(ITERATIONS)*100.0), '% done'
        endif
        call calculate3D<<<blocks, threads>>>(DIM3_dev1, DIM3_dev2, DIM3_dev3)
        call calculate3D<<<blocks, threads>>>(DIM3_dev2, DIM3_dev1, DIM3_dev3)
        call calculate3D<<<blocks, threads>>>(DIM3_dev3, DIM3_dev2, DIM3_dev1)
        counter = counter+1
    enddo

    istat = cudaEventRecord ( stopEvent , 0)
    istat = cudaEventSynchronize ( stopEvent )
    istat = cudaEventElapsedTime ( time,startEvent,stopEvent )
    print *, "DONE CUDA 3D! Time:", time

    print *, "START CUDA 1D!"

    istat = cudaEventRecord ( startEvent , 0)

    counter = 1

    do while (counter < ITERATIONS)
        if (mod(counter, ITERATIONS/10) == 0) then
            print *, int(real(counter)/real(ITERATIONS)*100.0), '% done'
        endif
        call calculate1D<<<blocks, threads>>>(DIM1_dev1, DIM1_dev2, DIM1_dev3)
        call calculate1D<<<blocks, threads>>>(DIM1_dev2, DIM1_dev1, DIM1_dev3)
        call calculate1D<<<blocks, threads>>>(DIM1_dev3, DIM1_dev2, DIM1_dev1)
        counter = counter+1
    enddo

    istat = cudaEventRecord ( stopEvent , 0)
    istat = cudaEventSynchronize ( stopEvent )
    istat = cudaEventElapsedTime ( time,startEvent,stopEvent )
    print *, "DONE CUDA 1D! Time:", time

    deallocate(DIM3_dev1, DIM3_dev2, DIM3_dev3)
    deallocate(DIM3_host1, DIM3_host2, DIM3_host3)
    deallocate(DIM1_dev1, DIM1_dev2, DIM1_dev3)
    deallocate(DIM1_host1, DIM1_host2, DIM1_host3)

end program GPU