Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- module functions
- integer, parameter :: LUL=128
- integer, parameter :: N=LUL, M=LUL, K=LUL
- integer, parameter :: grid=N !assume we have "cube" array
- integer, parameter :: dimen=3
- integer, parameter :: prk=4 !float is 4, double is 8
- integer, parameter :: BLOCK_SIZE=16
- integer, parameter :: ITERATIONS=10000000
- type Fluids3D
- real(kind=prk) :: density(N, M, K), energy(N, M, K)
- real(kind=prk) :: U(N, M, K, dimen)
- real(kind=prk) :: r(N, M, K, dimen)
- end type
- type Fluids
- real(kind=prk) :: density(grid**dimen), energy(grid**dimen)
- real(kind=prk) :: U(grid**dimen,dimen)
- real(kind=prk) :: r(grid**dimen,dimen)
- end type
- contains
- attributes(global) subroutine calculate3D(in, innext, outhalf)
- type(Fluids3D), device :: in, outhalf, innext
- integer :: i, j, k
- i = threadIdx%z + (blockIdx%z - 1) * blockDim%z
- j = threadIdx%y + (blockIdx%y - 1) * blockDim%y
- k = threadIdx%x + (blockIdx%x - 1) * blockDim%x
- outhalf%density(i, j, k) = ( in%density(i, j, k) + innext%density(i, j, k) ) / 2.0
- outhalf%energy(i, j, k) = ( in%energy(i, j, k) + innext%energy(i, j, k) ) / 2.0
- outhalf%r(i, j, k, :) = ( in%r(i, j, k, :) + innext%r(i, j, k, :) ) / 2.0
- outhalf%U(i, j, k, :) = ( in%U(i, j, k, :) + innext%U(i, j, k, :) ) / 2.0
- end subroutine calculate3D
- attributes(global) subroutine calculate1D(in, innext, outhalf)
- type(Fluids), device :: in, outhalf, innext
- integer :: i, j, k, idx
- i = threadIdx%z + (blockIdx%z - 1) * blockDim%z - 1
- j = threadIdx%y + (blockIdx%y - 1) * blockDim%y - 1
- k = threadIdx%x + (blockIdx%x - 1) * blockDim%x - 1
- idx = M * K * i + K * j + k + 1
- outhalf%density(idx) = ( in%density(idx) + innext%density(idx) ) / 2.0
- outhalf%energy(idx) = ( in%energy(idx) + innext%energy(idx) ) / 2.0
- outhalf%r(idx,:) = ( in%r(idx,:) + innext%r(idx,:) ) / 2.0
- outhalf%U(idx,:) = ( in%U(idx,:) + innext%U(idx,:) ) / 2.0
- end subroutine calculate1D
- end module functions
- program GPU
- use cudafor
- use cudadevice
- use functions
- implicit none
- integer :: istat, counter=1
- type(Fluids3D), allocatable, device :: DIM3_dev1, DIM3_dev2, DIM3_dev3
- type(Fluids), allocatable, device :: DIM1_dev1, DIM1_dev2, DIM1_dev3
- type(Fluids3D), allocatable :: DIM3_host1, DIM3_host2, DIM3_host3
- type(Fluids), allocatable :: DIM1_host1, DIM1_host2, DIM1_host3
- type(dim3) :: blocks, threads
- type (cudaEvent) :: startEvent, stopEvent, dummyEvent
- real(kind=4) :: time
- allocate(DIM3_dev1, DIM3_dev2, DIM3_dev3)
- allocate(DIM3_host1, DIM3_host2, DIM3_host3)
- allocate(DIM1_dev1, DIM1_dev2, DIM1_dev3)
- allocate(DIM1_host1, DIM1_host2, DIM1_host3)
- call random_seed
- call random_number(DIM1_host1%density)
- call random_number(DIM1_host1%energy)
- call random_number(DIM1_host1%U)
- call random_number(DIM1_host1%r)
- call random_number(DIM1_host2%density)
- call random_number(DIM1_host2%energy)
- call random_number(DIM1_host2%U)
- call random_number(DIM1_host2%r)
- call random_number(DIM1_host3%density)
- call random_number(DIM1_host3%energy)
- call random_number(DIM1_host3%U)
- call random_number(DIM1_host3%r)
- call random_number(DIM3_host1%density)
- call random_number(DIM3_host1%energy)
- call random_number(DIM3_host1%U)
- call random_number(DIM3_host1%r)
- call random_number(DIM3_host2%density)
- call random_number(DIM3_host2%energy)
- call random_number(DIM3_host2%U)
- call random_number(DIM3_host2%r)
- call random_number(DIM3_host3%density)
- call random_number(DIM3_host3%energy)
- call random_number(DIM3_host3%U)
- call random_number(DIM3_host3%r)
- DIM3_dev1 = DIM3_host1
- DIM3_dev2 = DIM3_host2
- DIM3_dev3 = DIM3_host3
- DIM1_dev1 = DIM1_host1
- DIM1_dev2 = DIM1_host2
- DIM1_dev3 = DIM1_host3
- if (dimen == 1) then
- if ( mod(N,BLOCK_SIZE) /= 0) then
- blocks = dim3(N / BLOCK_SIZE + 1, 1, 1)
- else
- blocks = dim3(N / BLOCK_SIZE, 1, 1)
- endif
- threads = dim3(BLOCK_SIZE, 1, 1)
- else if (dimen == 2) then
- if ( (mod(N,BLOCK_SIZE) /= 0) .OR. (mod(K,BLOCK_SIZE) /= 0)) then
- blocks = dim3(N / BLOCK_SIZE + 1, K / BLOCK_SIZE + 1, 1)
- else
- blocks = dim3(N / BLOCK_SIZE, K / BLOCK_SIZE, 1)
- endif
- threads = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
- else
- if ( (mod(N,BLOCK_SIZE) /= 0) .OR. (mod(K,BLOCK_SIZE) /= 0) .OR. (mod(M,BLOCK_SIZE) /= 0)) then
- blocks = dim3(N / BLOCK_SIZE + 1, K / BLOCK_SIZE + 1, BLOCK_SIZE + 1)
- else
- blocks = dim3(N / BLOCK_SIZE, K / BLOCK_SIZE, M / BLOCK_SIZE)
- endif
- threads = dim3(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
- endif
- print *, "START CUDA 3D!"
- ! run with time step 0.01,
- istat = cudaEventCreate ( startEvent )
- istat = cudaEventCreate ( stopEvent )
- istat = cudaEventRecord ( startEvent , 0)
- do while (counter < ITERATIONS)
- if (mod(counter, ITERATIONS/10) == 0) then
- print *, int(real(counter)/real(ITERATIONS)*100.0), '% done'
- endif
- call calculate3D<<<blocks, threads>>>(DIM3_dev1, DIM3_dev2, DIM3_dev3)
- call calculate3D<<<blocks, threads>>>(DIM3_dev2, DIM3_dev1, DIM3_dev3)
- call calculate3D<<<blocks, threads>>>(DIM3_dev3, DIM3_dev2, DIM3_dev1)
- counter = counter+1
- enddo
- istat = cudaEventRecord ( stopEvent , 0)
- istat = cudaEventSynchronize ( stopEvent )
- istat = cudaEventElapsedTime ( time,startEvent,stopEvent )
- print *, "DONE CUDA 3D! Time:", time
- print *, "START CUDA 1D!"
- istat = cudaEventRecord ( startEvent , 0)
- counter = 1
- do while (counter < ITERATIONS)
- if (mod(counter, ITERATIONS/10) == 0) then
- print *, int(real(counter)/real(ITERATIONS)*100.0), '% done'
- endif
- call calculate1D<<<blocks, threads>>>(DIM1_dev1, DIM1_dev2, DIM1_dev3)
- call calculate1D<<<blocks, threads>>>(DIM1_dev2, DIM1_dev1, DIM1_dev3)
- call calculate1D<<<blocks, threads>>>(DIM1_dev3, DIM1_dev2, DIM1_dev1)
- counter = counter+1
- enddo
- istat = cudaEventRecord ( stopEvent , 0)
- istat = cudaEventSynchronize ( stopEvent )
- istat = cudaEventElapsedTime ( time,startEvent,stopEvent )
- print *, "DONE CUDA 1D! Time:", time
- deallocate(DIM3_dev1, DIM3_dev2, DIM3_dev3)
- deallocate(DIM3_host1, DIM3_host2, DIM3_host3)
- deallocate(DIM1_dev1, DIM1_dev2, DIM1_dev3)
- deallocate(DIM1_host1, DIM1_host2, DIM1_host3)
- end program GPU
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement