Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ! --------------------------------------------------------------------
- MODULE kinds
- INTEGER, PARAMETER :: RK8 = SELECTED_REAL_KIND(15, 300)
- END MODULE kinds
- ! --------------------------------------------------------------------
- module arrayMod
- ! BIG ARRAY
- REAL(KIND=8),dimension(:,:),allocatable :: pool3
- REAL(KIND=8),dimension(:,:),ALLOCATABLE :: a3,b ! a3 working matrices
- ! small ARRAY pool (read-only in parallel small matrix tests)
- REAL(KIND=8),dimension(:,:,:),allocatable :: pool
- end module arrayMod
- module functioninv
- use arrayMod
- REAL(KIND=8), dimension(:,:) ,ALLOCATABLE :: Ainv
- common /allocateds/ allocated
- integer allocated
- contains
- function inv(A) result(Ainv)
- REAL(KIND=8), dimension(:,:), intent(in) :: A
- REAL(KIND=8), dimension(:,:) ,ALLOCATABLE :: Ainv
- REAL(KIND=8), ALLOCATABLE :: work(:)
- integer, dimension(size(A,1)) :: ipiv ! pivot indices
- integer :: n, info,stat
- ! Store A in Ainv to prevent it from being overwritten by LAPACK
- Ainv = A
- n = size(A,1)
- lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
- !write ( *, '(a,i6)' ) ' Optimized work matrix order = ', lwork
- allocate (work(lwork))
- ! DGETRF computes an LU factorization of a general M-by-N matrix A
- ! using partial pivoting with row interchanges.
- call DGETRF(n, n, Ainv, n, ipiv, info)
- if (info /= 0) then
- stop 'Matrix is numerically singular!'
- end if
- ! DGETRI computes the inverse of a matrix using the LU factorization
- ! computed by DGETRF.
- call DGETRI(n, Ainv, n, ipiv, work, lwork, info)
- if (info /= 0) then
- stop 'Matrix inversion failed!'
- end if
- deallocate (work)
- end function inv
- end module functioninv
- !------------
- PROGRAM TEST_FPU_NOSTACK ! A number-crunching benchmark using matrix inversion.
- USE kinds ! Implemented by: David Frank [email protected]
- USE functioninv ! Add for function Blas as example not as subroutine
- USE arrayMod ! For stack overflow BIG MATRIX
- USE OMP_LIB ! Use OpenMP library for multi-threading
- ! Crout routine by: James Van Buskirk [email protected]
- ! Lapack routine by: Jos Bergervoet [email protected]
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- ! - - - local variables - - -
- REAL(KIND=8) :: avg_err, dt(20),avg_err2
- INTEGER :: i, n, t(8), clock1, clock2, rate,j,k
- ! Declarations moved to the declaration section
- REAL(KIND=8), ALLOCATABLE :: a_local(:,:) ! Thread-private working matrix
- REAL(KIND=8), ALLOCATABLE :: a_final_smallits(:,:) ! To store the result for pool(:,:,smallits)
- smallsize=250 !250
- smallits=1000 ! RJA initial values were 101 and 1000 !1000
- bigsize=2000 ! RJA initial value was 1000 then 2000
- Call arraySub
- WRITE (*,*) ' Benchmark running, hopefully as only ACTIVE task'
- WRITE (*,*) ' Number of OpenMP threads: ', omp_get_max_threads()
- call arrayFILL
- ! - - - begin benchmark - - -
- dt = 0.d0
- DO n = 1,9
- CALL SYSTEM_CLOCK (clock1,rate) ! get benchmark (n) start time
- SELECT CASE (n)
- CASE (1)
- ! Test 1: Gauss - Multi-threaded over smallits, SIMD within Gauss
- ! Use OpenMP to parallelize the loop over 'smallits'
- ! Each thread will process a different matrix from the 'pool'
- ! Allocate a_final_smallits before the parallel region
- ALLOCATE(a_final_smallits(smallsize, smallsize))
- !$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits) DEFAULT(SHARED)
- DO i = 1,smallits
- ! a_local is declared outside and is PRIVATE, OpenMP handles allocation/deallocation per thread
- ALLOCATE(a_local(smallsize, smallsize)) ! Allocate thread-private a_local
- a_local = pool(:,:,i) ! get next matrix to invert (thread-safe read from pool)
- CALL Gauss_blas(a_local, smallsize) ! invert a_local
- CALL Gauss_blas(a_local, smallsize) ! invert a_local back
- ! If this is the last matrix (index smallits), store its result
- IF (i == smallits) THEN
- a_final_smallits = a_local
- END IF
- DEALLOCATE(a_local) ! Deallocate thread-private a_local
- END DO
- !$OMP END PARALLEL DO
- ! Calculate error using the result for the matrix at index 'smallits'
- avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
- DEALLOCATE(a_final_smallits)
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test1: Gauss1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- CASE (2)
- ! Test 2: Crout - Multi-threaded over smallits, but serial within Crout
- ALLOCATE(a_final_smallits(smallsize, smallsize))
- ! Disable nested parallelism for this case
- !$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits)
- DO i = 1,smallits
- ALLOCATE(a_local(smallsize, smallsize))
- a_local = pool(:,:,i)
- CALL Croutblasted(a_local, smallsize)
- CALL Croutblasted(a_local, smallsize)
- IF (i == smallits) a_final_smallits = a_local
- DEALLOCATE(a_local)
- END DO
- !$OMP END PARALLEL DO
- ! Calculate error using the result for the matrix at index 'smallits'
- avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
- DEALLOCATE(a_final_smallits)
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test2: CROUT1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- CASE (3)
- if(bigsize.le.2500) then
- i=1
- call FILL_a(i,bigsize) ! get bigsize x bigsize matrix
- ! Note: Crout for bigsize is still serial in this implementation.
- ! Parallelizing Crout itself is more complex (like the Julia threaded solve).
- ! Adding SIMD directives to inner loops might offer some speedup.
- print *, 'Calling Croutmt- 1 Bigsize'
- CALL crout_inverse_threaded3(a3, bigsize) ! invert a3
- print *, 'Calling Croutmt- 2 Bigsize'
- CALL crout_inverse_threaded3(a3, bigsize) ! invert a3
- !call avgerr2(avg_err)
- avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize) ! invert err.
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test3: CROUTMT- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- else
- print *, 'Calling Crout - Bgsize cancelled, too long to solve. Next: LAPACK Solver'
- endif
- CASE (4)
- i=1
- call FILL_a(i,bigsize) ! get bigsize x bigsize matrix to a3
- CALL Lapack (bigsize) ! invert a3
- CALL Lapack (bigsize) ! invert a3
- avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize) ! invert err.
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test4: DGETRF - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- CASE (5)
- i=1
- call FILL_a(i,bigsize) ! get bigsize x bigsize matrix to a3
- CALL DGEtrf2call (bigsize) ! invert a3
- CALL DGEtrf2call (bigsize) ! invert a3
- avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize) ! invert err.
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test5: DGETRF2- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- CASE (6)
- i=1
- call FILL_a(i,bigsize)
- CALL Lapack2 (bigsize)
- CALL Lapack2 (bigsize)
- avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize) ! invert err.
- CALL SYSTEM_CLOCK (clock2,rate)
- dt(n) = (clock2-clock1)/DBLE(rate) ! get benchmark (n) elapsed sec.
- ! Adjusted format for error
- WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
- "Test6: DGESV - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds Err=', avg_err
- CASE (7)
- CASE (8)
- CASE (9)
- END SELECT
- END DO ! for test 1-4
- WRITE (*,92) ' Total =',SUM(dt), ' sec'
- WRITE (*,*)
- 92 FORMAT (A,F5.1,A,F18.15)
- call arraygone()
- END PROGRAM TEST_FPU_NOSTACK
- ! allocating
- subroutine arraySub
- use arrayMod
- use functioninv
- implicit none
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- integer :: stat
- !write(*,*) 'Enter number of rows'
- !read *,row
- !write(*,*) 'Enter number of cols'
- !read *,col
- write(*,*) 'Inside arraySub() allocating memory ...'
- allocate (pool(smallsize,smallsize,smallits),stat=stat)
- if (stat/=0) stop 'Cannot allocate memory 1'
- allocate (pool3(bigsize,bigsize),stat=stat )
- if (stat/=0) stop 'Cannot allocate memory 2'
- allocate (a3(bigsize,bigsize),stat=stat )
- if (stat/=0) stop 'Cannot allocate memory 4'
- allocate (b(bigsize,bigsize),stat=stat )
- if (stat/=0) stop 'Cannot allocate memory 5'
- if (stat/=0) stop 'Cannot allocate memory'
- pool3 = 0.d0
- pool = 0.d0
- ! a = 0.d0 ! No longer global
- a3 = 0.d0
- print *, 'Memory allocation sufficient'
- end subroutine arraySub
- subroutine arraygone
- use arrayMod
- use functioninv
- implicit none
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- integer :: stat
- ! Deallocate only the globally allocated arrays
- deallocate (pool,pool3,a3,b)
- !deallocate (Ainv) ! Uncomment if Ainv was allocated globally
- end subroutine arraygone
- subroutine arrayFILL
- use arrayMod
- implicit none
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- integer :: t(8)
- CALL DATE_AND_TIME ( values = t )
- CALL RANDOM_SEED() ! set seed to random number based on time
- CALL RANDOM_NUMBER(pool) ! fill pool with random data ( 0. -> 1. )
- CALL RANDOM_NUMBER(pool3) ! fill pool with random data ( 0. -> 1. )
- end subroutine arrayfill
- subroutine FILL_a(i,n)
- use arrayMod
- implicit none
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- integer i,j,k,n
- ! This subroutine is primarily used for the big matrix case now.
- ! Small matrices are copied directly from 'pool' to thread-local arrays.
- if (n.eq.bigsize) then
- a3 = pool3
- endif
- end subroutine fill_a
- ! avgerr and avgerr2 are no longer needed as separate subroutines
- ! as the error calculation is done in the main program.
- ! SUBROUTINE avgerr(avg_err)
- ! use arrayMod
- ! IMPLICIT NONE
- ! real*8 :: avg_err
- ! integer :: smallsize,bigsize,smallits
- ! common smallsize,bigsize,smallits
- ! avg_err = SUM(ABS(a-pool(:,:,smallits)))/(smallsize*smallsize) ! last matrix error
- ! end SUBROUTINE avgerr
- ! --------------------------------------------------------------------
- SUBROUTINE Gauss (a_matrix, n) ! Invert matrix by Gauss method
- ! Modified to accept matrix as argument and use thread-local bsmall
- ! --------------------------------------------------------------------
- IMPLICIT NONE
- INTEGER, INTENT(IN) :: n
- REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert
- ! - - - Local Variables - - -
- REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
- REAL(KIND=8):: c, d, temp(n)
- INTEGER :: i, j, k, m, imax(1), ipvt(n)
- ! - - - - - - - - - - - - - -
- bsmall = a_matrix
- ipvt = (/ (i, i = 1, n) /)
- DO k = 1,n
- imax = MAXLOC(ABS(bsmall(k:n,k)))
- m = k-1+imax(1)
- IF (m /= k) THEN
- ipvt( (/m,k/) ) = ipvt( (/k,m/) )
- bsmall((/m,k/),:) = bsmall((/k,m/),:)
- END IF
- d = 1/bsmall(k,k)
- temp = bsmall(:,k)
- ! Apply SIMD directive to the inner loop for vectorization
- !$OMP SIMD
- DO j = 1, n
- c = bsmall(k,j)*d
- bsmall(:,j) = bsmall(:,j)-temp*c
- bsmall(k,j) = c
- END DO
- bsmall(:,k) = temp*(-d)
- bsmall(k,k) = d
- END DO
- a_matrix(:,ipvt) = bsmall
- END SUBROUTINE Gauss
- SUBROUTINE Gauss_BLAS(a_matrix, n)
- USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_DOUBLE
- IMPLICIT NONE
- INTEGER, INTENT(IN) :: n
- REAL(C_DOUBLE), DIMENSION(n,n), INTENT(INOUT) :: a_matrix
- REAL(C_DOUBLE), DIMENSION(n,n) :: bsmall
- REAL(C_DOUBLE) :: c, d
- REAL(C_DOUBLE), DIMENSION(n) :: temp
- INTEGER :: i, j, k, m, imax(1), ipvt(n)
- bsmall = a_matrix
- ipvt = (/ (i, i = 1, n) /)
- DO k = 1, n
- imax = MAXLOC(ABS(bsmall(k:n,k)))
- m = k - 1 + imax(1)
- IF (m /= k) THEN
- ipvt((/m, k/)) = ipvt((/k, m/))
- bsmall((/m, k/), :) = bsmall((/k, m/), :)
- END IF
- d = 1.0D0 / bsmall(k,k)
- ! temp = bsmall(:,k)
- CALL DCOPY(n, bsmall(1,k), 1, temp, 1)
- !$OMP SIMD
- DO j = 1, n
- c = bsmall(k,j) * d
- ! bsmall(:,j) = bsmall(:,j) - temp * c
- CALL DAXPY(n, -c, temp, 1, bsmall(1,j), 1)
- bsmall(k,j) = c
- END DO
- ! bsmall(:,k) = temp * (-d)
- CALL DCOPY(n, temp, 1, bsmall(1,k), 1)
- CALL DSCAL(n, -d, bsmall(1,k), 1)
- bsmall(k,k) = d
- END DO
- ! Apply column permutation
- a_matrix(:, ipvt) = bsmall
- END SUBROUTINE Gauss_BLAS
- ! -------------------------------------------------------------------
- SUBROUTINE Crout (a_matrix, n) ! Invert matrix by Crout method not using BLAS
- ! Modified to accept matrix as argument and use thread-local bsmall
- ! Optimized with SIMD directives where applicable.
- ! This version is for small matrices processed in parallel.
- ! -------------------------------------------------------------------
- IMPLICIT NONE
- INTEGER, INTENT(IN) :: n
- REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert
- ! - - - Local Variables - - -
- REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
- INTEGER :: i, j, m, imax(1) ! Current row & column, max pivot loc
- INTEGER :: index(n) ! Partial pivot record
- REAL(KIND=8) :: temp(n) ! working arrays, temp
- index = (/(i,i=1,n)/) ! initialize column index
- DO j = 1, n ! Shuffle matrix a -> bsmall
- DO i = 1, j-1
- bsmall(i, j) = a_matrix(i, j)
- END DO
- DO i = j, n
- bsmall(i, j) = a_matrix(n+1-j, i+1-j)
- END DO
- END DO
- DO j = 1, n ! LU decomposition; reciprocals of diagonal elements in L matrix
- DO i = j, n ! Get current column of L matrix
- ! DOT_PRODUCT is usually auto-vectorized. Explicit SIMD might not be needed or helpful.
- bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i)-DOT_PRODUCT(bsmall(n+1-i:n-i+j-1,n+1-i), bsmall(1:j-1,j))
- END DO
- imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))
- m = imax(1)
- bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)
- IF (m /= n+1-j) THEN ! Swap biggest element to current pivot position
- index((/j,n+1-m/)) = index((/n+1-m,j/))
- bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
- temp(1:n+1-m) = bsmall(m:n, m)
- bsmall(m:j-1+m, m) = bsmall(n+1-j:n, n+1-j)
- bsmall(j+m:n, m) = bsmall(j, j+1:n+1-m)
- bsmall(n+1-j:n, n+1-j) = temp(1:j)
- bsmall(j, j+1:n+1-m) = temp(j+1:n+1-m)
- END IF
- DO i = j+1, n ! Get current row of U matrix
- ! DOT_PRODUCT is usually auto-vectorized.
- bsmall(j,i) = bsmall(n,n+1-j)*( bsmall(j,i)-DOT_PRODUCT( bsmall(n+1-j:n-1,n+1-j), bsmall(1:j-1,i)))
- END DO
- END DO
- DO j = 1, n-1 ! Invert L matrix
- temp(1) = bsmall(n, n+1-j)
- DO i = j+1, n
- ! DOT_PRODUCT is usually auto-vectorized.
- bsmall(n-i+j,n+1-i) = -DOT_PRODUCT(bsmall(n-i+j:n-1,n+1-i),temp(1:i-j))*bsmall(n,n+1-i)
- temp(i-j+1) = bsmall(n-i+j,n+1-i)
- END DO
- END DO
- DO i = 1, (n+1)/3 ! Reshuffle matrix
- temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
- DO j = 2*i, n+1-i
- bsmall(j, i) = bsmall(n+i-j, n+1-j)
- END DO
- DO j = i, n+1-2*i
- bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
- END DO
- bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
- END DO
- DO i = 1, n-1 ! Invert U matrix
- DO j = i+1, n
- ! DOT_PRODUCT is usually auto-vectorized.
- bsmall(i,j) = -bsmall(i,j)-DOT_PRODUCT(temp(1:j-i-1), bsmall(i+1:j-1,j))
- temp(j-i) = bsmall(i,j)
- END DO
- END DO
- DO i = 1, n-1 ! Multiply inverses in reverse order
- temp(1:n-i) = bsmall(i,i+1:n)
- ! Apply SIMD directive to the inner loops for vectorization
- !$OMP SIMD
- DO j = 1,i
- ! DOT_PRODUCT is usually auto-vectorized.
- bsmall(i,j) = bsmall(i,j)+DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
- END DO
- !$OMP SIMD
- DO j = i+1, n
- bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
- END DO
- END DO
- a_matrix(:,index) = bsmall ! output straightened columns of the inverse
- END SUBROUTINE Crout
- ! -------------------------------------------------------------------
- SUBROUTINE Croutblasted (a_matrix, n) ! Invert matrix by Crout method
- USE kinds
- IMPLICIT NONE
- INTEGER, INTENT(IN) :: n
- REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix
- ! BLAS Interfaces
- INTERFACE
- SUBROUTINE DGEMV(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY)
- USE kinds
- CHARACTER(LEN=1), INTENT(IN) :: TRANS
- INTEGER, INTENT(IN) :: M, N, LDA, INCX, INCY
- REAL(KIND=8), INTENT(IN) :: ALPHA, BETA
- REAL(KIND=8), INTENT(IN) :: A(LDA,*), X(*)
- REAL(KIND=8), INTENT(INOUT) :: Y(*)
- END SUBROUTINE DGEMV
- REAL(KIND=8) FUNCTION DDOT(N, DX, INCX, DY, INCY)
- USE kinds
- INTEGER, INTENT(IN) :: N, INCX, INCY
- REAL(KIND=8), INTENT(IN) :: DX(*), DY(*)
- END FUNCTION DDOT
- SUBROUTINE DSWAP(N, DX, INCX, DY, INCY)
- USE kinds
- INTEGER, INTENT(IN) :: N, INCX, INCY
- REAL(KIND=8), INTENT(INOUT) :: DX(*), DY(*)
- END SUBROUTINE DSWAP
- END INTERFACE
- ! Constants
- REAL(KIND=8), PARAMETER :: one = 1.0_RK8
- REAL(KIND=8), PARAMETER :: zero = 0.0_RK8
- ! Rest of your implementation...
- REAL(KIND=8), DIMENSION(n,n) :: bsmall
- INTEGER :: i, j, m, imax(1) ! Current row & column, max pivot loc
- INTEGER :: index(n) ! Partial pivot record
- REAL(KIND=8) :: temp(n) ! working arrays, temp
- !!!! STEP 1: Matrix Shuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
- index = (/(i,i=1,n)/)
- DO j = 1, n
- DO i = 1, j-1
- bsmall(i, j) = a_matrix(i, j)
- END DO
- DO i = j, n
- bsmall(i, j) = a_matrix(n+1-j, i+1-j)
- END DO
- END DO
- DO j = 1, n ! LU decomposition; reciprocals of diagonal elements in L matrix
- !!!! STEP 2: LU Decomposition - PARTIALLY OPTIMIZED
- !!!! DONE: L column computation with DDOT
- DO i = j, n
- bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i) - &
- DDOT(j-1, bsmall(n+1-i:n-i+j-1,n+1-i), 1, bsmall(1:j-1,j), 1)
- END DO
- imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))
- m = imax(1)
- bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)
- IF (m /= n+1-j) THEN ! Swap biggest element to current pivot position
- index((/j,n+1-m/)) = index((/n+1-m,j/))
- bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
- temp(1:n+1-m) = bsmall(m:n, m)
- bsmall(m:j-1+m, m) = bsmall(n+1-j:n, n+1-j)
- bsmall(j+m:n, m) = bsmall(j, j+1:n+1-m)
- bsmall(n+1-j:n, n+1-j) = temp(1:j)
- bsmall(j, j+1:n+1-m) = temp(j+1:n+1-m)
- END IF
- !!!! TODO: U row computation - Current uses DOT_PRODUCT
- ! Potential optimization: Replace with DDOT
- DO i = j+1, n
- bsmall(j,i) = bsmall(n,n+1-j)*(bsmall(j,i) - &
- DDOT(j-1, bsmall(n+1-j:n-1,n+1-j), 1, bsmall(1:j-1,i), 1))
- END DO
- END DO
- !!!! STEP 3: L Inversion - DONE with DDOT
- DO j = 1, n-1
- temp(1) = bsmall(n, n+1-j)
- DO i = j+1, n
- bsmall(n-i+j,n+1-i) = -DDOT(i-j, bsmall(n-i+j:n-1,n+1-i), 1, temp(1), 1) * bsmall(n,n+1-i)
- temp(i-j+1) = bsmall(n-i+j,n+1-i)
- END DO
- END DO
- !!!! STEP 4: Matrix Reshuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
- DO i = 1, (n+1)/3
- temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
- DO j = 2*i, n+1-i
- bsmall(j, i) = bsmall(n+i-j, n+1-j)
- END DO
- DO j = i, n+1-2*i
- bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
- END DO
- bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
- END DO
- !!!! STEP 5: U Inversion - DONE with DDOT
- DO i = 1, n-1
- DO j = i+1, n
- bsmall(i,j) = -bsmall(i,j) - DDOT(j-i-1, temp(1), 1, bsmall(i+1,j), 1)
- temp(j-i) = bsmall(i,j)
- END DO
- END DO
- !!!! STEP 6: Final Multiplication - TODO: Current uses DOT_PRODUCT
- ! Potential optimization: Replace with DGEMV
- DO i = 1, n-1
- temp(1:n-i) = bsmall(i,i+1:n)
- DO j = 1,i
- bsmall(i,j) = bsmall(i,j) + DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
- END DO
- DO j = i+1, n
- bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
- END DO
- END DO
- a_matrix(:,index) = bsmall ! output straightened columns of the inverse
- END SUBROUTINE Croutblasted
- ! ===== chat gpt
- subroutine crout_inverse_threaded3(A, n)
- USE kinds
- implicit none
- integer, intent(in) :: n
- REAL(RK8), intent(inout) :: A(n,n)
- real(RK8), allocatable :: L(:,:), U(:,:), Ainv(:,:)
- integer :: i, j, k
- real(RK8) :: sum_val, inv_Ljj
- real(RK8), parameter :: eps = 1.0e-12_RK8
- if (n.lt.500) then
- call Croutblasted(A,n)
- else
- ! Allocate aligned matrices
- allocate(L(n,n), source=0.0_RK8)
- allocate(U(n,n), source=0.0_RK8)
- allocate(Ainv(n,n), source=0.0_RK8)
- !$omp parallel do simd aligned(U:64)
- do i = 1, n
- U(i,i) = 1.0_RK8
- end do
- do j = 1, n
- ! Crout: compute L column
- !$omp parallel do private(i, k, sum_val) shared(A, L, U)
- do i = j, n
- sum_val = 0.0_RK8
- !$omp simd reduction(+:sum_val)
- do k = 1, j-1
- sum_val = sum_val + L(i,k) * U(k,j)
- end do
- L(i,j) = A(i,j) - sum_val
- end do
- !$omp end parallel do
- if (abs(L(j,j)) < eps * n * sqrt(real(n, RK8))) then
- error stop "Matrix is numerically singular in Crout LU"
- end if
- ! Crout: compute U row
- inv_Ljj = 1.0_RK8 / L(j,j)
- !$omp parallel do private(i, k, sum_val) shared(A, L, U)
- do i = j+1, n
- sum_val = 0.0_RK8
- !$omp simd reduction(+:sum_val)
- do k = 1, j-1
- sum_val = sum_val + L(j,k) * U(k,i)
- end do
- U(j,i) = (A(j,i) - sum_val) * inv_Ljj
- end do
- !$omp end parallel do
- end do
- ! Solve each column of inverse matrix in parallel
- !$omp parallel do
- do i = 1, n
- call solve_column(L, U, n, i, Ainv(:,i))
- end do
- !$omp end parallel do
- A = Ainv
- endif
- contains
- subroutine solve_column(L, U, n, col_idx, x)
- real(RK8), intent(in) :: L(n,n), U(n,n)
- integer, intent(in) :: n, col_idx
- real(RK8), intent(out) :: x(n)
- real(RK8) :: e(n), y(n)
- integer :: row, k
- real(RK8) :: sum_val
- e = 0.0_RK8
- e(col_idx) = 1.0_RK8
- ! Forward substitution: solve L y = e
- do row = 1, n
- sum_val = 0.0_RK8
- !$omp simd reduction(+:sum_val)
- do k = 1, row-1
- sum_val = sum_val + L(row,k) * y(k)
- end do
- y(row) = (e(row) - sum_val) / L(row,row)
- end do
- ! Backward substitution: solve U x = y
- do row = n, 1, -1
- sum_val = 0.0_RK8
- !$omp simd reduction(+:sum_val)
- do k = row+1, n
- sum_val = sum_val + U(row,k) * x(k)
- end do
- x(row) = (y(row) - sum_val) / U(row,row)
- end do
- end subroutine solve_column
- end subroutine crout_inverse_threaded3
- ! =====
- ! --------------------------------------------------------------------
- SUBROUTINE Lapack (n) ! Invert matrix by Lapack method
- ! --------------------------------------------------------------------
- use arrayMod
- IMPLICIT NONE
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- INTEGER :: n
- !REAL(KIND=8):: a(n,n)
- INTEGER :: ipiv(n)
- INTEGER :: info, lwork, ILAENV
- REAL(KIND=8), ALLOCATABLE :: work(:)
- lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
- ALLOCATE ( work(lwork) )
- CALL DGETRF( n, n, a3, n, ipiv, info )
- CALL DGETRI( n, a3, n, ipiv, work, lwork, info )
- DEALLOCATE ( work )
- END SUBROUTINE Lapack
- ! --------------------------------------------------------------------
- SUBROUTINE Lapack2 (n) ! Invert matrix by Lapack method using DGESV
- ! --------------------------------------------------------------------
- use arrayMod
- IMPLICIT NONE
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- INTEGER :: n
- !REAL(KIND=8):: a(n,n)
- !REAL(KIND=8), ALLOCATABLE :: B(:,:)
- INTEGER :: ipiv(n)
- INTEGER :: info !, lwork, ILAENV
- integer :: NRHS,i,lda,ldb
- !REAL(KIND=8), ALLOCATABLE :: work(:)
- !lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
- !ALLOCATE ( work(lwork) )
- ![A*B = X] all nxn if X = i B is inverse
- ! NRHS (input) INTEGER
- ! The number of right hand sides, i.e., the number of columns
- ! of the matrix B. NRHS >= 0.
- !* A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
- ! On entry, the N-by-N coefficient matrix A.
- ! On exit, the factors L and U from the factorization
- ! A = P*L*U; the unit diagonal elements of L are not stored.
- !* LDA (input) INTEGER
- ! The leading dimension of the array A. LDA >= max(1,N).
- ! IPIV (output) INTEGER array, dimension (N)
- ! The pivot indices that define the permutation matrix P;
- ! row i of the matrix was interchanged with row IPIV(i).
- ! B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
- ! On entry, the N-by-NRHS matrix of right hand side matrix B.
- ! On exit, if INFO = 0, the N-by-NRHS solution matrix X.
- ! LDB (input) INTEGER
- ! The leading dimension of the array B. LDB >= max(1,N).
- ! INFO (output) INTEGER
- ! = 0: successful exit
- ! < 0: if INFO = -i, the i-th argument had an illegal value
- ! > 0: if INFO = i, U(i,i) is exactly zero. The factorization
- NRHS = n
- lda = n
- !allocate (b(n,n)) ! b is already allocated in arrayMod
- b=0.d0
- do i=1,n
- b(i,i) = 1.d0 ! Set b to identity matrix
- enddo
- ldb = n
- call DGESV( N, NRHS, A3, LDA, IPIV, B, LDB, INFO )
- !DEALLOCATE ( work )
- a3 = b ! Resulting inverse is in b
- !deallocate (b) ! b is allocated globally, don't deallocate here
- END SUBROUTINE Lapack2
- ! --------------------------------------------------------------------
- SUBROUTINE DGEtrf2call (n) ! Invert matrix by Lapack method using DGETRF2
- ! --------------------------------------------------------------------
- use arrayMod
- IMPLICIT NONE
- integer :: smallsize,bigsize,smallits
- common smallsize,bigsize,smallits
- INTEGER :: n
- INTEGER :: ipiv(n)
- INTEGER :: info, lwork, ILAENV
- REAL(KIND=8), ALLOCATABLE :: work(:)
- lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
- ALLOCATE ( work(lwork) )
- CALL DGETRF2( n, n, a3, n, ipiv, info )
- CALL DGETRI( n, a3, n, ipiv, work, lwork, info )
- DEALLOCATE ( work )
- END SUBROUTINE DGEtrf2call
Advertisement
Comments
-
- Updated version of multithreading TEST_CPU Fortran.uk Polyhedron Fortran Benchmark : that single thread (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics) Implemented by: David Frank [email protected], original gauss Tim Prince [email protected], Crout James Van Buskirk [email protected], Jos Bergervoet [email protected]. NOW gauss single thread with looping repetition OpenMP, crout single thread with looping repetition OpenMP, crout multi thread, and implementing with vendor blas lapack instead building own. by default as same as phoroix is 1000 repetition dual inversing 250x250, and 2000x2000 dual inversing, and check the difference.
- Example: ryzen 3900x
- C:\Temp>test_fpu_gemini_mt5.exe
- Inside arraySub() allocating memory ...
- Memory allocation sufficient
- Benchmark running, hopefully as only ACTIVE task
- Number of OpenMP threads: 24
- Test1: Gauss mt- 1000 ( 250x 250) inverts in 0.723 seconds Err= 0.5971541960567150E-14
- Test2: Croutstmt- 1000 ( 250x 250) inverts in 0.865 seconds Err= 0.7541962840872367E-14
- Calling Croutmt- 1 Bigsize
- Calling Croutmt- 2 Bigsize
- Test3: Croutmt- 2 ( 2000x 2000) inverts in 3.016 seconds Err= 0.9010735252907198E-09
- Test4: DGETRF - 2 ( 2000x 2000) inverts in 0.294 seconds Err= 0.2360292208074361E-11
- Test5: DGETRF2- 2 ( 2000x 2000) inverts in 0.289 seconds Err= 0.2635024887233167E-11
- Test6: DGESV - 2 ( 2000x 2000) inverts in 0.199 seconds Err= 0.2345595889465859E-11
- Total = 5.4 sec
Add Comment
Please, Sign In to add comment
Advertisement