TEST_CPU Multithread Version of Polyhedron Benchmark

! --------------------------------------------------------------------
MODULE kinds
   INTEGER, PARAMETER :: RK8 = SELECTED_REAL_KIND(15, 300)
END MODULE kinds
! --------------------------------------------------------------------
module arrayMod
  ! BIG ARRAY
  REAL(KIND=8),dimension(:,:),allocatable :: pool3
  REAL(KIND=8),dimension(:,:),ALLOCATABLE :: a3,b  ! a3 working matrices
  ! small ARRAY pool (read-only in parallel small matrix tests)
  REAL(KIND=8),dimension(:,:,:),allocatable :: pool
end module arrayMod

module functioninv
use arrayMod
  REAL(KIND=8), dimension(:,:)  ,ALLOCATABLE ::  Ainv
  common /allocateds/ allocated
  integer allocated
  contains

  function inv(A) result(Ainv)
  REAL(KIND=8), dimension(:,:), intent(in) :: A
  REAL(KIND=8), dimension(:,:)  ,ALLOCATABLE ::  Ainv
  REAL(KIND=8), ALLOCATABLE :: work(:)
  integer, dimension(size(A,1)) :: ipiv   ! pivot indices
  integer :: n, info,stat

  ! Store A in Ainv to prevent it from being overwritten by LAPACK
  Ainv = A
  n = size(A,1)

  lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
  !write ( *, '(a,i6)' ) ' Optimized work matrix order = ', lwork
  allocate (work(lwork))
  ! DGETRF computes an LU factorization of a general M-by-N matrix A
  ! using partial pivoting with row interchanges.
  call DGETRF(n, n, Ainv, n, ipiv, info)

  if (info /= 0) then
     stop 'Matrix is numerically singular!'
  end if

  ! DGETRI computes the inverse of a matrix using the LU factorization
  ! computed by DGETRF.
  call DGETRI(n, Ainv, n, ipiv, work, lwork, info)

  if (info /= 0) then
     stop 'Matrix inversion failed!'
  end if
  deallocate (work)
  end function inv
end module functioninv


!------------
PROGRAM TEST_FPU_NOSTACK  ! A number-crunching benchmark using matrix inversion.
USE kinds         ! Implemented by:    David Frank  [email protected]
USE functioninv   ! Add for function Blas as example not as subroutine
USE arrayMod      ! For stack overflow BIG MATRIX
USE OMP_LIB       ! Use OpenMP library for multi-threading

IMPLICIT NONE     ! Gauss  routine by: Tim Prince   [email protected]
                  ! Crout  routine by: James Van Buskirk  [email protected]
                  ! Lapack routine by: Jos Bergervoet [email protected]


integer             :: smallsize,bigsize,smallits
common                smallsize,bigsize,smallits


! - - - local variables - - -

REAL(KIND=8) :: avg_err, dt(20),avg_err2
INTEGER :: i, n, t(8), clock1, clock2, rate,j,k

! Declarations moved to the declaration section
REAL(KIND=8), ALLOCATABLE :: a_local(:,:) ! Thread-private working matrix
REAL(KIND=8), ALLOCATABLE :: a_final_smallits(:,:) ! To store the result for pool(:,:,smallits)


smallsize=250 !250
smallits=1000 ! RJA initial values were 101 and 1000 !1000
bigsize=2000 ! RJA initial value was 1000 then 2000
Call arraySub


WRITE (*,*) ' Benchmark running, hopefully as only ACTIVE task'
WRITE (*,*) ' Number of OpenMP threads: ', omp_get_max_threads()


call arrayFILL

! - - - begin benchmark - - -

dt  = 0.d0
DO n = 1,9

   CALL SYSTEM_CLOCK (clock1,rate)  ! get benchmark (n) start time

   SELECT CASE (n)
   CASE (1)
      ! Test 1: Gauss - Multi-threaded over smallits, SIMD within Gauss
      ! Use OpenMP to parallelize the loop over 'smallits'
      ! Each thread will process a different matrix from the 'pool'

      ! Allocate a_final_smallits before the parallel region
      ALLOCATE(a_final_smallits(smallsize, smallsize))

      !$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits) DEFAULT(SHARED)
      DO i = 1,smallits
          ! a_local is declared outside and is PRIVATE, OpenMP handles allocation/deallocation per thread
          ALLOCATE(a_local(smallsize, smallsize)) ! Allocate thread-private a_local
          a_local = pool(:,:,i)         ! get next matrix to invert (thread-safe read from pool)

          CALL Gauss_blas(a_local, smallsize)  ! invert a_local
          CALL Gauss_blas(a_local, smallsize)  ! invert a_local back

          ! If this is the last matrix (index smallits), store its result
          IF (i == smallits) THEN
             a_final_smallits = a_local
          END IF
          DEALLOCATE(a_local) ! Deallocate thread-private a_local

      END DO
      !$OMP END PARALLEL DO

      ! Calculate error using the result for the matrix at index 'smallits'
      avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
      DEALLOCATE(a_final_smallits)


      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test1: Gauss1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds  Err=', avg_err


      CASE (2)
      ! Test 2: Crout - Multi-threaded over smallits, but serial within Crout
      ALLOCATE(a_final_smallits(smallsize, smallsize))

      ! Disable nested parallelism for this case
!$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits)
DO i = 1,smallits
  ALLOCATE(a_local(smallsize, smallsize))
  a_local = pool(:,:,i)

  CALL Croutblasted(a_local, smallsize)
  CALL Croutblasted(a_local, smallsize)

  IF (i == smallits) a_final_smallits = a_local

  DEALLOCATE(a_local)
END DO
!$OMP END PARALLEL DO

      ! Calculate error using the result for the matrix at index 'smallits'
      avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
      DEALLOCATE(a_final_smallits)

      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test2: CROUT1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds  Err=', avg_err

   CASE (3)

    if(bigsize.le.2500) then

      i=1
      call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix
      ! Note: Crout for bigsize is still serial in this implementation.
      ! Parallelizing Crout itself is more complex (like the Julia threaded solve).
      ! Adding SIMD directives to inner loops might offer some speedup.
      print *, 'Calling Croutmt- 1 Bigsize'
      CALL crout_inverse_threaded3(a3, bigsize)     ! invert a3
      print *, 'Calling Croutmt- 2 Bigsize'
      CALL crout_inverse_threaded3(a3, bigsize)     ! invert a3

      !call avgerr2(avg_err)
      avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test3: CROUTMT- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
    else
     print *, 'Calling Crout - Bgsize cancelled, too long to solve. Next: LAPACK Solver'
    endif
   CASE (4)

      i=1
      call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix to a3

      CALL Lapack (bigsize)    ! invert a3
      CALL Lapack (bigsize)    ! invert a3

      avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test4: DGETRF - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err

   CASE (5)

      i=1
      call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix  to a3
      CALL DGEtrf2call (bigsize)    ! invert a3
      CALL DGEtrf2call (bigsize)    ! invert a3

      avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test5: DGETRF2- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err

   CASE (6)

        i=1
        call FILL_a(i,bigsize)
        CALL Lapack2 (bigsize)
        CALL Lapack2 (bigsize)

      avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
      CALL SYSTEM_CLOCK (clock2,rate)
      dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
      ! Adjusted format for error
      WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
           "Test6: DGESV  - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err


    CASE (7)

    CASE (8)

    CASE (9)


   END SELECT


 END DO                         ! for test 1-4

  WRITE (*,92) '                             Total =',SUM(dt), ' sec'
  WRITE (*,*)

92 FORMAT (A,F5.1,A,F18.15)


   call arraygone()

END PROGRAM TEST_FPU_NOSTACK


! allocating
subroutine arraySub

   use arrayMod
   use functioninv
   implicit none

    integer             :: smallsize,bigsize,smallits
    common                 smallsize,bigsize,smallits
    integer             :: stat


   !write(*,*) 'Enter number of rows'
   !read *,row
   !write(*,*) 'Enter number of cols'
   !read *,col
   write(*,*) 'Inside arraySub() allocating memory ...'
   allocate (pool(smallsize,smallsize,smallits),stat=stat)
      if (stat/=0) stop 'Cannot allocate memory 1'
   allocate (pool3(bigsize,bigsize),stat=stat )
      if (stat/=0) stop 'Cannot allocate memory 2'
   allocate (a3(bigsize,bigsize),stat=stat  )
      if (stat/=0) stop 'Cannot allocate memory 4'
   allocate (b(bigsize,bigsize),stat=stat  )
      if (stat/=0) stop 'Cannot allocate memory 5'


   if (stat/=0) stop 'Cannot allocate memory'
   pool3 = 0.d0
   pool  = 0.d0
   ! a     = 0.d0 ! No longer global
   a3    = 0.d0


   print *, 'Memory allocation sufficient'
end subroutine arraySub

subroutine arraygone
   use arrayMod
   use functioninv
   implicit none

    integer             :: smallsize,bigsize,smallits
    common                 smallsize,bigsize,smallits
    integer             :: stat
    ! Deallocate only the globally allocated arrays
    deallocate (pool,pool3,a3,b)
    !deallocate (Ainv) ! Uncomment if Ainv was allocated globally

end subroutine arraygone

subroutine arrayFILL
   use arrayMod
   implicit none

   integer             :: smallsize,bigsize,smallits
   common                 smallsize,bigsize,smallits
   integer :: t(8)


   CALL DATE_AND_TIME ( values = t )
   CALL RANDOM_SEED()                ! set seed to random number based on time
   CALL RANDOM_NUMBER(pool)          ! fill pool with random data ( 0. -> 1. )
   CALL RANDOM_NUMBER(pool3)         ! fill pool with random data ( 0. -> 1. )

end subroutine arrayfill

subroutine FILL_a(i,n)

   use arrayMod
   implicit none
   integer             :: smallsize,bigsize,smallits
   common                 smallsize,bigsize,smallits
   integer             i,j,k,n

   ! This subroutine is primarily used for the big matrix case now.
   ! Small matrices are copied directly from 'pool' to thread-local arrays.
   if (n.eq.bigsize) then
    a3 = pool3
   endif

end subroutine  fill_a

! avgerr and avgerr2 are no longer needed as separate subroutines
! as the error calculation is done in the main program.
! SUBROUTINE avgerr(avg_err)
! use arrayMod
! IMPLICIT NONE
!  real*8 :: avg_err
! integer             :: smallsize,bigsize,smallits
! common                 smallsize,bigsize,smallits
! avg_err = SUM(ABS(a-pool(:,:,smallits)))/(smallsize*smallsize)   ! last matrix error
! end SUBROUTINE avgerr

! --------------------------------------------------------------------
SUBROUTINE Gauss (a_matrix, n)       ! Invert matrix by Gauss method
! Modified to accept matrix as argument and use thread-local bsmall
! --------------------------------------------------------------------
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert

! - - - Local Variables - - -
REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
REAL(KIND=8):: c, d, temp(n)
INTEGER :: i, j, k, m, imax(1), ipvt(n)
! - - - - - - - - - - - - - -

bsmall = a_matrix
ipvt = (/ (i, i = 1, n) /)

DO k = 1,n
   imax = MAXLOC(ABS(bsmall(k:n,k)))
   m = k-1+imax(1)

   IF (m /= k) THEN
      ipvt( (/m,k/) ) = ipvt( (/k,m/) )
      bsmall((/m,k/),:) = bsmall((/k,m/),:)
   END IF
   d = 1/bsmall(k,k)

   temp = bsmall(:,k)
   ! Apply SIMD directive to the inner loop for vectorization
   !$OMP SIMD
   DO j = 1, n
      c = bsmall(k,j)*d
      bsmall(:,j) = bsmall(:,j)-temp*c
      bsmall(k,j) = c
   END DO
   bsmall(:,k) = temp*(-d)
   bsmall(k,k) = d
END DO
a_matrix(:,ipvt) = bsmall
END SUBROUTINE Gauss


SUBROUTINE Gauss_BLAS(a_matrix, n)
  USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_DOUBLE
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: n
  REAL(C_DOUBLE), DIMENSION(n,n), INTENT(INOUT) :: a_matrix

  REAL(C_DOUBLE), DIMENSION(n,n) :: bsmall
  REAL(C_DOUBLE) :: c, d
  REAL(C_DOUBLE), DIMENSION(n) :: temp
  INTEGER :: i, j, k, m, imax(1), ipvt(n)

  bsmall = a_matrix
  ipvt = (/ (i, i = 1, n) /)

  DO k = 1, n
    imax = MAXLOC(ABS(bsmall(k:n,k)))
    m = k - 1 + imax(1)

    IF (m /= k) THEN
      ipvt((/m, k/)) = ipvt((/k, m/))
      bsmall((/m, k/), :) = bsmall((/k, m/), :)
    END IF

    d = 1.0D0 / bsmall(k,k)

    ! temp = bsmall(:,k)
    CALL DCOPY(n, bsmall(1,k), 1, temp, 1)

    !$OMP SIMD
    DO j = 1, n
      c = bsmall(k,j) * d
      ! bsmall(:,j) = bsmall(:,j) - temp * c
      CALL DAXPY(n, -c, temp, 1, bsmall(1,j), 1)
      bsmall(k,j) = c
    END DO

    ! bsmall(:,k) = temp * (-d)
    CALL DCOPY(n, temp, 1, bsmall(1,k), 1)
    CALL DSCAL(n, -d, bsmall(1,k), 1)

    bsmall(k,k) = d
  END DO

  ! Apply column permutation
  a_matrix(:, ipvt) = bsmall
END SUBROUTINE Gauss_BLAS


! -------------------------------------------------------------------
SUBROUTINE Crout (a_matrix, n)      ! Invert matrix by Crout method not using BLAS
! Modified to accept matrix as argument and use thread-local bsmall
! Optimized with SIMD directives where applicable.
! This version is for small matrices processed in parallel.
! -------------------------------------------------------------------
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert

! - - - Local Variables - - -
REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
INTEGER :: i, j, m, imax(1)      ! Current row & column, max pivot loc
INTEGER :: index(n)              ! Partial pivot record
REAL(KIND=8) :: temp(n)     ! working arrays, temp


index = (/(i,i=1,n)/)        ! initialize column index

DO j = 1, n        ! Shuffle matrix a -> bsmall
   DO i = 1, j-1
      bsmall(i, j) = a_matrix(i, j)
   END DO
   DO i = j, n
      bsmall(i, j) = a_matrix(n+1-j, i+1-j)
   END DO
END DO

DO j = 1, n   ! LU decomposition; reciprocals of diagonal elements in L matrix

   DO i = j, n    ! Get current column of L matrix
       ! DOT_PRODUCT is usually auto-vectorized. Explicit SIMD might not be needed or helpful.
       bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i)-DOT_PRODUCT(bsmall(n+1-i:n-i+j-1,n+1-i), bsmall(1:j-1,j))
   END DO

   imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))

   m = imax(1)

   bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)

   IF (m /= n+1-j) THEN   ! Swap biggest element to current pivot position
      index((/j,n+1-m/))     = index((/n+1-m,j/))

      bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
      temp(1:n+1-m)          = bsmall(m:n, m)
      bsmall(m:j-1+m, m)          = bsmall(n+1-j:n, n+1-j)
      bsmall(j+m:n, m)            = bsmall(j, j+1:n+1-m)
      bsmall(n+1-j:n, n+1-j)      = temp(1:j)
      bsmall(j, j+1:n+1-m)        = temp(j+1:n+1-m)
   END IF

   DO i = j+1, n   ! Get current row of U matrix
        ! DOT_PRODUCT is usually auto-vectorized.
        bsmall(j,i) =  bsmall(n,n+1-j)*( bsmall(j,i)-DOT_PRODUCT( bsmall(n+1-j:n-1,n+1-j), bsmall(1:j-1,i)))
   END DO
END DO

DO j = 1, n-1     ! Invert L matrix
    temp(1) = bsmall(n, n+1-j)
   DO i = j+1, n
     ! DOT_PRODUCT is usually auto-vectorized.
     bsmall(n-i+j,n+1-i) = -DOT_PRODUCT(bsmall(n-i+j:n-1,n+1-i),temp(1:i-j))*bsmall(n,n+1-i)
     temp(i-j+1) = bsmall(n-i+j,n+1-i)
   END DO
END DO

DO i = 1, (n+1)/3      ! Reshuffle matrix
       temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
   DO j = 2*i, n+1-i
       bsmall(j, i) = bsmall(n+i-j, n+1-j)
   END DO
   DO j = i, n+1-2*i
      bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
   END DO
   bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
END DO

DO i = 1, n-1      ! Invert U matrix
   DO j = i+1, n
      ! DOT_PRODUCT is usually auto-vectorized.
      bsmall(i,j) = -bsmall(i,j)-DOT_PRODUCT(temp(1:j-i-1), bsmall(i+1:j-1,j))
      temp(j-i) = bsmall(i,j)
   END DO
END DO

DO i = 1, n-1      ! Multiply inverses in reverse order
   temp(1:n-i) = bsmall(i,i+1:n)
   ! Apply SIMD directive to the inner loops for vectorization
   !$OMP SIMD
   DO j = 1,i
      ! DOT_PRODUCT is usually auto-vectorized.
      bsmall(i,j) = bsmall(i,j)+DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
   END DO
   !$OMP SIMD
   DO j = i+1, n
      bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
   END DO
END DO

a_matrix(:,index) = bsmall    ! output straightened columns of the inverse

END SUBROUTINE Crout

! -------------------------------------------------------------------
SUBROUTINE Croutblasted (a_matrix, n)      ! Invert matrix by Crout method
  USE kinds
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: n
  REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix

  ! BLAS Interfaces
  INTERFACE
    SUBROUTINE DGEMV(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY)
      USE kinds
      CHARACTER(LEN=1), INTENT(IN) :: TRANS
      INTEGER, INTENT(IN) :: M, N, LDA, INCX, INCY
      REAL(KIND=8), INTENT(IN) :: ALPHA, BETA
      REAL(KIND=8), INTENT(IN) :: A(LDA,*), X(*)
      REAL(KIND=8), INTENT(INOUT) :: Y(*)
    END SUBROUTINE DGEMV

    REAL(KIND=8) FUNCTION DDOT(N, DX, INCX, DY, INCY)
      USE kinds
      INTEGER, INTENT(IN) :: N, INCX, INCY
      REAL(KIND=8), INTENT(IN) :: DX(*), DY(*)
    END FUNCTION DDOT

    SUBROUTINE DSWAP(N, DX, INCX, DY, INCY)
      USE kinds
      INTEGER, INTENT(IN) :: N, INCX, INCY
      REAL(KIND=8), INTENT(INOUT) :: DX(*), DY(*)
    END SUBROUTINE DSWAP
  END INTERFACE

  ! Constants
  REAL(KIND=8), PARAMETER :: one = 1.0_RK8
  REAL(KIND=8), PARAMETER :: zero = 0.0_RK8

  ! Rest of your implementation...
  REAL(KIND=8), DIMENSION(n,n) :: bsmall
INTEGER :: i, j, m, imax(1)      ! Current row & column, max pivot loc
INTEGER :: index(n)              ! Partial pivot record
REAL(KIND=8) :: temp(n)     ! working arrays, temp


  !!!! STEP 1: Matrix Shuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
  index = (/(i,i=1,n)/)
  DO j = 1, n
     DO i = 1, j-1
        bsmall(i, j) = a_matrix(i, j)
     END DO
     DO i = j, n
        bsmall(i, j) = a_matrix(n+1-j, i+1-j)
     END DO
  END DO


DO j = 1, n   ! LU decomposition; reciprocals of diagonal elements in L matrix
     !!!! STEP 2: LU Decomposition - PARTIALLY OPTIMIZED
     !!!! DONE: L column computation with DDOT
     DO i = j, n
        bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i) - &
             DDOT(j-1, bsmall(n+1-i:n-i+j-1,n+1-i), 1, bsmall(1:j-1,j), 1)
     END DO


   imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))

   m = imax(1)

   bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)

   IF (m /= n+1-j) THEN   ! Swap biggest element to current pivot position
      index((/j,n+1-m/))     = index((/n+1-m,j/))

      bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
      temp(1:n+1-m)          = bsmall(m:n, m)
      bsmall(m:j-1+m, m)          = bsmall(n+1-j:n, n+1-j)
      bsmall(j+m:n, m)            = bsmall(j, j+1:n+1-m)
      bsmall(n+1-j:n, n+1-j)      = temp(1:j)
      bsmall(j, j+1:n+1-m)        = temp(j+1:n+1-m)
   END IF

     !!!! TODO: U row computation - Current uses DOT_PRODUCT
     ! Potential optimization: Replace with DDOT
    DO i = j+1, n
        bsmall(j,i) = bsmall(n,n+1-j)*(bsmall(j,i) - &
        DDOT(j-1, bsmall(n+1-j:n-1,n+1-j), 1, bsmall(1:j-1,i), 1))
    END DO
END DO

  !!!! STEP 3: L Inversion - DONE with DDOT
  DO j = 1, n-1
     temp(1) = bsmall(n, n+1-j)
     DO i = j+1, n
        bsmall(n-i+j,n+1-i) = -DDOT(i-j, bsmall(n-i+j:n-1,n+1-i), 1, temp(1), 1) * bsmall(n,n+1-i)
        temp(i-j+1) = bsmall(n-i+j,n+1-i)
     END DO
  END DO


  !!!! STEP 4: Matrix Reshuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
  DO i = 1, (n+1)/3
     temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
     DO j = 2*i, n+1-i
        bsmall(j, i) = bsmall(n+i-j, n+1-j)
     END DO
     DO j = i, n+1-2*i
        bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
     END DO
     bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
  END DO

  !!!! STEP 5: U Inversion - DONE with DDOT
  DO i = 1, n-1
     DO j = i+1, n
        bsmall(i,j) = -bsmall(i,j) - DDOT(j-i-1, temp(1), 1, bsmall(i+1,j), 1)
        temp(j-i) = bsmall(i,j)
     END DO
  END DO

  !!!! STEP 6: Final Multiplication - TODO: Current uses DOT_PRODUCT
  ! Potential optimization: Replace with DGEMV
  DO i = 1, n-1
     temp(1:n-i) = bsmall(i,i+1:n)
     DO j = 1,i
        bsmall(i,j) = bsmall(i,j) + DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
     END DO
     DO j = i+1, n
        bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
     END DO
  END DO


a_matrix(:,index) = bsmall    ! output straightened columns of the inverse

END SUBROUTINE Croutblasted


! ===== chat gpt
subroutine crout_inverse_threaded3(A, n)
  USE kinds
  implicit none
  integer, intent(in) :: n

  REAL(RK8), intent(inout) :: A(n,n)


  real(RK8), allocatable :: L(:,:), U(:,:), Ainv(:,:)
  integer :: i, j, k
  real(RK8) :: sum_val, inv_Ljj
  real(RK8), parameter :: eps = 1.0e-12_RK8

  if (n.lt.500) then
  call Croutblasted(A,n)
  else
  ! Allocate aligned matrices
  allocate(L(n,n), source=0.0_RK8)
  allocate(U(n,n), source=0.0_RK8)
  allocate(Ainv(n,n), source=0.0_RK8)

  !$omp parallel do simd aligned(U:64)
  do i = 1, n
    U(i,i) = 1.0_RK8
  end do

  do j = 1, n
    ! Crout: compute L column
    !$omp parallel do private(i, k, sum_val) shared(A, L, U)
    do i = j, n
      sum_val = 0.0_RK8
      !$omp simd reduction(+:sum_val)
      do k = 1, j-1
        sum_val = sum_val + L(i,k) * U(k,j)
      end do
      L(i,j) = A(i,j) - sum_val
    end do
    !$omp end parallel do

    if (abs(L(j,j)) < eps * n * sqrt(real(n, RK8))) then
      error stop "Matrix is numerically singular in Crout LU"
    end if

    ! Crout: compute U row
    inv_Ljj = 1.0_RK8 / L(j,j)
    !$omp parallel do private(i, k, sum_val) shared(A, L, U)
    do i = j+1, n
      sum_val = 0.0_RK8
      !$omp simd reduction(+:sum_val)
      do k = 1, j-1
        sum_val = sum_val + L(j,k) * U(k,i)
      end do
      U(j,i) = (A(j,i) - sum_val) * inv_Ljj
    end do
    !$omp end parallel do
  end do

  ! Solve each column of inverse matrix in parallel
  !$omp parallel do
  do i = 1, n
    call solve_column(L, U, n, i, Ainv(:,i))
  end do
  !$omp end parallel do

  A = Ainv
  endif

contains

  subroutine solve_column(L, U, n, col_idx, x)
    real(RK8), intent(in) :: L(n,n), U(n,n)
    integer, intent(in) :: n, col_idx
    real(RK8), intent(out) :: x(n)
    real(RK8) :: e(n), y(n)
    integer :: row, k
    real(RK8) :: sum_val

    e = 0.0_RK8
    e(col_idx) = 1.0_RK8

    ! Forward substitution: solve L y = e
    do row = 1, n
      sum_val = 0.0_RK8
      !$omp simd reduction(+:sum_val)
      do k = 1, row-1
        sum_val = sum_val + L(row,k) * y(k)
      end do
      y(row) = (e(row) - sum_val) / L(row,row)
    end do

    ! Backward substitution: solve U x = y
    do row = n, 1, -1
      sum_val = 0.0_RK8
      !$omp simd reduction(+:sum_val)
      do k = row+1, n
        sum_val = sum_val + U(row,k) * x(k)
      end do
      x(row) = (y(row) - sum_val) / U(row,row)
    end do
  end subroutine solve_column

end subroutine crout_inverse_threaded3


! =====


! --------------------------------------------------------------------
SUBROUTINE Lapack (n)     ! Invert matrix by Lapack method
! --------------------------------------------------------------------
use arrayMod
IMPLICIT NONE
integer             :: smallsize,bigsize,smallits
common                 smallsize,bigsize,smallits
INTEGER   :: n
!REAL(KIND=8):: a(n,n)


INTEGER :: ipiv(n)
INTEGER :: info, lwork, ILAENV
REAL(KIND=8), ALLOCATABLE :: work(:)

lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
ALLOCATE ( work(lwork) )

CALL DGETRF( n, n, a3, n, ipiv, info )
CALL DGETRI( n, a3, n, ipiv, work, lwork, info )

DEALLOCATE ( work )

END SUBROUTINE Lapack

! --------------------------------------------------------------------
SUBROUTINE Lapack2 (n)     ! Invert matrix by Lapack method using DGESV
! --------------------------------------------------------------------
use arrayMod
IMPLICIT NONE
integer             :: smallsize,bigsize,smallits
common                 smallsize,bigsize,smallits

INTEGER   :: n
!REAL(KIND=8):: a(n,n)
!REAL(KIND=8), ALLOCATABLE :: B(:,:)

INTEGER :: ipiv(n)
INTEGER :: info !, lwork, ILAENV
integer :: NRHS,i,lda,ldb
!REAL(KIND=8), ALLOCATABLE :: work(:)

!lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
!ALLOCATE ( work(lwork) )


![A*B = X] all nxn if X = i B is inverse
!  NRHS    (input) INTEGER
!          The number of right hand sides, i.e., the number of columns
!          of the matrix B.  NRHS >= 0.
!* A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
!          On entry, the N-by-N coefficient matrix A.
!          On exit, the factors L and U from the factorization
!          A = P*L*U; the unit diagonal elements of L are not stored.
!* LDA     (input) INTEGER
!          The leading dimension of the array A.  LDA >= max(1,N).
!  IPIV    (output) INTEGER array, dimension (N)
!          The pivot indices that define the permutation matrix P;
!          row i of the matrix was interchanged with row IPIV(i).
!  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
!          On entry, the N-by-NRHS matrix of right hand side matrix B.
!          On exit, if INFO = 0, the N-by-NRHS solution matrix X.
!  LDB     (input) INTEGER
!          The leading dimension of the array B.  LDB >= max(1,N).
!  INFO    (output) INTEGER
!          = 0:  successful exit
!         < 0:  if INFO = -i, the i-th argument had an illegal value
!          > 0:  if INFO = i, U(i,i) is exactly zero.  The factorization

NRHS = n
lda  = n
!allocate (b(n,n)) ! b is already allocated in arrayMod
b=0.d0
do i=1,n
    b(i,i) = 1.d0 ! Set b to identity matrix
enddo
ldb = n


call DGESV( N, NRHS, A3, LDA, IPIV, B, LDB, INFO )


!DEALLOCATE ( work )
a3 = b ! Resulting inverse is in b
!deallocate (b) ! b is allocated globally, don't deallocate here
END SUBROUTINE Lapack2

! --------------------------------------------------------------------
SUBROUTINE DGEtrf2call  (n)     ! Invert matrix by Lapack method using DGETRF2
! --------------------------------------------------------------------
 use arrayMod
 IMPLICIT NONE
 integer             :: smallsize,bigsize,smallits
 common                 smallsize,bigsize,smallits

 INTEGER   :: n


 INTEGER :: ipiv(n)
 INTEGER :: info, lwork, ILAENV
 REAL(KIND=8), ALLOCATABLE :: work(:)

 lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
 ALLOCATE ( work(lwork) )

 CALL DGETRF2( n, n, a3, n, ipiv, info )
 CALL DGETRI( n, a3, n, ipiv, work, lwork, info )

 DEALLOCATE ( work )

END SUBROUTINE DGEtrf2call

Updated version of multithreading TEST_CPU Fortran.uk Polyhedron Fortran Benchmark : that single thread (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics) Implemented by: David Frank  [email protected], original gauss Tim Prince   [email protected], Crout James Van Buskirk  [email protected], Jos Bergervoet [email protected]. NOW gauss single thread with looping repetition OpenMP, crout single thread with looping repetition OpenMP, crout multi thread, and implementing with vendor blas lapack instead building own. by default as same as phoroix is 1000 repetition dual inversing 250x250, and 2000x2000 dual inversing, and check the difference.

Example: ryzen 3900x
C:\Temp>test_fpu_gemini_mt5.exe
 Inside arraySub() allocating memory ...
 Memory allocation sufficient
  Benchmark running, hopefully as only ACTIVE task
  Number of OpenMP threads:           24
Test1: Gauss  mt- 1000 (   250x   250) inverts in   0.723 seconds  Err=  0.5971541960567150E-14
Test2: Croutstmt- 1000 (   250x   250) inverts in   0.865 seconds  Err=  0.7541962840872367E-14
 Calling Croutmt- 1 Bigsize
 Calling Croutmt- 2 Bigsize
Test3: Croutmt-    2 (  2000x  2000) inverts in   3.016 seconds  Err=  0.9010735252907198E-09
Test4: DGETRF -    2 (  2000x  2000) inverts in   0.294 seconds  Err=  0.2360292208074361E-11
Test5: DGETRF2-    2 (  2000x  2000) inverts in   0.289 seconds  Err=  0.2635024887233167E-11
Test6: DGESV  -    2 (  2000x  2000) inverts in   0.199 seconds  Err=  0.2345595889465859E-11
                             Total =  5.4 sec