Advertisement
realbabilu

TEST_CPU Multithread Version of Polyhedron Benchmark

May 9th, 2025
40
0
Never
1
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Fortran 27.82 KB | Source Code | 0 0
  1. ! --------------------------------------------------------------------
  2. MODULE kinds
  3.    INTEGER, PARAMETER :: RK8 = SELECTED_REAL_KIND(15, 300)
  4. END MODULE kinds
  5. ! --------------------------------------------------------------------
  6. module arrayMod
  7.   ! BIG ARRAY
  8.   REAL(KIND=8),dimension(:,:),allocatable :: pool3
  9.   REAL(KIND=8),dimension(:,:),ALLOCATABLE :: a3,b  ! a3 working matrices
  10.   ! small ARRAY pool (read-only in parallel small matrix tests)
  11.   REAL(KIND=8),dimension(:,:,:),allocatable :: pool
  12. end module arrayMod
  13.  
  14. module functioninv
  15. use arrayMod
  16.   REAL(KIND=8), dimension(:,:)  ,ALLOCATABLE ::  Ainv
  17.   common /allocateds/ allocated
  18.   integer allocated
  19.   contains
  20.  
  21.   function inv(A) result(Ainv)
  22.   REAL(KIND=8), dimension(:,:), intent(in) :: A
  23.   REAL(KIND=8), dimension(:,:)  ,ALLOCATABLE ::  Ainv
  24.   REAL(KIND=8), ALLOCATABLE :: work(:)
  25.   integer, dimension(size(A,1)) :: ipiv   ! pivot indices
  26.   integer :: n, info,stat
  27.  
  28.   ! Store A in Ainv to prevent it from being overwritten by LAPACK
  29.   Ainv = A
  30.   n = size(A,1)
  31.  
  32.   lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
  33.   !write ( *, '(a,i6)' ) ' Optimized work matrix order = ', lwork
  34.   allocate (work(lwork))
  35.   ! DGETRF computes an LU factorization of a general M-by-N matrix A
  36.   ! using partial pivoting with row interchanges.
  37.   call DGETRF(n, n, Ainv, n, ipiv, info)
  38.  
  39.   if (info /= 0) then
  40.      stop 'Matrix is numerically singular!'
  41.   end if
  42.  
  43.   ! DGETRI computes the inverse of a matrix using the LU factorization
  44.   ! computed by DGETRF.
  45.   call DGETRI(n, Ainv, n, ipiv, work, lwork, info)
  46.  
  47.   if (info /= 0) then
  48.      stop 'Matrix inversion failed!'
  49.   end if
  50.   deallocate (work)
  51.   end function inv
  52. end module functioninv
  53.  
  54.  
  55. !------------
  56. PROGRAM TEST_FPU_NOSTACK  ! A number-crunching benchmark using matrix inversion.
  57. USE kinds         ! Implemented by:    David Frank  [email protected]
  58. USE functioninv   ! Add for function Blas as example not as subroutine
  59. USE arrayMod      ! For stack overflow BIG MATRIX
  60. USE OMP_LIB       ! Use OpenMP library for multi-threading
  61.  
  62. IMPLICIT NONE     ! Gauss  routine by: Tim Prince   [email protected]
  63.                   ! Crout  routine by: James Van Buskirk  [email protected]
  64.                   ! Lapack routine by: Jos Bergervoet [email protected]
  65.  
  66.  
  67. integer             :: smallsize,bigsize,smallits
  68. common                smallsize,bigsize,smallits
  69.  
  70.  
  71. ! - - - local variables - - -
  72.  
  73. REAL(KIND=8) :: avg_err, dt(20),avg_err2
  74. INTEGER :: i, n, t(8), clock1, clock2, rate,j,k
  75.  
  76. ! Declarations moved to the declaration section
  77. REAL(KIND=8), ALLOCATABLE :: a_local(:,:) ! Thread-private working matrix
  78. REAL(KIND=8), ALLOCATABLE :: a_final_smallits(:,:) ! To store the result for pool(:,:,smallits)
  79.  
  80.  
  81. smallsize=250 !250
  82. smallits=1000 ! RJA initial values were 101 and 1000 !1000
  83. bigsize=2000 ! RJA initial value was 1000 then 2000
  84. Call arraySub
  85.  
  86.  
  87. WRITE (*,*) ' Benchmark running, hopefully as only ACTIVE task'
  88. WRITE (*,*) ' Number of OpenMP threads: ', omp_get_max_threads()
  89.  
  90.  
  91. call arrayFILL
  92.  
  93. ! - - - begin benchmark - - -
  94.  
  95. dt  = 0.d0
  96. DO n = 1,9
  97.  
  98.    CALL SYSTEM_CLOCK (clock1,rate)  ! get benchmark (n) start time
  99.  
  100.    SELECT CASE (n)
  101.    CASE (1)
  102.       ! Test 1: Gauss - Multi-threaded over smallits, SIMD within Gauss
  103.       ! Use OpenMP to parallelize the loop over 'smallits'
  104.       ! Each thread will process a different matrix from the 'pool'
  105.  
  106.       ! Allocate a_final_smallits before the parallel region
  107.       ALLOCATE(a_final_smallits(smallsize, smallsize))
  108.  
  109.       !$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits) DEFAULT(SHARED)
  110.       DO i = 1,smallits
  111.           ! a_local is declared outside and is PRIVATE, OpenMP handles allocation/deallocation per thread
  112.           ALLOCATE(a_local(smallsize, smallsize)) ! Allocate thread-private a_local
  113.           a_local = pool(:,:,i)         ! get next matrix to invert (thread-safe read from pool)
  114.  
  115.           CALL Gauss_blas(a_local, smallsize)  ! invert a_local
  116.           CALL Gauss_blas(a_local, smallsize)  ! invert a_local back
  117.  
  118.           ! If this is the last matrix (index smallits), store its result
  119.           IF (i == smallits) THEN
  120.              a_final_smallits = a_local
  121.           END IF
  122.           DEALLOCATE(a_local) ! Deallocate thread-private a_local
  123.  
  124.       END DO
  125.       !$OMP END PARALLEL DO
  126.  
  127.       ! Calculate error using the result for the matrix at index 'smallits'
  128.       avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
  129.       DEALLOCATE(a_final_smallits)
  130.  
  131.  
  132.       CALL SYSTEM_CLOCK (clock2,rate)
  133.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  134.       ! Adjusted format for error
  135.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  136.            "Test1: Gauss1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  137.  
  138.  
  139.       CASE (2)
  140.       ! Test 2: Crout - Multi-threaded over smallits, but serial within Crout
  141.       ALLOCATE(a_final_smallits(smallsize, smallsize))
  142.  
  143.       ! Disable nested parallelism for this case
  144. !$OMP PARALLEL DO PRIVATE(i, a_local) SHARED(pool, smallsize, smallits, a_final_smallits)
  145. DO i = 1,smallits
  146.   ALLOCATE(a_local(smallsize, smallsize))
  147.   a_local = pool(:,:,i)
  148.  
  149.   CALL Croutblasted(a_local, smallsize)
  150.   CALL Croutblasted(a_local, smallsize)
  151.  
  152.   IF (i == smallits) a_final_smallits = a_local
  153.  
  154.   DEALLOCATE(a_local)
  155. END DO
  156. !$OMP END PARALLEL DO
  157.  
  158.       ! Calculate error using the result for the matrix at index 'smallits'
  159.       avg_err = SUM(ABS(a_final_smallits - pool(:,:,smallits)))/(smallsize*smallsize)
  160.       DEALLOCATE(a_final_smallits)
  161.  
  162.       CALL SYSTEM_CLOCK (clock2,rate)
  163.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  164.       ! Adjusted format for error
  165.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  166.            "Test2: CROUT1OMP- ", smallits," (",smallsize,"x",smallsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  167.  
  168.    CASE (3)
  169.  
  170.     if(bigsize.le.2500) then
  171.  
  172.       i=1
  173.       call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix
  174.       ! Note: Crout for bigsize is still serial in this implementation.
  175.       ! Parallelizing Crout itself is more complex (like the Julia threaded solve).
  176.       ! Adding SIMD directives to inner loops might offer some speedup.
  177.       print *, 'Calling Croutmt- 1 Bigsize'
  178.       CALL crout_inverse_threaded3(a3, bigsize)     ! invert a3
  179.       print *, 'Calling Croutmt- 2 Bigsize'
  180.       CALL crout_inverse_threaded3(a3, bigsize)     ! invert a3
  181.  
  182.       !call avgerr2(avg_err)
  183.       avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
  184.       CALL SYSTEM_CLOCK (clock2,rate)
  185.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  186.       ! Adjusted format for error
  187.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  188.            "Test3: CROUTMT- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  189.     else
  190.      print *, 'Calling Crout - Bgsize cancelled, too long to solve. Next: LAPACK Solver'
  191.     endif
  192.    CASE (4)
  193.  
  194.       i=1
  195.       call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix to a3
  196.  
  197.       CALL Lapack (bigsize)    ! invert a3
  198.       CALL Lapack (bigsize)    ! invert a3
  199.  
  200.       avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
  201.       CALL SYSTEM_CLOCK (clock2,rate)
  202.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  203.       ! Adjusted format for error
  204.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  205.            "Test4: DGETRF - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  206.  
  207.    CASE (5)
  208.  
  209.       i=1
  210.       call FILL_a(i,bigsize)       ! get bigsize x bigsize matrix  to a3
  211.       CALL DGEtrf2call (bigsize)    ! invert a3
  212.       CALL DGEtrf2call (bigsize)    ! invert a3
  213.  
  214.       avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
  215.       CALL SYSTEM_CLOCK (clock2,rate)
  216.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  217.       ! Adjusted format for error
  218.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  219.            "Test5: DGETRF2- ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  220.  
  221.    CASE (6)
  222.  
  223.         i=1
  224.         call FILL_a(i,bigsize)
  225.         CALL Lapack2 (bigsize)
  226.         CALL Lapack2 (bigsize)
  227.  
  228.       avg_err = SUM(ABS(a3-pool3))/(bigsize*bigsize)    ! invert err.
  229.       CALL SYSTEM_CLOCK (clock2,rate)
  230.       dt(n) = (clock2-clock1)/DBLE(rate)  ! get benchmark (n) elapsed sec.
  231.       ! Adjusted format for error
  232.       WRITE (*,FMT='(A,I4,A,I6,A,I6,A,F7.3,A,E24.16)') &
  233.            "Test6: DGESV  - ", 2," (",bigsize,"x",bigsize,") inverts in ", dt(n), ' seconds  Err=', avg_err
  234.  
  235.  
  236.     CASE (7)
  237.  
  238.     CASE (8)
  239.  
  240.     CASE (9)
  241.  
  242.  
  243.  
  244.    END SELECT
  245.  
  246.  
  247.  
  248.  END DO                         ! for test 1-4
  249.  
  250.   WRITE (*,92) '                             Total =',SUM(dt), ' sec'
  251.   WRITE (*,*)
  252.  
  253. 92 FORMAT (A,F5.1,A,F18.15)
  254.  
  255.  
  256.    call arraygone()
  257.  
  258. END PROGRAM TEST_FPU_NOSTACK
  259.  
  260.  
  261.  
  262. ! allocating
  263. subroutine arraySub
  264.  
  265.    use arrayMod
  266.    use functioninv
  267.    implicit none
  268.  
  269.     integer             :: smallsize,bigsize,smallits
  270.     common                 smallsize,bigsize,smallits
  271.     integer             :: stat
  272.  
  273.  
  274.    !write(*,*) 'Enter number of rows'
  275.    !read *,row
  276.    !write(*,*) 'Enter number of cols'
  277.    !read *,col
  278.    write(*,*) 'Inside arraySub() allocating memory ...'
  279.    allocate (pool(smallsize,smallsize,smallits),stat=stat)
  280.       if (stat/=0) stop 'Cannot allocate memory 1'
  281.    allocate (pool3(bigsize,bigsize),stat=stat )
  282.       if (stat/=0) stop 'Cannot allocate memory 2'
  283.    allocate (a3(bigsize,bigsize),stat=stat  )
  284.       if (stat/=0) stop 'Cannot allocate memory 4'
  285.    allocate (b(bigsize,bigsize),stat=stat  )
  286.       if (stat/=0) stop 'Cannot allocate memory 5'
  287.  
  288.  
  289.  
  290.    if (stat/=0) stop 'Cannot allocate memory'
  291.    pool3 = 0.d0
  292.    pool  = 0.d0
  293.    ! a     = 0.d0 ! No longer global
  294.    a3    = 0.d0
  295.  
  296.  
  297.    print *, 'Memory allocation sufficient'
  298. end subroutine arraySub
  299.  
  300. subroutine arraygone
  301.    use arrayMod
  302.    use functioninv
  303.    implicit none
  304.  
  305.     integer             :: smallsize,bigsize,smallits
  306.     common                 smallsize,bigsize,smallits
  307.     integer             :: stat
  308.     ! Deallocate only the globally allocated arrays
  309.     deallocate (pool,pool3,a3,b)
  310.     !deallocate (Ainv) ! Uncomment if Ainv was allocated globally
  311.  
  312. end subroutine arraygone
  313.  
  314. subroutine arrayFILL
  315.    use arrayMod
  316.    implicit none
  317.  
  318.    integer             :: smallsize,bigsize,smallits
  319.    common                 smallsize,bigsize,smallits
  320.    integer :: t(8)
  321.  
  322.  
  323.    CALL DATE_AND_TIME ( values = t )
  324.    CALL RANDOM_SEED()                ! set seed to random number based on time
  325.    CALL RANDOM_NUMBER(pool)          ! fill pool with random data ( 0. -> 1. )
  326.    CALL RANDOM_NUMBER(pool3)         ! fill pool with random data ( 0. -> 1. )
  327.  
  328. end subroutine arrayfill
  329.  
  330. subroutine FILL_a(i,n)
  331.  
  332.    use arrayMod
  333.    implicit none
  334.    integer             :: smallsize,bigsize,smallits
  335.    common                 smallsize,bigsize,smallits
  336.    integer             i,j,k,n
  337.  
  338.    ! This subroutine is primarily used for the big matrix case now.
  339.    ! Small matrices are copied directly from 'pool' to thread-local arrays.
  340.    if (n.eq.bigsize) then
  341.     a3 = pool3
  342.    endif
  343.  
  344. end subroutine  fill_a
  345.  
  346. ! avgerr and avgerr2 are no longer needed as separate subroutines
  347. ! as the error calculation is done in the main program.
  348. ! SUBROUTINE avgerr(avg_err)
  349. ! use arrayMod
  350. ! IMPLICIT NONE
  351. !  real*8 :: avg_err
  352. ! integer             :: smallsize,bigsize,smallits
  353. ! common                 smallsize,bigsize,smallits
  354. ! avg_err = SUM(ABS(a-pool(:,:,smallits)))/(smallsize*smallsize)   ! last matrix error
  355. ! end SUBROUTINE avgerr
  356.  
  357. ! --------------------------------------------------------------------
  358. SUBROUTINE Gauss (a_matrix, n)       ! Invert matrix by Gauss method
  359. ! Modified to accept matrix as argument and use thread-local bsmall
  360. ! --------------------------------------------------------------------
  361. IMPLICIT NONE
  362. INTEGER, INTENT(IN) :: n
  363. REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert
  364.  
  365. ! - - - Local Variables - - -
  366. REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
  367. REAL(KIND=8):: c, d, temp(n)
  368. INTEGER :: i, j, k, m, imax(1), ipvt(n)
  369. ! - - - - - - - - - - - - - -
  370.  
  371. bsmall = a_matrix
  372. ipvt = (/ (i, i = 1, n) /)
  373.  
  374. DO k = 1,n
  375.    imax = MAXLOC(ABS(bsmall(k:n,k)))
  376.    m = k-1+imax(1)
  377.  
  378.    IF (m /= k) THEN
  379.       ipvt( (/m,k/) ) = ipvt( (/k,m/) )
  380.       bsmall((/m,k/),:) = bsmall((/k,m/),:)
  381.    END IF
  382.    d = 1/bsmall(k,k)
  383.  
  384.    temp = bsmall(:,k)
  385.    ! Apply SIMD directive to the inner loop for vectorization
  386.    !$OMP SIMD
  387.    DO j = 1, n
  388.       c = bsmall(k,j)*d
  389.       bsmall(:,j) = bsmall(:,j)-temp*c
  390.       bsmall(k,j) = c
  391.    END DO
  392.    bsmall(:,k) = temp*(-d)
  393.    bsmall(k,k) = d
  394. END DO
  395. a_matrix(:,ipvt) = bsmall
  396. END SUBROUTINE Gauss
  397.  
  398.  
  399. SUBROUTINE Gauss_BLAS(a_matrix, n)
  400.   USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_DOUBLE
  401.   IMPLICIT NONE
  402.   INTEGER, INTENT(IN) :: n
  403.   REAL(C_DOUBLE), DIMENSION(n,n), INTENT(INOUT) :: a_matrix
  404.  
  405.   REAL(C_DOUBLE), DIMENSION(n,n) :: bsmall
  406.   REAL(C_DOUBLE) :: c, d
  407.   REAL(C_DOUBLE), DIMENSION(n) :: temp
  408.   INTEGER :: i, j, k, m, imax(1), ipvt(n)
  409.  
  410.   bsmall = a_matrix
  411.   ipvt = (/ (i, i = 1, n) /)
  412.  
  413.   DO k = 1, n
  414.     imax = MAXLOC(ABS(bsmall(k:n,k)))
  415.     m = k - 1 + imax(1)
  416.  
  417.     IF (m /= k) THEN
  418.       ipvt((/m, k/)) = ipvt((/k, m/))
  419.       bsmall((/m, k/), :) = bsmall((/k, m/), :)
  420.     END IF
  421.  
  422.     d = 1.0D0 / bsmall(k,k)
  423.  
  424.     ! temp = bsmall(:,k)
  425.     CALL DCOPY(n, bsmall(1,k), 1, temp, 1)
  426.  
  427.     !$OMP SIMD
  428.     DO j = 1, n
  429.       c = bsmall(k,j) * d
  430.       ! bsmall(:,j) = bsmall(:,j) - temp * c
  431.       CALL DAXPY(n, -c, temp, 1, bsmall(1,j), 1)
  432.       bsmall(k,j) = c
  433.     END DO
  434.  
  435.     ! bsmall(:,k) = temp * (-d)
  436.     CALL DCOPY(n, temp, 1, bsmall(1,k), 1)
  437.     CALL DSCAL(n, -d, bsmall(1,k), 1)
  438.  
  439.     bsmall(k,k) = d
  440.   END DO
  441.  
  442.   ! Apply column permutation
  443.   a_matrix(:, ipvt) = bsmall
  444. END SUBROUTINE Gauss_BLAS
  445.  
  446.  
  447. ! -------------------------------------------------------------------
  448. SUBROUTINE Crout (a_matrix, n)      ! Invert matrix by Crout method not using BLAS
  449. ! Modified to accept matrix as argument and use thread-local bsmall
  450. ! Optimized with SIMD directives where applicable.
  451. ! This version is for small matrices processed in parallel.
  452. ! -------------------------------------------------------------------
  453. IMPLICIT NONE
  454. INTEGER, INTENT(IN) :: n
  455. REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix ! Matrix to invert
  456.  
  457. ! - - - Local Variables - - -
  458. REAL(KIND=8), DIMENSION(n,n) :: bsmall ! Thread-local working matrix
  459. INTEGER :: i, j, m, imax(1)      ! Current row & column, max pivot loc
  460. INTEGER :: index(n)              ! Partial pivot record
  461. REAL(KIND=8) :: temp(n)     ! working arrays, temp
  462.  
  463.  
  464. index = (/(i,i=1,n)/)        ! initialize column index
  465.  
  466. DO j = 1, n        ! Shuffle matrix a -> bsmall
  467.    DO i = 1, j-1
  468.       bsmall(i, j) = a_matrix(i, j)
  469.    END DO
  470.    DO i = j, n
  471.       bsmall(i, j) = a_matrix(n+1-j, i+1-j)
  472.    END DO
  473. END DO
  474.  
  475. DO j = 1, n   ! LU decomposition; reciprocals of diagonal elements in L matrix
  476.  
  477.    DO i = j, n    ! Get current column of L matrix
  478.        ! DOT_PRODUCT is usually auto-vectorized. Explicit SIMD might not be needed or helpful.
  479.        bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i)-DOT_PRODUCT(bsmall(n+1-i:n-i+j-1,n+1-i), bsmall(1:j-1,j))
  480.    END DO
  481.  
  482.    imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))
  483.  
  484.    m = imax(1)
  485.  
  486.    bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)
  487.  
  488.    IF (m /= n+1-j) THEN   ! Swap biggest element to current pivot position
  489.       index((/j,n+1-m/))     = index((/n+1-m,j/))
  490.  
  491.       bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
  492.       temp(1:n+1-m)          = bsmall(m:n, m)
  493.       bsmall(m:j-1+m, m)          = bsmall(n+1-j:n, n+1-j)
  494.       bsmall(j+m:n, m)            = bsmall(j, j+1:n+1-m)
  495.       bsmall(n+1-j:n, n+1-j)      = temp(1:j)
  496.       bsmall(j, j+1:n+1-m)        = temp(j+1:n+1-m)
  497.    END IF
  498.  
  499.    DO i = j+1, n   ! Get current row of U matrix
  500.         ! DOT_PRODUCT is usually auto-vectorized.
  501.         bsmall(j,i) =  bsmall(n,n+1-j)*( bsmall(j,i)-DOT_PRODUCT( bsmall(n+1-j:n-1,n+1-j), bsmall(1:j-1,i)))
  502.    END DO
  503. END DO
  504.  
  505. DO j = 1, n-1     ! Invert L matrix
  506.     temp(1) = bsmall(n, n+1-j)
  507.    DO i = j+1, n
  508.      ! DOT_PRODUCT is usually auto-vectorized.
  509.      bsmall(n-i+j,n+1-i) = -DOT_PRODUCT(bsmall(n-i+j:n-1,n+1-i),temp(1:i-j))*bsmall(n,n+1-i)
  510.      temp(i-j+1) = bsmall(n-i+j,n+1-i)
  511.    END DO
  512. END DO
  513.  
  514. DO i = 1, (n+1)/3      ! Reshuffle matrix
  515.        temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
  516.    DO j = 2*i, n+1-i
  517.        bsmall(j, i) = bsmall(n+i-j, n+1-j)
  518.    END DO
  519.    DO j = i, n+1-2*i
  520.       bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
  521.    END DO
  522.    bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
  523. END DO
  524.  
  525. DO i = 1, n-1      ! Invert U matrix
  526.    DO j = i+1, n
  527.       ! DOT_PRODUCT is usually auto-vectorized.
  528.       bsmall(i,j) = -bsmall(i,j)-DOT_PRODUCT(temp(1:j-i-1), bsmall(i+1:j-1,j))
  529.       temp(j-i) = bsmall(i,j)
  530.    END DO
  531. END DO
  532.  
  533. DO i = 1, n-1      ! Multiply inverses in reverse order
  534.    temp(1:n-i) = bsmall(i,i+1:n)
  535.    ! Apply SIMD directive to the inner loops for vectorization
  536.    !$OMP SIMD
  537.    DO j = 1,i
  538.       ! DOT_PRODUCT is usually auto-vectorized.
  539.       bsmall(i,j) = bsmall(i,j)+DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
  540.    END DO
  541.    !$OMP SIMD
  542.    DO j = i+1, n
  543.       bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
  544.    END DO
  545. END DO
  546.  
  547. a_matrix(:,index) = bsmall    ! output straightened columns of the inverse
  548.  
  549. END SUBROUTINE Crout
  550.  
  551. ! -------------------------------------------------------------------
  552. SUBROUTINE Croutblasted (a_matrix, n)      ! Invert matrix by Crout method
  553.   USE kinds
  554.   IMPLICIT NONE
  555.   INTEGER, INTENT(IN) :: n
  556.   REAL(KIND=8), DIMENSION(n,n), INTENT(INOUT) :: a_matrix
  557.  
  558.   ! BLAS Interfaces
  559.   INTERFACE
  560.     SUBROUTINE DGEMV(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY)
  561.       USE kinds
  562.       CHARACTER(LEN=1), INTENT(IN) :: TRANS
  563.       INTEGER, INTENT(IN) :: M, N, LDA, INCX, INCY
  564.       REAL(KIND=8), INTENT(IN) :: ALPHA, BETA
  565.       REAL(KIND=8), INTENT(IN) :: A(LDA,*), X(*)
  566.       REAL(KIND=8), INTENT(INOUT) :: Y(*)
  567.     END SUBROUTINE DGEMV
  568.    
  569.     REAL(KIND=8) FUNCTION DDOT(N, DX, INCX, DY, INCY)
  570.       USE kinds
  571.       INTEGER, INTENT(IN) :: N, INCX, INCY
  572.       REAL(KIND=8), INTENT(IN) :: DX(*), DY(*)
  573.     END FUNCTION DDOT
  574.    
  575.     SUBROUTINE DSWAP(N, DX, INCX, DY, INCY)
  576.       USE kinds
  577.       INTEGER, INTENT(IN) :: N, INCX, INCY
  578.       REAL(KIND=8), INTENT(INOUT) :: DX(*), DY(*)
  579.     END SUBROUTINE DSWAP
  580.   END INTERFACE
  581.  
  582.   ! Constants
  583.   REAL(KIND=8), PARAMETER :: one = 1.0_RK8
  584.   REAL(KIND=8), PARAMETER :: zero = 0.0_RK8
  585.  
  586.   ! Rest of your implementation...
  587.   REAL(KIND=8), DIMENSION(n,n) :: bsmall
  588. INTEGER :: i, j, m, imax(1)      ! Current row & column, max pivot loc
  589. INTEGER :: index(n)              ! Partial pivot record
  590. REAL(KIND=8) :: temp(n)     ! working arrays, temp
  591.  
  592.  
  593.   !!!! STEP 1: Matrix Shuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
  594.   index = (/(i,i=1,n)/)
  595.   DO j = 1, n
  596.      DO i = 1, j-1
  597.         bsmall(i, j) = a_matrix(i, j)
  598.      END DO
  599.      DO i = j, n
  600.         bsmall(i, j) = a_matrix(n+1-j, i+1-j)
  601.      END DO
  602.   END DO
  603.  
  604.  
  605. DO j = 1, n   ! LU decomposition; reciprocals of diagonal elements in L matrix
  606.      !!!! STEP 2: LU Decomposition - PARTIALLY OPTIMIZED
  607.      !!!! DONE: L column computation with DDOT
  608.      DO i = j, n
  609.         bsmall(n-i+j,n+1-i) = bsmall(n-i+j,n+1-i) - &
  610.              DDOT(j-1, bsmall(n+1-i:n-i+j-1,n+1-i), 1, bsmall(1:j-1,j), 1)
  611.      END DO
  612.  
  613.  
  614.    imax = MAXLOC(ABS( (/ (bsmall(j+i-1,i),i=1,n-j+1) /) ))
  615.  
  616.    m = imax(1)
  617.  
  618.    bsmall(j+m-1,m) = 1/bsmall(j+m-1,m)
  619.  
  620.    IF (m /= n+1-j) THEN   ! Swap biggest element to current pivot position
  621.       index((/j,n+1-m/))     = index((/n+1-m,j/))
  622.  
  623.       bsmall((/j,n+1-m/),n+2-m:n) = bsmall((/n+1-m,j/),n+2-m:n)
  624.       temp(1:n+1-m)          = bsmall(m:n, m)
  625.       bsmall(m:j-1+m, m)          = bsmall(n+1-j:n, n+1-j)
  626.       bsmall(j+m:n, m)            = bsmall(j, j+1:n+1-m)
  627.       bsmall(n+1-j:n, n+1-j)      = temp(1:j)
  628.       bsmall(j, j+1:n+1-m)        = temp(j+1:n+1-m)
  629.    END IF
  630.  
  631.      !!!! TODO: U row computation - Current uses DOT_PRODUCT
  632.      ! Potential optimization: Replace with DDOT
  633.     DO i = j+1, n
  634.         bsmall(j,i) = bsmall(n,n+1-j)*(bsmall(j,i) - &
  635.         DDOT(j-1, bsmall(n+1-j:n-1,n+1-j), 1, bsmall(1:j-1,i), 1))
  636.     END DO
  637. END DO
  638.  
  639.   !!!! STEP 3: L Inversion - DONE with DDOT
  640.   DO j = 1, n-1
  641.      temp(1) = bsmall(n, n+1-j)
  642.      DO i = j+1, n
  643.         bsmall(n-i+j,n+1-i) = -DDOT(i-j, bsmall(n-i+j:n-1,n+1-i), 1, temp(1), 1) * bsmall(n,n+1-i)
  644.         temp(i-j+1) = bsmall(n-i+j,n+1-i)
  645.      END DO
  646.   END DO
  647.  
  648.  
  649.   !!!! STEP 4: Matrix Reshuffle - CAN'T BE BLAS OPTIMIZED (irregular access)
  650.   DO i = 1, (n+1)/3
  651.      temp(1:n+2-3*i) = bsmall(2*i:n+1-i,i)
  652.      DO j = 2*i, n+1-i
  653.         bsmall(j, i) = bsmall(n+i-j, n+1-j)
  654.      END DO
  655.      DO j = i, n+1-2*i
  656.         bsmall(i+j-1, j) = bsmall(n+1-i, n+2-i-j)
  657.      END DO
  658.      bsmall(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
  659.   END DO
  660.  
  661.   !!!! STEP 5: U Inversion - DONE with DDOT
  662.   DO i = 1, n-1
  663.      DO j = i+1, n
  664.         bsmall(i,j) = -bsmall(i,j) - DDOT(j-i-1, temp(1), 1, bsmall(i+1,j), 1)
  665.         temp(j-i) = bsmall(i,j)
  666.      END DO
  667.   END DO
  668.  
  669.   !!!! STEP 6: Final Multiplication - TODO: Current uses DOT_PRODUCT
  670.   ! Potential optimization: Replace with DGEMV
  671.   DO i = 1, n-1
  672.      temp(1:n-i) = bsmall(i,i+1:n)
  673.      DO j = 1,i
  674.         bsmall(i,j) = bsmall(i,j) + DOT_PRODUCT(temp(1:n-i),bsmall(i+1:n,j))
  675.      END DO
  676.      DO j = i+1, n
  677.         bsmall(i,j) = DOT_PRODUCT(temp(j-i:n-i),bsmall(j:n,j))
  678.      END DO
  679.   END DO
  680.  
  681.  
  682. a_matrix(:,index) = bsmall    ! output straightened columns of the inverse
  683.  
  684. END SUBROUTINE Croutblasted
  685.  
  686.  
  687.  
  688. ! ===== chat gpt
  689. subroutine crout_inverse_threaded3(A, n)
  690.   USE kinds
  691.   implicit none
  692.   integer, intent(in) :: n
  693.  
  694.   REAL(RK8), intent(inout) :: A(n,n)
  695.  
  696.  
  697.   real(RK8), allocatable :: L(:,:), U(:,:), Ainv(:,:)
  698.   integer :: i, j, k
  699.   real(RK8) :: sum_val, inv_Ljj
  700.   real(RK8), parameter :: eps = 1.0e-12_RK8
  701.  
  702.   if (n.lt.500) then
  703.   call Croutblasted(A,n)
  704.   else
  705.   ! Allocate aligned matrices
  706.   allocate(L(n,n), source=0.0_RK8)
  707.   allocate(U(n,n), source=0.0_RK8)
  708.   allocate(Ainv(n,n), source=0.0_RK8)
  709.  
  710.   !$omp parallel do simd aligned(U:64)
  711.   do i = 1, n
  712.     U(i,i) = 1.0_RK8
  713.   end do
  714.  
  715.   do j = 1, n
  716.     ! Crout: compute L column
  717.     !$omp parallel do private(i, k, sum_val) shared(A, L, U)
  718.     do i = j, n
  719.       sum_val = 0.0_RK8
  720.       !$omp simd reduction(+:sum_val)
  721.       do k = 1, j-1
  722.         sum_val = sum_val + L(i,k) * U(k,j)
  723.       end do
  724.       L(i,j) = A(i,j) - sum_val
  725.     end do
  726.     !$omp end parallel do
  727.  
  728.     if (abs(L(j,j)) < eps * n * sqrt(real(n, RK8))) then
  729.       error stop "Matrix is numerically singular in Crout LU"
  730.     end if
  731.  
  732.     ! Crout: compute U row
  733.     inv_Ljj = 1.0_RK8 / L(j,j)
  734.     !$omp parallel do private(i, k, sum_val) shared(A, L, U)
  735.     do i = j+1, n
  736.       sum_val = 0.0_RK8
  737.       !$omp simd reduction(+:sum_val)
  738.       do k = 1, j-1
  739.         sum_val = sum_val + L(j,k) * U(k,i)
  740.       end do
  741.       U(j,i) = (A(j,i) - sum_val) * inv_Ljj
  742.     end do
  743.     !$omp end parallel do
  744.   end do
  745.  
  746.   ! Solve each column of inverse matrix in parallel
  747.   !$omp parallel do
  748.   do i = 1, n
  749.     call solve_column(L, U, n, i, Ainv(:,i))
  750.   end do
  751.   !$omp end parallel do
  752.  
  753.   A = Ainv
  754.   endif
  755.  
  756. contains
  757.  
  758.   subroutine solve_column(L, U, n, col_idx, x)
  759.     real(RK8), intent(in) :: L(n,n), U(n,n)
  760.     integer, intent(in) :: n, col_idx
  761.     real(RK8), intent(out) :: x(n)
  762.     real(RK8) :: e(n), y(n)
  763.     integer :: row, k
  764.     real(RK8) :: sum_val
  765.  
  766.     e = 0.0_RK8
  767.     e(col_idx) = 1.0_RK8
  768.  
  769.     ! Forward substitution: solve L y = e
  770.     do row = 1, n
  771.       sum_val = 0.0_RK8
  772.       !$omp simd reduction(+:sum_val)
  773.       do k = 1, row-1
  774.         sum_val = sum_val + L(row,k) * y(k)
  775.       end do
  776.       y(row) = (e(row) - sum_val) / L(row,row)
  777.     end do
  778.  
  779.     ! Backward substitution: solve U x = y
  780.     do row = n, 1, -1
  781.       sum_val = 0.0_RK8
  782.       !$omp simd reduction(+:sum_val)
  783.       do k = row+1, n
  784.         sum_val = sum_val + U(row,k) * x(k)
  785.       end do
  786.       x(row) = (y(row) - sum_val) / U(row,row)
  787.     end do
  788.   end subroutine solve_column
  789.  
  790. end subroutine crout_inverse_threaded3
  791.  
  792.  
  793. ! =====
  794.  
  795.  
  796.  
  797. ! --------------------------------------------------------------------
  798. SUBROUTINE Lapack (n)     ! Invert matrix by Lapack method
  799. ! --------------------------------------------------------------------
  800. use arrayMod
  801. IMPLICIT NONE
  802. integer             :: smallsize,bigsize,smallits
  803. common                 smallsize,bigsize,smallits
  804. INTEGER   :: n
  805. !REAL(KIND=8):: a(n,n)
  806.  
  807.  
  808. INTEGER :: ipiv(n)
  809. INTEGER :: info, lwork, ILAENV
  810. REAL(KIND=8), ALLOCATABLE :: work(:)
  811.  
  812. lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
  813. ALLOCATE ( work(lwork) )
  814.  
  815. CALL DGETRF( n, n, a3, n, ipiv, info )
  816. CALL DGETRI( n, a3, n, ipiv, work, lwork, info )
  817.  
  818. DEALLOCATE ( work )
  819.  
  820. END SUBROUTINE Lapack
  821.  
  822. ! --------------------------------------------------------------------
  823. SUBROUTINE Lapack2 (n)     ! Invert matrix by Lapack method using DGESV
  824. ! --------------------------------------------------------------------
  825. use arrayMod
  826. IMPLICIT NONE
  827. integer             :: smallsize,bigsize,smallits
  828. common                 smallsize,bigsize,smallits
  829.  
  830. INTEGER   :: n
  831. !REAL(KIND=8):: a(n,n)
  832. !REAL(KIND=8), ALLOCATABLE :: B(:,:)
  833.  
  834. INTEGER :: ipiv(n)
  835. INTEGER :: info !, lwork, ILAENV
  836. integer :: NRHS,i,lda,ldb
  837. !REAL(KIND=8), ALLOCATABLE :: work(:)
  838.  
  839. !lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
  840. !ALLOCATE ( work(lwork) )
  841.  
  842.  
  843. ![A*B = X] all nxn if X = i B is inverse
  844. !  NRHS    (input) INTEGER
  845. !          The number of right hand sides, i.e., the number of columns
  846. !          of the matrix B.  NRHS >= 0.
  847. !* A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  848. !          On entry, the N-by-N coefficient matrix A.
  849. !          On exit, the factors L and U from the factorization
  850. !          A = P*L*U; the unit diagonal elements of L are not stored.
  851. !* LDA     (input) INTEGER
  852. !          The leading dimension of the array A.  LDA >= max(1,N).
  853. !  IPIV    (output) INTEGER array, dimension (N)
  854. !          The pivot indices that define the permutation matrix P;
  855. !          row i of the matrix was interchanged with row IPIV(i).
  856. !  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
  857. !          On entry, the N-by-NRHS matrix of right hand side matrix B.
  858. !          On exit, if INFO = 0, the N-by-NRHS solution matrix X.
  859. !  LDB     (input) INTEGER
  860. !          The leading dimension of the array B.  LDB >= max(1,N).
  861. !  INFO    (output) INTEGER
  862. !          = 0:  successful exit
  863. !         < 0:  if INFO = -i, the i-th argument had an illegal value
  864. !          > 0:  if INFO = i, U(i,i) is exactly zero.  The factorization
  865.  
  866. NRHS = n
  867. lda  = n
  868. !allocate (b(n,n)) ! b is already allocated in arrayMod
  869. b=0.d0
  870. do i=1,n
  871.     b(i,i) = 1.d0 ! Set b to identity matrix
  872. enddo
  873. ldb = n
  874.  
  875.  
  876. call DGESV( N, NRHS, A3, LDA, IPIV, B, LDB, INFO )
  877.  
  878.  
  879. !DEALLOCATE ( work )
  880. a3 = b ! Resulting inverse is in b
  881. !deallocate (b) ! b is allocated globally, don't deallocate here
  882. END SUBROUTINE Lapack2
  883.  
  884. ! --------------------------------------------------------------------
  885. SUBROUTINE DGEtrf2call  (n)     ! Invert matrix by Lapack method using DGETRF2
  886. ! --------------------------------------------------------------------
  887.  use arrayMod
  888.  IMPLICIT NONE
  889.  integer             :: smallsize,bigsize,smallits
  890.  common                 smallsize,bigsize,smallits
  891.  
  892.  INTEGER   :: n
  893.  
  894.  
  895.  INTEGER :: ipiv(n)
  896.  INTEGER :: info, lwork, ILAENV
  897.  REAL(KIND=8), ALLOCATABLE :: work(:)
  898.  
  899.  lwork = n * ILAENV( 1, 'DGETRI', ' ', n, -1, -1, -1 )
  900.  ALLOCATE ( work(lwork) )
  901.  
  902.  CALL DGETRF2( n, n, a3, n, ipiv, info )
  903.  CALL DGETRI( n, a3, n, ipiv, work, lwork, info )
  904.  
  905.  DEALLOCATE ( work )
  906.  
  907. END SUBROUTINE DGEtrf2call
  908.  
Advertisement
Comments
  • realbabilu
    41 days
    # text 1.57 KB | 0 0
    1. Updated version of multithreading TEST_CPU Fortran.uk Polyhedron Fortran Benchmark : that single thread (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics) Implemented by: David Frank [email protected], original gauss Tim Prince [email protected], Crout James Van Buskirk [email protected], Jos Bergervoet [email protected]. NOW gauss single thread with looping repetition OpenMP, crout single thread with looping repetition OpenMP, crout multi thread, and implementing with vendor blas lapack instead building own. by default as same as phoroix is 1000 repetition dual inversing 250x250, and 2000x2000 dual inversing, and check the difference.
    2.  
    3. Example: ryzen 3900x
    4. C:\Temp>test_fpu_gemini_mt5.exe
    5. Inside arraySub() allocating memory ...
    6. Memory allocation sufficient
    7. Benchmark running, hopefully as only ACTIVE task
    8. Number of OpenMP threads: 24
    9. Test1: Gauss mt- 1000 ( 250x 250) inverts in 0.723 seconds Err= 0.5971541960567150E-14
    10. Test2: Croutstmt- 1000 ( 250x 250) inverts in 0.865 seconds Err= 0.7541962840872367E-14
    11. Calling Croutmt- 1 Bigsize
    12. Calling Croutmt- 2 Bigsize
    13. Test3: Croutmt- 2 ( 2000x 2000) inverts in 3.016 seconds Err= 0.9010735252907198E-09
    14. Test4: DGETRF - 2 ( 2000x 2000) inverts in 0.294 seconds Err= 0.2360292208074361E-11
    15. Test5: DGETRF2- 2 ( 2000x 2000) inverts in 0.289 seconds Err= 0.2635024887233167E-11
    16. Test6: DGESV - 2 ( 2000x 2000) inverts in 0.199 seconds Err= 0.2345595889465859E-11
    17. Total = 5.4 sec
Add Comment
Please, Sign In to add comment
Advertisement