test_cpu_multithread_julia

using Random, LinearAlgebra, Dates, Printf
using LoopVectorization
using .Threads

function crout_inverse_threaded_optimized(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n)
    block_size = 64  # Cache-friendly block size

    # Optimized Crout factorization
    @inbounds for j in 1:n
        # Compute column j of L (cache-blocked)
        for i_block in j:block_size:n
            i_end = min(i_block + block_size - 1, n)
            @turbo for i in i_block:i_end
                sum_val = 0.0
                for k in 1:j-1
                    sum_val += L[i,k] * U[k,j]
                end
                L[i,j] = A[i,j] - sum_val
            end
        end

        # Check singularity
        if abs(L[j,j]) < eps(Float64) * n * sqrt(n)
            error("Matrix is numerically singular")
        end

        # Compute row j of U (cache-blocked)
        inv_Ljj = 1.0 / L[j,j]
        for i_block in j+1:block_size:n
            i_end = min(i_block + block_size - 1, n)
            @turbo for i in i_block:i_end
                sum_val = 0.0
                for k in 1:j-1
                    sum_val += L[j,k] * U[k,i]
                end
                U[j,i] = (A[j,i] - sum_val) * inv_Ljj
            end
        end
    end

    # Parallel solve for inverse columns
    Ainv = zeros(n, n)
    Threads.@threads for i in 1:n
        e = zeros(n)
        e[i] = 1.0
        y = zeros(n)
        x = zeros(n)

        # Forward substitution (optimized)
        @inbounds for row in 1:n
            sum_val = 0.0
            @turbo for col in 1:row-1
                sum_val += L[row,col] * y[col]
            end
            y[row] = (e[row] - sum_val) / L[row,row]
        end

        # Backward substitution (optimized)
        @inbounds for row in n:-1:1
            sum_val = 0.0
            @turbo for col in row+1:n
                sum_val += U[row,col] * x[col]
            end
            x[row] = (y[row] - sum_val) / U[row,row]
        end

        Ainv[:,i] .= x
    end

    return Ainv
end

# Thread-safe Gauss-Jordan implementation
function gauss_safe!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A)
    I_matrix = Matrix{Float64}(I, n, n)
    block_size = 64  # Cache-friendly block size

    @inbounds for k in 1:n
        # Pivoting (sequential)
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
            I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
        end

        pivot = B[k, k]
        if abs(pivot) < eps(Float64) * n * sqrt(n)
            error("Matrix is numerically singular")
        end

        # Scale pivot row (parallel safe)
        scale = 1.0 / pivot
        @inbounds @simd for j in 1:n
            B[k, j] *= scale
            I_matrix[k, j] *= scale
        end

        # Elimination (parallel safe)
        @inbounds for i in 1:n
            if i != k
                factor = B[i, k]
                @simd for j in 1:n
                    B[i, j] -= factor * B[k, j]
                    I_matrix[i, j] -= factor * I_matrix[k, j]
                end
            end
        end
    end
    return I_matrix
end

# === Selected Implementations ===

# Standard Serial Gauss-Jordan Elimination based Inverse
# Included for comparison with custom threaded methods.
function gauss_jordan_inverse_serial!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A) # Work on a copy of A
    I_matrix = Matrix{Float64}(I, n, n) # Start with an identity matrix

    @inbounds for k in 1:n
        # Partial Pivoting
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            # Swap rows in both B and I_matrix
            B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
            I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
        end

        pivot_element = B[k, k]
        if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
            error("Matrix is numerically singular at step $k (Gauss-Jordan Serial)")
        end

        # Scale the pivot row to make the pivot element 1
        scale = 1.0 / pivot_element
        @inbounds for col in 1:n
            B[k, col] *= scale
            I_matrix[k, col] *= scale
        end


        # Perform elimination for other rows
        for i in 1:n
            if i != k
                factor = B[i, k]
                @inbounds for col in 1:n
                    B[i, col] -= factor * B[k, col]
                    I_matrix[i, col] -= factor * I_matrix[k, col]
                end
            end
        end
    end

    # After reduction, I_matrix holds the inverse
    return I_matrix
end

# Multi-threaded Crout Factorization and Inverse - The best performing Crout
function crout_inverse_threaded(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n) # U is initialized as Identity

    # Crout Factorization (mostly sequential due to dependencies)
    # Parallelizing this part is complex and might not be beneficial for small matrices
    @inbounds for j in 1:n # Loop over columns
        # Compute column j of L
        for i in j:n # Loop over rows in column j
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[i,k] * U[k,j]
            end
            L[i,j] = A[i,j] - sum_val

            # Check for singularity during factorization
            if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
                error("Matrix is numerically singular during Crout factorization at step $j (Threaded)")
            end
        end

        # Compute row j of U (for elements right of the diagonal)
        inv_Ljj = 1.0 / L[j,j]
        for i in j+1:n # Loop over columns in row j (right of diagonal)
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[j,k] * U[k,i]
            end
            U[j,i] = (A[j,i] - sum_val) * inv_Ljj
        end
    end

    # Solve for the inverse column by column (Parallelize this step)
    # A * Ainv = I => L*U * Ainv = I. Solve L*Y = I for Y, then U*Ainv = Y for Ainv.
    # We can solve for each column of Ainv independently.
    Ainv = zeros(n, n)
    Threads.@threads for i in 1:n # Parallelize over columns of the identity matrix (and thus Ainv)
        e = zeros(n) # Column of the identity matrix
        e[i] = 1.0

        # Solve L * y = e (Forward Substitution)
        # This part is sequential within each thread
        y = zeros(n)
        @inbounds for row in 1:n
            sum_val = zero(Float64)
            @inbounds for col in 1:row-1
                sum_val += L[row, col] * y[col]
            end
            y[row] = (e[row] - sum_val) / L[row, row]
        end

        # Solve U * x = y (Backward Substitution)
        # This part is sequential within each thread
        x = zeros(n) # This will be the i-th column of Ainv
        @inbounds for row in n:-1:1
            sum_val = zero(Float64)
            @inbounds for col in row+1:n
                sum_val += U[row, col] * x[col]
            end
            x[row] = (y[row] - sum_val) / U[row, row]
        end
        Ainv[:,i] .= x # Store the computed column of the inverse
    end
    return Ainv
end

# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")

    # Create a copy since we modify in-place
    A_copy = copy(A)

    # Step 1: LU factorization (getrf!)
    result = LinearAlgebra.LAPACK.getrf!(A_copy)
    A_lu = result[1]
    ipiv = result[2]
    info_getrf = result[3]

    if info_getrf > 0
        error("Matrix is singular in getrf! at position $info_getrf")
    elseif info_getrf < 0
        error("Invalid argument in getrf! at position $(-info_getrf)")
    end

    # Step 2: Matrix inversion (getri!)
    A_inv = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
end


function test_fpu()
    # Constants
    smallsize = 250
    smallits = 1000
    bigsize = 2000

    println("Benchmark running, hopefully as only ACTIVE task")
    println("Current time: ", now())

    # Warm-up runs
    warmup = rand(10,10)
    gauss_safe!(copy(warmup))
    crout_inverse_threaded_optimized(copy(warmup))

    # Initialize random number pools
    Random.seed!(1234)  # Fixed seed for reproducibility
    pool = rand(smallsize, smallsize, smallits)
    pool3 = rand(bigsize, bigsize)

    # Working matrices
    a = zeros(smallsize, smallsize)
    a3 = zeros(bigsize, bigsize)
    c  = zeros(smallsize, smallsize)

    # Timing results
    dt = zeros(4)

    for n in 1:4
        clock1 = time_ns()

        if n == 1
            # Test 1: Optimized Gauss inversion
            # Parallelize across matrices (thread-safe)
            Threads.@threads for i in 1:smallits
             local_a = copy(@view pool[:, :, i])
             local_a = gauss_safe!(local_a)  # invert
             local_a = gauss_safe!(local_a)  # invert back

             if i == smallits
               a .= local_a  # Only last iteration for error check
             end
            end

            # Calculate error (compare to original)
            avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test1: Gauss  - %d (%dx%d) inverts in %.3f seconds  Err= %.16e\n",
                    smallits, smallsize, smallsize, dt[n], avg_err)

        elseif n == 2
            # Test 2: Crout inversion crout_inverse_threaded_optimized
            Threads.@threads for i in 1:smallits
             local_a = copy(@view pool[:, :, i])
             local_a = crout_inverse_threaded_optimized(local_a)  # invert
             local_a = crout_inverse_threaded_optimized(local_a)  # invert back

             if i == smallits
               a .= local_a  # Only last iteration for error check
             end
            end

            # Calculate error (compare to original)
            avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test2: Crout  - %d (%dx%d) inverts in %.3f seconds  Err= %.16e\n",
                    smallits, smallsize, smallsize, dt[n], avg_err)

        elseif n == 3
            # Test 3: Crout inversion Largesize
            a3 = copy(pool3)
            a3 = crout_inverse_threaded(a3)  # invert a
            a3 = crout_inverse_threaded(a3)  # invert back

            # Calculate error (compare to original)
            avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test3: Crout  - %d (%dx%d)  inverts in %.3f seconds  Err= %.16e\n",
                    2, bigsize, bigsize, dt[n], avg_err)

        elseif n == 4
            # Test 4: Lapack inversion Largesize
            a3 = copy(pool3)
            a3 = lapack_inverse(a3)  # invert a
            a3 = lapack_inverse(a3)  # invert back

            # Calculate error (compare to original)
            avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test4: Lapack - %d (%dx%d)  inverts in %.3f seconds  Err= %.16e\n",
                    2, bigsize, bigsize, dt[n], avg_err)
        end
    end

    @printf("                             Total = %.1f sec\n", sum(dt))
    println()
end


# Run the benchmark
test_fpu()
Julia version of multithreading TEST_CPU Fortran.uk Polyhedron Fortran Benchmark : that single thread (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics) Implemented by: David Frank  [email protected], original gauss Tim Prince   [email protected], Crout James Van Buskirk  [email protected], Jos Bergervoet [email protected]. NOW gauss single thread with looping repetition OpenMP, crout single thread with looping repetition OpenMP, crout multi thread, and implementing with vendor blas lapack instead building own. by default as same as phoroix is 1000 repetition dual inversing 250x250, and 2000x2000 dual inversing, and check the difference.

Example: ryzen 3900x
C:\Temp>julia -t auto -O3 test_fpu5.jl.txt
Benchmark running, hopefully as only ACTIVE task
Julia Threads         : 24
BLAS Threads          : 12
BLAS backend libraries:
LBTConfig([ILP64] mkl_rt.2.dll, [LP64] mkl_rt.2.dll)
Test1: Gauss mt- 1000 (250x250) inverts in 6.529 seconds  Err= 4.2673054395647724e-15
Test2: Crout mt- 1000 (250x250) inverts in 2.347 seconds  Err= 1.5946885695561353e-12
Test3: Crout mt- 2 (2000x2000)  inverts in 7.336 seconds  Err= 5.3543682559395547e-10
Test4: Lapack - 2 (2000x2000)  inverts in 0.200 seconds  Err= 5.2270673897888225e-13
                             Total = 16.4 sec