Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using Random, LinearAlgebra, Dates, Printf
- using LoopVectorization
- using .Threads
- function crout_inverse_threaded_optimized(A::Matrix{Float64})
- n = size(A, 1)
- L = zeros(n, n)
- U = Matrix{Float64}(I, n, n)
- block_size = 64 # Cache-friendly block size
- # Optimized Crout factorization
- @inbounds for j in 1:n
- # Compute column j of L (cache-blocked)
- for i_block in j:block_size:n
- i_end = min(i_block + block_size - 1, n)
- @turbo for i in i_block:i_end
- sum_val = 0.0
- for k in 1:j-1
- sum_val += L[i,k] * U[k,j]
- end
- L[i,j] = A[i,j] - sum_val
- end
- end
- # Check singularity
- if abs(L[j,j]) < eps(Float64) * n * sqrt(n)
- error("Matrix is numerically singular")
- end
- # Compute row j of U (cache-blocked)
- inv_Ljj = 1.0 / L[j,j]
- for i_block in j+1:block_size:n
- i_end = min(i_block + block_size - 1, n)
- @turbo for i in i_block:i_end
- sum_val = 0.0
- for k in 1:j-1
- sum_val += L[j,k] * U[k,i]
- end
- U[j,i] = (A[j,i] - sum_val) * inv_Ljj
- end
- end
- end
- # Parallel solve for inverse columns
- Ainv = zeros(n, n)
- Threads.@threads for i in 1:n
- e = zeros(n)
- e[i] = 1.0
- y = zeros(n)
- x = zeros(n)
- # Forward substitution (optimized)
- @inbounds for row in 1:n
- sum_val = 0.0
- @turbo for col in 1:row-1
- sum_val += L[row,col] * y[col]
- end
- y[row] = (e[row] - sum_val) / L[row,row]
- end
- # Backward substitution (optimized)
- @inbounds for row in n:-1:1
- sum_val = 0.0
- @turbo for col in row+1:n
- sum_val += U[row,col] * x[col]
- end
- x[row] = (y[row] - sum_val) / U[row,row]
- end
- Ainv[:,i] .= x
- end
- return Ainv
- end
- # Thread-safe Gauss-Jordan implementation
- function gauss_safe!(A::Matrix{Float64})
- n = size(A, 1)
- B = copy(A)
- I_matrix = Matrix{Float64}(I, n, n)
- block_size = 64 # Cache-friendly block size
- @inbounds for k in 1:n
- # Pivoting (sequential)
- pivot_row = k
- for i in k+1:n
- if abs(B[i, k]) > abs(B[pivot_row, k])
- pivot_row = i
- end
- end
- if pivot_row != k
- B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
- I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
- end
- pivot = B[k, k]
- if abs(pivot) < eps(Float64) * n * sqrt(n)
- error("Matrix is numerically singular")
- end
- # Scale pivot row (parallel safe)
- scale = 1.0 / pivot
- @inbounds @simd for j in 1:n
- B[k, j] *= scale
- I_matrix[k, j] *= scale
- end
- # Elimination (parallel safe)
- @inbounds for i in 1:n
- if i != k
- factor = B[i, k]
- @simd for j in 1:n
- B[i, j] -= factor * B[k, j]
- I_matrix[i, j] -= factor * I_matrix[k, j]
- end
- end
- end
- end
- return I_matrix
- end
- # === Selected Implementations ===
- # Standard Serial Gauss-Jordan Elimination based Inverse
- # Included for comparison with custom threaded methods.
- function gauss_jordan_inverse_serial!(A::Matrix{Float64})
- n = size(A, 1)
- B = copy(A) # Work on a copy of A
- I_matrix = Matrix{Float64}(I, n, n) # Start with an identity matrix
- @inbounds for k in 1:n
- # Partial Pivoting
- pivot_row = k
- for i in k+1:n
- if abs(B[i, k]) > abs(B[pivot_row, k])
- pivot_row = i
- end
- end
- if pivot_row != k
- # Swap rows in both B and I_matrix
- B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
- I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
- end
- pivot_element = B[k, k]
- if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
- error("Matrix is numerically singular at step $k (Gauss-Jordan Serial)")
- end
- # Scale the pivot row to make the pivot element 1
- scale = 1.0 / pivot_element
- @inbounds for col in 1:n
- B[k, col] *= scale
- I_matrix[k, col] *= scale
- end
- # Perform elimination for other rows
- for i in 1:n
- if i != k
- factor = B[i, k]
- @inbounds for col in 1:n
- B[i, col] -= factor * B[k, col]
- I_matrix[i, col] -= factor * I_matrix[k, col]
- end
- end
- end
- end
- # After reduction, I_matrix holds the inverse
- return I_matrix
- end
- # Multi-threaded Crout Factorization and Inverse - The best performing Crout
- function crout_inverse_threaded(A::Matrix{Float64})
- n = size(A, 1)
- L = zeros(n, n)
- U = Matrix{Float64}(I, n, n) # U is initialized as Identity
- # Crout Factorization (mostly sequential due to dependencies)
- # Parallelizing this part is complex and might not be beneficial for small matrices
- @inbounds for j in 1:n # Loop over columns
- # Compute column j of L
- for i in j:n # Loop over rows in column j
- sum_val = zero(Float64)
- @inbounds for k in 1:j-1
- sum_val += L[i,k] * U[k,j]
- end
- L[i,j] = A[i,j] - sum_val
- # Check for singularity during factorization
- if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
- error("Matrix is numerically singular during Crout factorization at step $j (Threaded)")
- end
- end
- # Compute row j of U (for elements right of the diagonal)
- inv_Ljj = 1.0 / L[j,j]
- for i in j+1:n # Loop over columns in row j (right of diagonal)
- sum_val = zero(Float64)
- @inbounds for k in 1:j-1
- sum_val += L[j,k] * U[k,i]
- end
- U[j,i] = (A[j,i] - sum_val) * inv_Ljj
- end
- end
- # Solve for the inverse column by column (Parallelize this step)
- # A * Ainv = I => L*U * Ainv = I. Solve L*Y = I for Y, then U*Ainv = Y for Ainv.
- # We can solve for each column of Ainv independently.
- Ainv = zeros(n, n)
- Threads.@threads for i in 1:n # Parallelize over columns of the identity matrix (and thus Ainv)
- e = zeros(n) # Column of the identity matrix
- e[i] = 1.0
- # Solve L * y = e (Forward Substitution)
- # This part is sequential within each thread
- y = zeros(n)
- @inbounds for row in 1:n
- sum_val = zero(Float64)
- @inbounds for col in 1:row-1
- sum_val += L[row, col] * y[col]
- end
- y[row] = (e[row] - sum_val) / L[row, row]
- end
- # Solve U * x = y (Backward Substitution)
- # This part is sequential within each thread
- x = zeros(n) # This will be the i-th column of Ainv
- @inbounds for row in n:-1:1
- sum_val = zero(Float64)
- @inbounds for col in row+1:n
- sum_val += U[row, col] * x[col]
- end
- x[row] = (y[row] - sum_val) / U[row, row]
- end
- Ainv[:,i] .= x # Store the computed column of the inverse
- end
- return Ainv
- end
- # Function to compute the inverse using LAPACK's dgetrf! and dgetri!
- function lapack_inverse(A::Matrix{Float64})
- n = size(A, 1)
- n != size(A, 2) && error("Matrix must be square")
- # Create a copy since we modify in-place
- A_copy = copy(A)
- # Step 1: LU factorization (getrf!)
- result = LinearAlgebra.LAPACK.getrf!(A_copy)
- A_lu = result[1]
- ipiv = result[2]
- info_getrf = result[3]
- if info_getrf > 0
- error("Matrix is singular in getrf! at position $info_getrf")
- elseif info_getrf < 0
- error("Invalid argument in getrf! at position $(-info_getrf)")
- end
- # Step 2: Matrix inversion (getri!)
- A_inv = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
- end
- function test_fpu()
- # Constants
- smallsize = 250
- smallits = 1000
- bigsize = 2000
- println("Benchmark running, hopefully as only ACTIVE task")
- println("Current time: ", now())
- # Warm-up runs
- warmup = rand(10,10)
- gauss_safe!(copy(warmup))
- crout_inverse_threaded_optimized(copy(warmup))
- # Initialize random number pools
- Random.seed!(1234) # Fixed seed for reproducibility
- pool = rand(smallsize, smallsize, smallits)
- pool3 = rand(bigsize, bigsize)
- # Working matrices
- a = zeros(smallsize, smallsize)
- a3 = zeros(bigsize, bigsize)
- c = zeros(smallsize, smallsize)
- # Timing results
- dt = zeros(4)
- for n in 1:4
- clock1 = time_ns()
- if n == 1
- # Test 1: Optimized Gauss inversion
- # Parallelize across matrices (thread-safe)
- Threads.@threads for i in 1:smallits
- local_a = copy(@view pool[:, :, i])
- local_a = gauss_safe!(local_a) # invert
- local_a = gauss_safe!(local_a) # invert back
- if i == smallits
- a .= local_a # Only last iteration for error check
- end
- end
- # Calculate error (compare to original)
- avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
- clock2 = time_ns()
- dt[n] = (clock2 - clock1)/1e9
- @printf("Test1: Gauss - %d (%dx%d) inverts in %.3f seconds Err= %.16e\n",
- smallits, smallsize, smallsize, dt[n], avg_err)
- elseif n == 2
- # Test 2: Crout inversion crout_inverse_threaded_optimized
- Threads.@threads for i in 1:smallits
- local_a = copy(@view pool[:, :, i])
- local_a = crout_inverse_threaded_optimized(local_a) # invert
- local_a = crout_inverse_threaded_optimized(local_a) # invert back
- if i == smallits
- a .= local_a # Only last iteration for error check
- end
- end
- # Calculate error (compare to original)
- avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
- clock2 = time_ns()
- dt[n] = (clock2 - clock1)/1e9
- @printf("Test2: Crout - %d (%dx%d) inverts in %.3f seconds Err= %.16e\n",
- smallits, smallsize, smallsize, dt[n], avg_err)
- elseif n == 3
- # Test 3: Crout inversion Largesize
- a3 = copy(pool3)
- a3 = crout_inverse_threaded(a3) # invert a
- a3 = crout_inverse_threaded(a3) # invert back
- # Calculate error (compare to original)
- avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
- clock2 = time_ns()
- dt[n] = (clock2 - clock1)/1e9
- @printf("Test3: Crout - %d (%dx%d) inverts in %.3f seconds Err= %.16e\n",
- 2, bigsize, bigsize, dt[n], avg_err)
- elseif n == 4
- # Test 4: Lapack inversion Largesize
- a3 = copy(pool3)
- a3 = lapack_inverse(a3) # invert a
- a3 = lapack_inverse(a3) # invert back
- # Calculate error (compare to original)
- avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
- clock2 = time_ns()
- dt[n] = (clock2 - clock1)/1e9
- @printf("Test4: Lapack - %d (%dx%d) inverts in %.3f seconds Err= %.16e\n",
- 2, bigsize, bigsize, dt[n], avg_err)
- end
- end
- @printf(" Total = %.1f sec\n", sum(dt))
- println()
- end
- # Run the benchmark
- test_fpu()
Advertisement
Comments
-
- Julia version of multithreading TEST_CPU Fortran.uk Polyhedron Fortran Benchmark : that single thread (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics) Implemented by: David Frank [email protected], original gauss Tim Prince [email protected], Crout James Van Buskirk [email protected], Jos Bergervoet [email protected]. NOW gauss single thread with looping repetition OpenMP, crout single thread with looping repetition OpenMP, crout multi thread, and implementing with vendor blas lapack instead building own. by default as same as phoroix is 1000 repetition dual inversing 250x250, and 2000x2000 dual inversing, and check the difference.
- Example: ryzen 3900x
- C:\Temp>julia -t auto -O3 test_fpu5.jl.txt
- Benchmark running, hopefully as only ACTIVE task
- Julia Threads : 24
- BLAS Threads : 12
- BLAS backend libraries:
- LBTConfig([ILP64] mkl_rt.2.dll, [LP64] mkl_rt.2.dll)
- Test1: Gauss mt- 1000 (250x250) inverts in 6.529 seconds Err= 4.2673054395647724e-15
- Test2: Crout mt- 1000 (250x250) inverts in 2.347 seconds Err= 1.5946885695561353e-12
- Test3: Crout mt- 2 (2000x2000) inverts in 7.336 seconds Err= 5.3543682559395547e-10
- Test4: Lapack - 2 (2000x2000) inverts in 0.200 seconds Err= 5.2270673897888225e-13
- Total = 16.4 sec
Add Comment
Please, Sign In to add comment
Advertisement