realbabilu

test_cpu_multithread_julia

May 9th, 2025

Julia | Source Code
using Random, LinearAlgebra, Dates, Printf
using LoopVectorization
using .Threads

function crout_inverse_threaded_optimized(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n)
    block_size = 64  # Cache-friendly block size

    # Optimized Crout factorization
    @inbounds for j in 1:n
        # Compute column j of L (cache-blocked)
        for i_block in j:block_size:n
            i_end = min(i_block + block_size - 1, n)
            @turbo for i in i_block:i_end
                sum_val = 0.0
                for k in 1:j-1
                    sum_val += L[i,k] * U[k,j]
                end
                L[i,j] = A[i,j] - sum_val
            end
        end

        # Check singularity
        if abs(L[j,j]) < eps(Float64) * n * sqrt(n)
            error("Matrix is numerically singular")
        end

        # Compute row j of U (cache-blocked)
        inv_Ljj = 1.0 / L[j,j]
        for i_block in j+1:block_size:n
            i_end = min(i_block + block_size - 1, n)
            @turbo for i in i_block:i_end
                sum_val = 0.0
                for k in 1:j-1
                    sum_val += L[j,k] * U[k,i]
                end
                U[j,i] = (A[j,i] - sum_val) * inv_Ljj
            end
        end
    end

    # Parallel solve for inverse columns
    Ainv = zeros(n, n)
    Threads.@threads for i in 1:n
        e = zeros(n)
        e[i] = 1.0
        y = zeros(n)
        x = zeros(n)

        # Forward substitution (optimized)
        @inbounds for row in 1:n
            sum_val = 0.0
            @turbo for col in 1:row-1
                sum_val += L[row,col] * y[col]
            end
            y[row] = (e[row] - sum_val) / L[row,row]
        end

        # Backward substitution (optimized)
        @inbounds for row in n:-1:1
            sum_val = 0.0
            @turbo for col in row+1:n
                sum_val += U[row,col] * x[col]
            end
            x[row] = (y[row] - sum_val) / U[row,row]
        end

        Ainv[:,i] .= x
    end

    return Ainv
end
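
# A minimal sanity-check sketch (not part of the original benchmark): on a small,
# well-conditioned matrix the blocked Crout inverse should agree with
# LinearAlgebra.inv to near machine precision. The helper name check_crout_optimized
# and the diagonal shift n * I are illustrative choices, not from the original code.
function check_crout_optimized(n::Int = 50)
    A = rand(n, n) + n * I   # diagonally dominant, so comfortably nonsingular
    return maximum(abs.(crout_inverse_threaded_optimized(A) - inv(A)))
end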

# Thread-safe Gauss-Jordan implementation
function gauss_safe!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A)
    I_matrix = Matrix{Float64}(I, n, n)
    block_size = 64  # Cache-friendly block size

    @inbounds for k in 1:n
        # Pivoting (sequential)
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
            I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
        end

        pivot = B[k, k]
        if abs(pivot) < eps(Float64) * n * sqrt(n)
            error("Matrix is numerically singular")
        end

        # Scale pivot row (parallel safe)
        scale = 1.0 / pivot
        @inbounds @simd for j in 1:n
            B[k, j] *= scale
            I_matrix[k, j] *= scale
        end

        # Elimination (parallel safe)
        @inbounds for i in 1:n
            if i != k
                factor = B[i, k]
                @simd for j in 1:n
                    B[i, j] -= factor * B[k, j]
                    I_matrix[i, j] -= factor * I_matrix[k, j]
                end
            end
        end
    end
    return I_matrix
end
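
# Quick round-trip sketch (assumption: this mirrors how test_fpu below exercises
# gauss_safe!): inverting twice should reproduce the original matrix up to
# round-off. The helper name check_gauss_roundtrip is illustrative only.
function check_gauss_roundtrip(n::Int = 250)
    A = rand(n, n)
    B = gauss_safe!(gauss_safe!(copy(A)))   # invert, then invert back
    return sum(abs.(B - A)) / (n * n)       # same average-error metric as test_fpu
end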

# === Selected Implementations ===

# Standard Serial Gauss-Jordan Elimination based Inverse
# Included for comparison with custom threaded methods.
function gauss_jordan_inverse_serial!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A) # Work on a copy of A
    I_matrix = Matrix{Float64}(I, n, n) # Start with an identity matrix

    @inbounds for k in 1:n
        # Partial Pivoting
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            # Swap rows in both B and I_matrix
            B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
            I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
        end

        pivot_element = B[k, k]
        if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
            error("Matrix is numerically singular at step $k (Gauss-Jordan Serial)")
        end

        # Scale the pivot row to make the pivot element 1
        scale = 1.0 / pivot_element
        @inbounds for col in 1:n
            B[k, col] *= scale
            I_matrix[k, col] *= scale
        end

        # Perform elimination for other rows
        for i in 1:n
            if i != k
                factor = B[i, k]
                @inbounds for col in 1:n
                    B[i, col] -= factor * B[k, col]
                    I_matrix[i, col] -= factor * I_matrix[k, col]
                end
            end
        end
    end

    # After reduction, I_matrix holds the inverse
    return I_matrix
end
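
# Hedged consistency sketch (not in the original paste): the serial Gauss-Jordan
# and the thread-safe variant above perform the same elimination, so their results
# should agree to round-off. The helper name check_gauss_variants is illustrative.
function check_gauss_variants(n::Int = 100)
    A = rand(n, n) + n * I
    return maximum(abs.(gauss_jordan_inverse_serial!(copy(A)) - gauss_safe!(copy(A))))
end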

# Multi-threaded Crout factorization and inverse - the best-performing Crout variant
function crout_inverse_threaded(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n) # U is initialized as Identity

    # Crout Factorization (mostly sequential due to dependencies)
    # Parallelizing this part is complex and might not be beneficial for small matrices
    @inbounds for j in 1:n # Loop over columns
        # Compute column j of L
        for i in j:n # Loop over rows in column j
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[i,k] * U[k,j]
            end
            L[i,j] = A[i,j] - sum_val

            # Check for singularity during factorization
            if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
                error("Matrix is numerically singular during Crout factorization at step $j (Threaded)")
            end
        end

        # Compute row j of U (for elements right of the diagonal)
        inv_Ljj = 1.0 / L[j,j]
        for i in j+1:n # Loop over columns in row j (right of diagonal)
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[j,k] * U[k,i]
            end
            U[j,i] = (A[j,i] - sum_val) * inv_Ljj
        end
    end

    # Solve for the inverse column by column (parallelize this step)
    # A * Ainv = I => L*U * Ainv = I. Solve L*Y = I for Y, then U*Ainv = Y for Ainv.
    # We can solve for each column of Ainv independently.
    Ainv = zeros(n, n)
    Threads.@threads for i in 1:n # Parallelize over columns of the identity matrix (and thus Ainv)
        e = zeros(n) # Column of the identity matrix
        e[i] = 1.0

        # Solve L * y = e (Forward Substitution)
        # This part is sequential within each thread
        y = zeros(n)
        @inbounds for row in 1:n
            sum_val = zero(Float64)
            @inbounds for col in 1:row-1
                sum_val += L[row, col] * y[col]
            end
            y[row] = (e[row] - sum_val) / L[row, row]
        end

        # Solve U * x = y (Backward Substitution)
        # This part is sequential within each thread
        x = zeros(n) # This will be the i-th column of Ainv
        @inbounds for row in n:-1:1
            sum_val = zero(Float64)
            @inbounds for col in row+1:n
                sum_val += U[row, col] * x[col]
            end
            x[row] = (y[row] - sum_val) / U[row, row]
        end
        Ainv[:,i] .= x # Store the computed column of the inverse
    end
    return Ainv
end
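
# Hedged cross-check (not in the original paste): the threaded loop above solves
# A * Ainv = I column by column via forward/backward substitution; the built-in LU
# factorization provides a convenient reference for the same computation. The
# helper name check_crout_threaded is illustrative.
function check_crout_threaded(n::Int = 200)
    A = rand(n, n) + n * I
    Ainv_ref = lu(A) \ Matrix{Float64}(I, n, n)   # reference inverse via LAPACK LU
    return maximum(abs.(crout_inverse_threaded(A) - Ainv_ref))
end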

# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")

    # Create a copy since we modify in-place
    A_copy = copy(A)

    # Step 1: LU factorization (getrf!)
    result = LinearAlgebra.LAPACK.getrf!(A_copy)
    A_lu = result[1]
    ipiv = result[2]
    info_getrf = result[3]

    if info_getrf > 0
        error("Matrix is singular in getrf! at position $info_getrf")
    elseif info_getrf < 0
        error("Invalid argument in getrf! at position $(-info_getrf)")
    end

    # Step 2: Matrix inversion (getri!)
    A_inv = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
    return A_inv
end
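
# For reference (a hedged sketch, not from the original paste): for a dense
# Float64 matrix, Julia's own inv(A) also works through an LU factorization and
# LAPACK, so lapack_inverse should match it closely. The helper name
# check_lapack_inverse is illustrative.
function check_lapack_inverse(n::Int = 200)
    A = rand(n, n) + n * I
    return maximum(abs.(lapack_inverse(A) - inv(A)))
end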


function test_fpu()
    # Constants
    smallsize = 250
    smallits = 1000
    bigsize = 2000

    println("Benchmark running, hopefully as only ACTIVE task")
    println("Current time: ", now())

    # Warm-up runs
    warmup = rand(10,10)
    gauss_safe!(copy(warmup))
    crout_inverse_threaded_optimized(copy(warmup))

    # Initialize random number pools
    Random.seed!(1234)  # Fixed seed for reproducibility
    pool = rand(smallsize, smallsize, smallits)
    pool3 = rand(bigsize, bigsize)

    # Working matrices
    a = zeros(smallsize, smallsize)
    a3 = zeros(bigsize, bigsize)
    c  = zeros(smallsize, smallsize)

    # Timing results
    dt = zeros(4)

    for n in 1:4
        clock1 = time_ns()

        if n == 1
            # Test 1: Optimized Gauss inversion
            # Parallelize across matrices (thread-safe)
            Threads.@threads for i in 1:smallits
                local_a = copy(@view pool[:, :, i])
                local_a = gauss_safe!(local_a)  # invert
                local_a = gauss_safe!(local_a)  # invert back

                if i == smallits
                    a .= local_a  # Only last iteration for error check
                end
            end

            # Calculate error (compare to original)
            avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test1: Gauss  - %d (%dx%d) inverts in %.3f seconds  Err= %.16e\n",
                    smallits, smallsize, smallsize, dt[n], avg_err)

        elseif n == 2
            # Test 2: Crout inversion (crout_inverse_threaded_optimized)
            Threads.@threads for i in 1:smallits
                local_a = copy(@view pool[:, :, i])
                local_a = crout_inverse_threaded_optimized(local_a)  # invert
                local_a = crout_inverse_threaded_optimized(local_a)  # invert back

                if i == smallits
                    a .= local_a  # Only last iteration for error check
                end
            end

            # Calculate error (compare to original)
            avg_err = sum(abs.(a .- pool[:, :, smallits]))/(smallsize*smallsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test2: Crout  - %d (%dx%d) inverts in %.3f seconds  Err= %.16e\n",
                    smallits, smallsize, smallsize, dt[n], avg_err)

        elseif n == 3
            # Test 3: Crout inversion, large matrix
            a3 = copy(pool3)
            a3 = crout_inverse_threaded(a3)  # invert
            a3 = crout_inverse_threaded(a3)  # invert back

            # Calculate error (compare to original)
            avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test3: Crout  - %d (%dx%d)  inverts in %.3f seconds  Err= %.16e\n",
                    2, bigsize, bigsize, dt[n], avg_err)

        elseif n == 4
            # Test 4: LAPACK inversion, large matrix
            a3 = copy(pool3)
            a3 = lapack_inverse(a3)  # invert
            a3 = lapack_inverse(a3)  # invert back

            # Calculate error (compare to original)
            avg_err = sum(abs.(a3 .- pool3))/(bigsize*bigsize)
            clock2 = time_ns()
            dt[n] = (clock2 - clock1)/1e9
            @printf("Test4: Lapack - %d (%dx%d)  inverts in %.3f seconds  Err= %.16e\n",
                    2, bigsize, bigsize, dt[n], avg_err)
        end
    end

    @printf("                             Total = %.1f sec\n", sum(dt))
    println()
end


# Run the benchmark
test_fpu()
Comments
  • realbabilu
Julia version of the multithreaded TEST_CPU benchmark from the Polyhedron Fortran Benchmark suite (fortran.uk), whose reference implementation is single-threaded (https://openbenchmarking.org/test/pts/polyhedron&eval=111335e02dea1589bab7a9564f8efef17d907dcb#metrics). Original Fortran implementations by: David Frank [email protected]; original Gauss: Tim Prince [email protected]; Crout: James Van Buskirk [email protected], Jos Bergervoet [email protected]. This Julia port runs: Gauss single-threaded with the repetition loop parallelized (OpenMP-style), Crout single-threaded with the repetition loop parallelized, Crout multi-threaded, and an inversion that calls the vendor BLAS/LAPACK instead of a hand-built routine. By default, matching the Phoronix setup, it performs 1000 repetitions of double-inverting a 250x250 matrix plus one double inversion of a 2000x2000 matrix, and reports the difference from the original matrix.
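
The paste above does not itself print the "Julia Threads" / "BLAS Threads" lines that appear in the log below; a minimal sketch that would produce them (an assumption, not code from the paste) is:

    using LinearAlgebra
    println("Julia Threads : ", Threads.nthreads())
    println("BLAS Threads : ", BLAS.get_num_threads())
    println("BLAS backend libraries:")
    println(BLAS.get_config())

run with threads enabled, e.g. julia -t auto test_fpu5.jl as in the log.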

Example: Ryzen 3900X
C:\Temp>julia -t auto -O3 test_fpu5.jl.txt
Benchmark running, hopefully as only ACTIVE task
Julia Threads : 24
BLAS Threads : 12
BLAS backend libraries:
LBTConfig([ILP64] mkl_rt.2.dll, [LP64] mkl_rt.2.dll)
Test1: Gauss mt- 1000 (250x250) inverts in 6.529 seconds Err= 4.2673054395647724e-15
Test2: Crout mt- 1000 (250x250) inverts in 2.347 seconds Err= 1.5946885695561353e-12
Test3: Crout mt- 2 (2000x2000) inverts in 7.336 seconds Err= 5.3543682559395547e-10
Test4: Lapack - 2 (2000x2000) inverts in 0.200 seconds Err= 5.2270673897888225e-13
Total = 16.4 sec