Advertisement
realbabilu

Untitled

May 21st, 2025
19
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Julia 123.86 KB | Source Code | 0 0
  1. using Libdl
  2. using LinearAlgebra # For BlasInt, I, and general linear algebra types
  3.  
  4. # --- Define Library Paths ---
  5. # IMPORTANT: Ensure these paths are correct for your system.
  6. # Replace with your actual AOCL installation paths.
  7. const LIBBLIS_ILP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-blis\\lib\\ILP64\\AOCL-LibBlis-Win-MT-dll.dll"
  8. const LIBBLIS_LP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-blis\\lib\\LP64\\AOCL-LibBlis-Win-MT-dll.dll"
  9.  
  10. const LIBFLAME_ILP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-libflame\\lib\\ILP64\\AOCL-LibFlame-Win-MT-dll.dll"
  11. const LIBFLAME_LP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-libflame\\lib\\LP64\\AOCL-LibFlame-Win-MT-dll.dll"
  12.  
  13. # --- Global Library Handles ---
  14. # These will hold the handles to the loaded DLLs.
  15. # Using `global` allows them to be modified within `try-catch` blocks.
  16. global blis_ilp64_handle = C_NULL
  17. global blis_lp64_handle = C_NULL
  18. global flame_ilp64_handle = C_NULL
  19. global flame_lp64_handle = C_NULL
  20.  
  21. # Function to load a library safely
  22. function load_library(path::String)
  23.     handle = C_NULL
  24.     try
  25.         handle = Libdl.dlopen(path)
  26.         println("Successfully loaded library: $path") # Keep this for loading feedback
  27.     catch e
  28.         @error "Could not load library: $path. Please ensure the path is correct and dependencies are met." exception=(e, catch_backtrace())
  29.     end
  30.     return handle
  31. end
  32.  
  33. # Load all necessary libraries at startup
  34. blis_ilp64_handle = load_library(LIBBLIS_ILP64_PATH)
  35. blis_lp64_handle = load_library(LIBBLIS_LP64_PATH)
  36. flame_ilp64_handle = load_library(LIBFLAME_ILP64_PATH)
  37. flame_lp64_handle = load_library(LIBFLAME_LP64_PATH)
  38.  
  39. # --- Helper for BLAS/LAPACK function calls ---
  40. # This function is a generic dispatcher for ccall.
  41. # It checks if the library handle is valid before attempting to call the function.
  42. function call_external_function(lib_handle, func_name::Symbol, ret_type::DataType, arg_types::Tuple, args...)
  43.     if lib_handle === C_NULL
  44.         error("Library for function :$func_name is not loaded.")
  45.     end
  46.     func_ptr = Libdl.dlsym(lib_handle, func_name)
  47.     if func_ptr == C_NULL
  48.         error("Function :$func_name not found in library.")
  49.     end
  50.     return ccall(func_ptr, ret_type, arg_types, args...)
  51. end
  52.  
  53. # --- BLAS Level 1 Functions (Vector-Vector Operations) ---
  54.  
  55. # AXPY: y = alpha*x + y
  56. # s/daxpy (single/double precision)
  57. function aocl_saxpy!(y::Vector{Float32}, alpha::Float32, x::Vector{Float32})
  58.     n = length(x)
  59.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  60.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  61.     call_external_function(handle, :saxpy_, Cvoid,
  62.                            (Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  63.                            BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1))
  64.     return y
  65. end
  66.  
  67. function aocl_daxpy!(y::Vector{Float64}, alpha::Float64, x::Vector{Float64})
  68.     n = length(x)
  69.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  70.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  71.     call_external_function(handle, :daxpy_, Cvoid,
  72.                            (Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  73.                            BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1))
  74.     return y
  75. end
  76.  
  77. # DOT: dot product of two vectors
  78. # s/ddot (single/double precision)
  79. function aocl_sdot(x::Vector{Float32}, y::Vector{Float32})
  80.     n = length(x)
  81.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  82.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  83.     result = Ref{Float32}(0.0)
  84.     call_external_function(handle, :sdot_, Cvoid,
  85.                            (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}),
  86.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), result)
  87.     return result[]
  88. end
  89.  
  90. function aocl_ddot(x::Vector{Float64}, y::Vector{Float64})
  91.     n = length(x)
  92.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  93.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  94.     result = Ref{Float64}(0.0)
  95.     call_external_function(handle, :ddot_, Cvoid,
  96.                            (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}),
  97.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), result)
  98.     return result[]
  99. end
  100.  
  101. # SCAL: x = alpha*x
  102. # s/dscal (single/double precision)
  103. function aocl_sscal!(x::Vector{Float32}, alpha::Float32)
  104.     n = length(x)
  105.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  106.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  107.     call_external_function(handle, :sscal_, Cvoid,
  108.                            (Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  109.                            BlasIntType(n), alpha, x, BlasIntType(1))
  110.     return x
  111. end
  112.  
  113. function aocl_dscal!(x::Vector{Float64}, alpha::Float64)
  114.     n = length(x)
  115.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  116.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  117.     call_external_function(handle, :dscal_, Cvoid,
  118.                            (Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  119.                            BlasIntType(n), alpha, x, BlasIntType(1))
  120.     return x
  121. end
  122.  
  123. # SWAP: swap two vectors
  124. # s/dswap (single/double precision)
  125. function aocl_sswap!(x::Vector{Float32}, y::Vector{Float32})
  126.     n = length(x)
  127.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  128.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  129.     call_external_function(handle, :sswap_, Cvoid,
  130.                            (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  131.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
  132.     return x, y
  133. end
  134.  
  135. function aocl_dswap!(x::Vector{Float64}, y::Vector{Float64})
  136.     n = length(x)
  137.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  138.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  139.     call_external_function(handle, :dswap_, Cvoid,
  140.                            (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  141.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
  142.     return x, y
  143. end
  144.  
  145. # NRM2: Euclidean norm of a vector
  146. # s/dnrm2 (single/double precision)
  147. function aocl_snrm2(x::Vector{Float32})
  148.     n = length(x)
  149.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  150.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  151.     return call_external_function(handle, :snrm2_, Float32,
  152.                                   (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  153.                                   BlasIntType(n), x, BlasIntType(1))
  154. end
  155.  
  156. function aocl_dnrm2(x::Vector{Float64})
  157.     n = length(x)
  158.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  159.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  160.     return call_external_function(handle, :dnrm2_, Float64,
  161.                                   (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  162.                                   BlasIntType(n), x, BlasIntType(1))
  163. end
  164.  
  165. # ASUM: sum of absolute values
  166. # s/dasum (single/double precision)
  167. function aocl_sasum(x::Vector{Float32})
  168.     n = length(x)
  169.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  170.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  171.     return call_external_function(handle, :sasum_, Float32,
  172.                                   (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  173.                                   BlasIntType(n), x, BlasIntType(1))
  174. end
  175.  
  176. function aocl_dasum(x::Vector{Float64})
  177.     n = length(x)
  178.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  179.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  180.     return call_external_function(handle, :dasum_, Float64,
  181.                                   (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  182.                                   BlasIntType(n), x, BlasIntType(1))
  183. end
  184.  
  185. # IAMIN: index of element with smallest absolute value
  186. # is/idamin (single/double precision)
  187. function aocl_isamin(x::Vector{Float32})
  188.     n = length(x)
  189.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  190.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  191.     # BLAS functions return 1-based index
  192.     return call_external_function(handle, :isamin_, BlasIntType,
  193.                                   (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  194.                                   BlasIntType(n), x, BlasIntType(1))
  195. end
  196.  
  197. function aocl_idamin(x::Vector{Float64})
  198.     n = length(x)
  199.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  200.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  201.     # BLAS functions return 1-based index
  202.     return call_external_function(handle, :idamin_, BlasIntType,
  203.                                   (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  204.                                   BlasIntType(n), x, BlasIntType(1))
  205. end
  206.  
  207. # IAMAX: index of element with largest absolute value
  208. # is/idmax (single/double precision)
  209. function aocl_isamax(x::Vector{Float32})
  210.     n = length(x)
  211.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  212.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  213.     # BLAS functions return 1-based index
  214.     return call_external_function(handle, :isamax_, BlasIntType,
  215.                                   (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  216.                                   BlasIntType(n), x, BlasIntType(1))
  217. end
  218.  
  219. function aocl_idmax(x::Vector{Float64})
  220.     n = length(x)
  221.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  222.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  223.     # BLAS functions return 1-based index
  224.     return call_external_function(handle, :idmax_, BlasIntType,
  225.                                   (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  226.                                   BlasIntType(n), x, BlasIntType(1))
  227. end
  228.  
  229. # ROTG: Generate Givens rotation
  230. # s/drotg (single/double precision)
  231. function aocl_srotg!(a::Ref{Float32}, b::Ref{Float32}, c::Ref{Float32}, s::Ref{Float32})
  232.     handle = blis_lp64_handle # ROTG is typically LP64
  233.     call_external_function(handle, :srotg_, Cvoid,
  234.                            (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}),
  235.                            a, b, c, s)
  236.     return a[], b[], c[], s[]
  237. end
  238.  
  239. function aocl_drotg!(a::Ref{Float64}, b::Ref{Float64}, c::Ref{Float64}, s::Ref{Float64})
  240.     handle = blis_ilp64_handle # ROTG is typically LP64, but using ILP64 handle for consistency
  241.     call_external_function(handle, :drotg_, Cvoid,
  242.                            (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}),
  243.                            a, b, c, s)
  244.     return a[], b[], c[], s[]
  245. end
  246.  
  247. # ROTMG: Generate modified Givens rotation matrix
  248. # s/drotmg (single/double precision)
  249. function aocl_srotmg!(d1::Ref{Float32}, d2::Ref{Float32}, x1::Ref{Float32}, y1::Float32, param::Vector{Float32})
  250.     handle = blis_lp64_handle # ROTMG is typically LP64
  251.     call_external_function(handle, :srotmg_, Cvoid,
  252.                            (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{Float32}, Ptr{Float32}),
  253.                            d1, d2, x1, y1, param)
  254.     return d1[], d2[], x1[], y1, param
  255. end
  256.  
  257. function aocl_drotmg!(d1::Ref{Float64}, d2::Ref{Float64}, x1::Ref{Float64}, y1::Float64, param::Vector{Float64})
  258.     handle = blis_ilp64_handle # ROTMG is typically LP64, but using ILP64 handle for consistency
  259.     call_external_function(handle, :drotmg_, Cvoid,
  260.                            (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Float64}, Ptr{Float64}),
  261.                            d1, d2, x1, y1, param)
  262.     return d1[], d2[], x1[], y1, param
  263. end
  264.  
  265. # ROT: Apply Givens rotation
  266. # s/drot (single/double precision)
  267. function aocl_srot!(x::Vector{Float32}, y::Vector{Float32}, c::Float32, s::Float32)
  268.     n = length(x)
  269.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  270.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  271.     call_external_function(handle, :srot_, Cvoid,
  272.                            (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ref{Float32}),
  273.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), c, s)
  274.     return x, y
  275. end
  276.  
  277. function aocl_drot!(x::Vector{Float64}, y::Vector{Float64}, c::Float64, s::Float64)
  278.     n = length(x)
  279.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  280.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  281.     call_external_function(handle, :drot_, Cvoid,
  282.                            (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ref{Float64}),
  283.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), c, s)
  284.     return x, y
  285. end
  286.  
  287. # COPY: copy one vector to another
  288. # s/dcopy (single/double precision)
  289. function aocl_scopy!(y::Vector{Float32}, x::Vector{Float32})
  290.     n = length(x)
  291.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  292.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  293.     call_external_function(handle, :scopy_, Cvoid,
  294.                            (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  295.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
  296.     return y
  297. end
  298.  
  299. function aocl_dcopy!(y::Vector{Float64}, x::Vector{Float64})
  300.     n = length(x)
  301.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  302.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  303.     call_external_function(handle, :dcopy_, Cvoid,
  304.                            (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  305.                            BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
  306.     return y
  307. end
  308.  
  309.  
  310. # --- BLAS Level 2 Functions (Matrix-Vector Operations) ---
  311.  
  312. # GEMV: y = alpha*A*x + beta*y
  313. # s/dgemv (single/double precision)
  314. function aocl_sgemv!(trans::Char, A::Matrix{Float32}, x::Vector{Float32}, y::Vector{Float32}, alpha::Float32, beta::Float32)
  315.     m, n = size(A)
  316.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  317.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  318.     lda = BlasIntType(max(1, stride(A,2)))
  319.     call_external_function(handle, :sgemv_, Cvoid,
  320.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  321.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  322.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  323.                            trans, BlasIntType(m), BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
  324.     return y
  325. end
  326.  
  327. function aocl_dgemv!(trans::Char, A::Matrix{Float64}, x::Vector{Float64}, y::Vector{Float64}, alpha::Float64, beta::Float64)
  328.     m, n = size(A)
  329.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  330.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  331.     lda = BlasIntType(max(1, stride(A,2)))
  332.     call_external_function(handle, :dgemv_, Cvoid,
  333.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  334.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  335.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  336.                            trans, BlasIntType(m), BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
  337.     return y
  338. end
  339.  
  340. # GER: A = alpha*x*y' + A (Rank-1 update)
  341. # s/dger (single/double precision)
  342. function aocl_sger!(A::Matrix{Float32}, alpha::Float32, x::Vector{Float32}, y::Vector{Float32})
  343.     m, n = size(A)
  344.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  345.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  346.     lda = BlasIntType(max(1, stride(A,2)))
  347.     call_external_function(handle, :sger_, Cvoid,
  348.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  349.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  350.                             Ptr{Float32}, Ref{BlasIntType}),
  351.                            BlasIntType(m), BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
  352.     return A
  353. end
  354.  
  355. function aocl_dger!(A::Matrix{Float64}, alpha::Float64, x::Vector{Float64}, y::Vector{Float64})
  356.     m, n = size(A)
  357.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  358.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  359.     lda = BlasIntType(max(1, stride(A,2)))
  360.     call_external_function(handle, :dger_, Cvoid,
  361.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  362.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  363.                             Ptr{Float64}, Ref{BlasIntType}),
  364.                            BlasIntType(m), BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
  365.     return A
  366. end
  367.  
  368. # TRMV: x = A*x or x = A'*x (Triangular Matrix-Vector Multiply)
  369. # s/dtrmv (single/double precision)
  370. function aocl_strmv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float32}, x::Vector{Float32})
  371.     n = size(A, 1)
  372.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  373.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  374.     lda = BlasIntType(max(1, stride(A,2)))
  375.     call_external_function(handle, :strmv_, Cvoid,
  376.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  377.                             Ptr{Float32}, Ref{BlasIntType}),
  378.                            uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
  379.     return x
  380. end
  381.  
  382. function aocl_dtrmv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float64}, x::Vector{Float64})
  383.     n = size(A, 1)
  384.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  385.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  386.     lda = BlasIntType(max(1, stride(A,2)))
  387.     call_external_function(handle, :dtrmv_, Cvoid,
  388.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  389.                             Ptr{Float64}, Ref{BlasIntType}),
  390.                            uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
  391.     return x
  392. end
  393.  
  394. # TRSV: Solve A*x = b or A'*x = b (Triangular Solve)
  395. # s/dtrsv (single/double precision)
  396. function aocl_strsv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float32}, x::Vector{Float32})
  397.     n = size(A, 1)
  398.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  399.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  400.     lda = BlasIntType(max(1, stride(A,2)))
  401.     call_external_function(handle, :strsv_, Cvoid,
  402.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  403.                             Ptr{Float32}, Ref{BlasIntType}),
  404.                            uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
  405.     return x
  406. end
  407.  
  408. function aocl_dtrsv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float64}, x::Vector{Float64})
  409.     n = size(A, 1)
  410.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  411.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  412.     lda = BlasIntType(max(1, stride(A,2)))
  413.     call_external_function(handle, :dtrsv_, Cvoid,
  414.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  415.                             Ptr{Float64}, Ref{BlasIntType}),
  416.                            uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
  417.     return x
  418. end
  419.  
  420. # SYMV: y = alpha*A*x + beta*y (Symmetric Matrix-Vector Multiply)
  421. # s/dsymv (single/double precision)
  422. function aocl_ssymv!(uplo::Char, A::Matrix{Float32}, x::Vector{Float32}, y::Vector{Float32}, alpha::Float32, beta::Float32)
  423.     n = size(A, 1)
  424.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  425.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  426.     lda = BlasIntType(max(1, stride(A,2)))
  427.     call_external_function(handle, :ssymv_, Cvoid,
  428.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
  429.                             Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  430.                            uplo, BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
  431.     return y
  432. end
  433.  
  434. function aocl_dsymv!(uplo::Char, A::Matrix{Float64}, x::Vector{Float64}, y::Vector{Float64}, alpha::Float64, beta::Float64)
  435.     n = size(A, 1)
  436.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  437.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  438.     lda = BlasIntType(max(1, stride(A,2)))
  439.     call_external_function(handle, :dsymv_, Cvoid,
  440.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
  441.                             Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  442.                            uplo, BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
  443.     return y
  444. end
  445.  
  446. # SYR: A = alpha*x*x' + A (Rank-1 update of a symmetric matrix)
  447. # s/dsyr (single/double precision)
  448. function aocl_ssyr!(uplo::Char, A::Matrix{Float32}, alpha::Float32, x::Vector{Float32})
  449.     n = size(A, 1)
  450.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  451.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  452.     lda = BlasIntType(max(1, stride(A,2)))
  453.     call_external_function(handle, :ssyr_, Cvoid,
  454.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
  455.                             Ptr{Float32}, Ref{BlasIntType}),
  456.                            uplo, BlasIntType(n), alpha, x, BlasIntType(1), A, lda)
  457.     return A
  458. end
  459.  
  460. function aocl_dsyr!(uplo::Char, A::Matrix{Float64}, alpha::Float64, x::Vector{Float64})
  461.     n = size(A, 1)
  462.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  463.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  464.     lda = BlasIntType(max(1, stride(A,2)))
  465.     call_external_function(handle, :dsyr_, Cvoid,
  466.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
  467.                             Ptr{Float64}, Ref{BlasIntType}),
  468.                            uplo, BlasIntType(n), alpha, x, BlasIntType(1), A, lda)
  469.     return A
  470. end
  471.  
  472. # SYR2: A = alpha*x*y' + alpha*y*x' + A (Rank-2 update of a symmetric matrix)
  473. # s/dsyr2 (single/double precision)
  474. function aocl_ssyr2!(uplo::Char, A::Matrix{Float32}, alpha::Float32, x::Vector{Float32}, y::Vector{Float32})
  475.     n = size(A, 1)
  476.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  477.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  478.     lda = BlasIntType(max(1, stride(A,2)))
  479.     call_external_function(handle, :ssyr2_, Cvoid,
  480.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
  481.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  482.                            uplo, BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
  483.     return A
  484. end
  485.  
  486. function aocl_dsyr2!(uplo::Char, A::Matrix{Float64}, alpha::Float64, x::Vector{Float64}, y::Vector{Float64})
  487.     n = size(A, 1)
  488.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  489.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  490.     lda = BlasIntType(max(1, stride(A,2)))
  491.     call_external_function(handle, :dsyr2_, Cvoid,
  492.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
  493.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  494.                            uplo, BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
  495.     return A
  496. end
  497.  
  498. # GBMV: y = alpha*A*x + beta*y (General Band Matrix-Vector product)
  499. # s/dgbmv (single/double precision)
  500. function aocl_sgbmv!(trans::Char, m::Int, n::Int, kl::Int, ku::Int, alpha::Float32, A::Matrix{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
  501.     handle = max(m, n) > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  502.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  503.     lda = BlasIntType(size(A, 1)) # LDA for banded matrix is KL + KU + 1
  504.     incx = BlasIntType(1)
  505.     incy = BlasIntType(1)
  506.     call_external_function(handle, :sgbmv_, Cvoid,
  507.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  508.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  509.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  510.                            trans, BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku), alpha,
  511.                            A, lda, x, incx, beta, y, incy)
  512.     return y
  513. end
  514.  
  515. function aocl_dgbmv!(trans::Char, m::Int, n::Int, kl::Int, ku::Int, alpha::Float64, A::Matrix{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
  516.     handle = max(m, n) > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  517.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  518.     lda = BlasIntType(size(A, 1)) # LDA for banded matrix is KL + KU + 1
  519.     incx = BlasIntType(1)
  520.     incy = BlasIntType(1)
  521.     call_external_function(handle, :dgbmv_, Cvoid,
  522.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  523.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  524.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  525.                            trans, BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku), alpha,
  526.                            A, lda, x, incx, beta, y, incy)
  527.     return y
  528. end
  529.  
  530. # SBMV: y = alpha*A*x + beta*y (Symmetric Band Matrix-Vector product)
  531. # s/dsbmv (single/double precision)
  532. function aocl_ssbmv!(uplo::Char, n::Int, k::Int, alpha::Float32, A::Matrix{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
  533.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  534.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  535.     lda = BlasIntType(size(A, 1)) # LDA for symmetric band matrix is K + 1
  536.     incx = BlasIntType(1)
  537.     incy = BlasIntType(1)
  538.     call_external_function(handle, :ssbmv_, Cvoid,
  539.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  540.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  541.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  542.                            uplo, BlasIntType(n), BlasIntType(k), alpha, A, lda, x, incx, beta, y, incy)
  543.     return y
  544. end
  545.  
  546. function aocl_dsbmv!(uplo::Char, n::Int, k::Int, alpha::Float64, A::Matrix{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
  547.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  548.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  549.     lda = BlasIntType(size(A, 1)) # LDA for symmetric band matrix is K + 1
  550.     incx = BlasIntType(1)
  551.     incy = BlasIntType(1)
  552.     call_external_function(handle, :dsbmv_, Cvoid,
  553.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  554.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  555.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  556.                            uplo, BlasIntType(n), BlasIntType(k), alpha, A, lda, x, incx, beta, y, incy)
  557.     return y
  558. end
  559.  
  560. # SPMV: y = alpha*A*x + beta*y (Symmetric Packed Matrix-Vector product)
  561. # s/dspmv (single/double precision)
  562. function aocl_sspmv!(uplo::Char, n::Int, alpha::Float32, AP::Vector{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
  563.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  564.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  565.     incx = BlasIntType(1)
  566.     incy = BlasIntType(1)
  567.     call_external_function(handle, :sspmv_, Cvoid,
  568.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  569.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  570.                            uplo, BlasIntType(n), alpha, AP, x, incx, beta, y, incy)
  571.     return y
  572. end
  573.  
  574. function aocl_dspmv!(uplo::Char, n::Int, alpha::Float64, AP::Vector{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
  575.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  576.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  577.     incx = BlasIntType(1)
  578.     incy = BlasIntType(1)
  579.     call_external_function(handle, :dspmv_, Cvoid,
  580.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  581.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  582.                            uplo, BlasIntType(n), alpha, AP, x, incx, beta, y, incy)
  583.     return y
  584. end
  585.  
  586. # TBMV: x = A*x or x = A'*x (Triangular Band Matrix-Vector Multiply)
  587. # s/dtbmv (single/double precision)
  588. function aocl_stbmv!(uplo::Char, trans::Char, diag::Char, n::Int, k::Int, A::Matrix{Float32}, x::Vector{Float32})
  589.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  590.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  591.     lda = BlasIntType(size(A, 1)) # LDA for triangular band matrix is K + 1
  592.     incx = BlasIntType(1)
  593.     call_external_function(handle, :stbmv_, Cvoid,
  594.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  595.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  596.                            uplo, trans, diag, BlasIntType(n), BlasIntType(k), A, lda, x, incx)
  597.     return x
  598. end
  599.  
  600. function aocl_dtbmv!(uplo::Char, trans::Char, diag::Char, n::Int, k::Int, A::Matrix{Float64}, x::Vector{Float64})
  601.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  602.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  603.     lda = BlasIntType(size(A, 1)) # LDA for triangular band matrix is K + 1
  604.     incx = BlasIntType(1)
  605.     call_external_function(handle, :dtbmv_, Cvoid,
  606.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  607.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  608.                            uplo, trans, diag, BlasIntType(n), BlasIntType(k), A, lda, x, incx)
  609.     return x
  610. end
  611.  
  612. # TPMV: x = A*x or x = A'*x (Triangular Packed Matrix-Vector Multiply)
  613. # s/dtpmv (single/double precision)
  614. function aocl_stpmv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float32}, x::Vector{Float32})
  615.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  616.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  617.     incx = BlasIntType(1)
  618.     call_external_function(handle, :stpmv_, Cvoid,
  619.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  620.                            uplo, trans, diag, BlasIntType(n), AP, x, incx)
  621.     return x
  622. end
  623.  
  624. function aocl_dtpmv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float64}, x::Vector{Float64})
  625.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  626.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  627.     incx = BlasIntType(1)
  628.     call_external_function(handle, :dtpmv_, Cvoid,
  629.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  630.                            uplo, trans, diag, BlasIntType(n), AP, x, incx)
  631.     return x
  632. end
  633.  
  634. # TPSV: Solve A*x = b or A'*x = b (Triangular Packed Solve)
  635. # s/dtpsv (single/double precision)
  636. function aocl_stpsv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float32}, x::Vector{Float32})
  637.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  638.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  639.     incx = BlasIntType(1)
  640.     call_external_function(handle, :stpsv_, Cvoid,
  641.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  642.                            uplo, trans, diag, BlasIntType(n), AP, x, incx)
  643.     return x
  644. end
  645.  
  646. function aocl_dtpsv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float64}, x::Vector{Float64})
  647.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  648.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  649.     incx = BlasIntType(1)
  650.     call_external_function(handle, :dtpsv_, Cvoid,
  651.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  652.                            uplo, trans, diag, BlasIntType(n), AP, x, incx)
  653.     return x
  654. end
  655.  
  656. # SPR: A = alpha*x*x' + A (Rank-1 update of a symmetric packed matrix)
  657. # s/dspr (single/double precision)
  658. function aocl_sspr!(uplo::Char, n::Int, alpha::Float32, x::Vector{Float32}, AP::Vector{Float32})
  659.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  660.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  661.     incx = BlasIntType(1)
  662.     call_external_function(handle, :sspr_, Cvoid,
  663.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}),
  664.                            uplo, BlasIntType(n), alpha, x, incx, AP)
  665.     return AP
  666. end
  667.  
  668. function aocl_dspr!(uplo::Char, n::Int, alpha::Float64, x::Vector{Float64}, AP::Vector{Float64})
  669.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  670.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  671.     incx = BlasIntType(1)
  672.     call_external_function(handle, :dspr_, Cvoid,
  673.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}),
  674.                            uplo, BlasIntType(n), alpha, x, incx, AP)
  675.     return AP
  676. end
  677.  
  678. # SPR2: A = alpha*x*y' + alpha*y*x' + A (Rank-2 update of a symmetric packed matrix)
  679. # s/dspr2 (single/double precision)
  680. function aocl_sspr2!(uplo::Char, n::Int, alpha::Float32, x::Vector{Float32}, y::Vector{Float32}, AP::Vector{Float32})
  681.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  682.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  683.     incx = BlasIntType(1)
  684.     incy = BlasIntType(1)
  685.     call_external_function(handle, :sspr2_, Cvoid,
  686.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
  687.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}),
  688.                            uplo, BlasIntType(n), alpha, x, incx, y, incy, AP)
  689.     return AP
  690. end
  691.  
  692. function aocl_dspr2!(uplo::Char, n::Int, alpha::Float64, x::Vector{Float64}, y::Vector{Float64}, AP::Vector{Float64})
  693.     handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  694.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  695.     incx = BlasIntType(1)
  696.     incy = BlasIntType(1)
  697.     call_external_function(handle, :dspr2_, Cvoid,
  698.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
  699.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}),
  700.                            uplo, BlasIntType(n), alpha, x, incx, y, incy, AP)
  701.     return AP
  702. end
  703.  
  704.  
  705. # --- BLAS Level 3 Functions (Matrix-Matrix Operations) ---
  706.  
  707. # GEMM: C = alpha*A*B + beta*C
  708. # s/dgemm (single/double precision)
  709. function aocl_sgemm!(transa::Char, transb::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
  710.     M, N_ = size(C)
  711.     K = (transa == 'N' || transa == 'n') ? size(A, 2) : size(A, 1)
  712.     handle = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  713.     BlasIntType = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? Int64 : Int32
  714.     m = BlasIntType(M)
  715.     n = BlasIntType(N_)
  716.     k = BlasIntType(K)
  717.     lda = BlasIntType(max(1, stride(A,2)))
  718.     ldb = BlasIntType(max(1, stride(B,2)))
  719.     ldc = BlasIntType(max(1, stride(C,2)))
  720.  
  721.     call_external_function(handle, :sgemm_, Cvoid,
  722.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  723.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  724.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  725.                            transa, transb, m, n, k,
  726.                            alpha, A, lda, B, ldb,
  727.                            beta, C, ldc)
  728.     return C
  729. end
  730.  
  731. function aocl_dgemm!(transa::Char, transb::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
  732.     M, N_ = size(C)
  733.     K = (transa == 'N' || transa == 'n') ? size(A, 2) : size(A, 1)
  734.     handle = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  735.     BlasIntType = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? Int64 : Int32
  736.     m = BlasIntType(M)
  737.     n = BlasIntType(N_)
  738.     k = BlasIntType(K)
  739.     lda = BlasIntType(max(1, stride(A,2)))
  740.     ldb = BlasIntType(max(1, stride(B,2)))
  741.     ldc = BlasIntType(max(1, stride(C,2)))
  742.  
  743.     call_external_function(handle, :dgemm_, Cvoid,
  744.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  745.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  746.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  747.                            transa, transb, m, n, k,
  748.                            alpha, A, lda, B, ldb,
  749.                            beta, C, ldc)
  750.     return C
  751. end
  752.  
  753. # TRMM: B = alpha*A*B or B = alpha*B*A (Triangular Matrix-Matrix Multiply)
  754. # s/dtrmm (single/double precision)
  755. function aocl_strmm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32)
  756.     m, n = size(B)
  757.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  758.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  759.     lda = BlasIntType(max(1, stride(A,2)))
  760.     ldb = BlasIntType(max(1, stride(B,2)))
  761.     call_external_function(handle, :strmm_, Cvoid,
  762.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  763.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  764.                            side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
  765.                            alpha, A, lda, B, ldb)
  766.     return B
  767. end
  768.  
  769. function aocl_dtrmm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64)
  770.     m, n = size(B)
  771.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  772.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  773.     lda = BlasIntType(max(1, stride(A,2)))
  774.     ldb = BlasIntType(max(1, stride(B,2)))
  775.     call_external_function(handle, :dtrmm_, Cvoid,
  776.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  777.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  778.                            side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
  779.                            alpha, A, lda, B, ldb)
  780.     return B
  781. end
  782.  
  783. # TRSM: Solve A*X = B or X*A = B (Triangular Solve for multiple right-hand sides)
  784. # s/dtrsm (single/double precision)
  785. function aocl_strsm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32)
  786.     m, n = size(B)
  787.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  788.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  789.     lda = BlasIntType(max(1, stride(A,2)))
  790.     ldb = BlasIntType(max(1, stride(B,2)))
  791.     call_external_function(handle, :strsm_, Cvoid,
  792.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  793.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
  794.                            side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
  795.                            alpha, A, lda, B, ldb)
  796.     return B
  797. end
  798.  
  799. function aocl_dtrsm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64)
  800.     m, n = size(B)
  801.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  802.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  803.     lda = BlasIntType(max(1, stride(A,2)))
  804.     ldb = BlasIntType(max(1, stride(B,2)))
  805.     call_external_function(handle, :dtrsm_, Cvoid,
  806.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  807.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
  808.                            side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
  809.                            alpha, A, lda, B, ldb)
  810.     return B
  811. end
  812.  
  813. # SYMM: C = alpha*A*B + beta*C (Symmetric Matrix-Matrix Multiply)
  814. # s/dsymm (single/double precision)
  815. function aocl_ssymm!(side::Char, uplo::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
  816.     m, n = size(C)
  817.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  818.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  819.     lda = BlasIntType(max(1, stride(A,2)))
  820.     ldb = BlasIntType(max(1, stride(B,2)))
  821.     ldc = BlasIntType(max(1, stride(C,2)))
  822.     call_external_function(handle, :ssymm_, Cvoid,
  823.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  824.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  825.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  826.                            side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
  827.     return C
  828. end
  829.  
  830. function aocl_dsymm!(side::Char, uplo::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
  831.     m, n = size(C)
  832.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  833.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  834.     lda = BlasIntType(max(1, stride(A,2)))
  835.     ldb = BlasIntType(max(1, stride(B,2)))
  836.     ldc = BlasIntType(max(1, stride(C,2)))
  837.     call_external_function(handle, :dsymm_, Cvoid,
  838.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  839.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  840.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  841.                            side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
  842.     return C
  843. end
  844.  
  845. # SYRK: C = alpha*A*A' + beta*C (Symmetric Rank-k update)
  846. # s/dsyrk (single/double precision)
  847. function aocl_ssyrk!(uplo::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32, beta::Float32)
  848.     n = size(C, 1)
  849.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  850.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  851.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  852.     lda = BlasIntType(max(1, stride(A,2)))
  853.     ldc = BlasIntType(max(1, stride(C,2)))
  854.     call_external_function(handle, :ssyrk_, Cvoid,
  855.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  856.                             Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  857.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
  858.     return C
  859. end
  860.  
  861. function aocl_dsyrk!(uplo::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64, beta::Float64)
  862.     n = size(C, 1)
  863.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  864.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  865.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  866.     lda = BlasIntType(max(1, stride(A,2)))
  867.     ldc = BlasIntType(max(1, stride(C,2)))
  868.     call_external_function(handle, :dsyrk_, Cvoid,
  869.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  870.                             Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  871.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
  872.     return C
  873. end
  874.  
  875. # SYR2K: C = alpha*A*B' + alpha*B*A' + beta*C (Symmetric Rank-2k update)
  876. # s/dsyr2k (single/double precision)
  877. function aocl_ssyr2k!(uplo::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
  878.     n = size(C, 1)
  879.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  880.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  881.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  882.     lda = BlasIntType(max(1, stride(A,2)))
  883.     ldb = BlasIntType(max(1, stride(B,2)))
  884.     ldc = BlasIntType(max(1, stride(C,2)))
  885.     call_external_function(handle, :ssyr2k_, Cvoid,
  886.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
  887.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  888.                             Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
  889.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
  890.     return C
  891. end
  892.  
  893. function aocl_dsyr2k!(uplo::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
  894.     n = size(C, 1)
  895.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  896.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  897.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  898.     lda = BlasIntType(max(1, stride(A,2)))
  899.     ldb = BlasIntType(max(1, stride(B,2)))
  900.     ldc = BlasIntType(max(1, stride(C,2)))
  901.     call_external_function(handle, :dsyr2k_, Cvoid,
  902.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  903.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  904.                             Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
  905.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
  906.     return C
  907. end
  908.  
  909. # HEMM: C = alpha*A*B + beta*C (Hermitian Matrix-Matrix Multiply)
  910. # zhemm (double precision complex)
  911. function aocl_zhemm!(side::Char, uplo::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, B::Matrix{ComplexF64}, alpha::ComplexF64, beta::ComplexF64)
  912.     m, n = size(C)
  913.     handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  914.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  915.     lda = BlasIntType(max(1, stride(A,2)))
  916.     ldb = BlasIntType(max(1, stride(B,2)))
  917.     ldc = BlasIntType(max(1, stride(C,2)))
  918.     call_external_function(handle, :zhemm_, Cvoid,
  919.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{ComplexF64},
  920.                             Ptr{ComplexF64}, Ref{BlasIntType}, Ptr{ComplexF64}, Ref{BlasIntType},
  921.                             Ref{ComplexF64}, Ptr{ComplexF64}, Ref{BlasIntType}),
  922.                            side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
  923.     return C
  924. end
  925.  
  926. # HERK: C = alpha*A*A' + beta*C (Hermitian Rank-k update)
  927. # zherk (double precision complex)
  928. function aocl_zherk!(uplo::Char, trans::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, alpha::Float64, beta::Float64)
  929.     n = size(C, 1)
  930.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  931.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  932.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  933.     lda = BlasIntType(max(1, stride(A,2)))
  934.     ldc = BlasIntType(max(1, stride(C,2)))
  935.     call_external_function(handle, :zherk_, Cvoid,
  936.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
  937.                             Ptr{ComplexF64}, Ref{BlasIntType}, Ref{Float64}, Ptr{ComplexF64}, Ref{BlasIntType}),
  938.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
  939.     return C
  940. end
  941.  
  942. # HER2K: C = alpha*A*B' + alpha*B*A' + beta*C (Hermitian Rank-2k update)
  943. # zher2k (double precision complex)
  944. function aocl_zher2k!(uplo::Char, trans::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, B::Matrix{ComplexF64}, alpha::ComplexF64, beta::Float64)
  945.     n = size(C, 1)
  946.     k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
  947.     handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
  948.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  949.     lda = BlasIntType(max(1, stride(A,2)))
  950.     ldb = BlasIntType(max(1, stride(B,2)))
  951.     ldc = BlasIntType(max(1, stride(C,2)))
  952.     call_external_function(handle, :zher2k_, Cvoid,
  953.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{ComplexF64},
  954.                             Ptr{ComplexF64}, Ref{BlasIntType}, Ptr{ComplexF64}, Ref{BlasIntType},
  955.                             Ref{Float64}, Ptr{ComplexF64}, Ref{BlasIntType}),
  956.                            uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
  957.     return C
  958. end
  959.  
  960.  
  961. # --- LAPACK Functions ---
  962.  
  963. # GESV: Solve linear system of equations AX = B
  964. # s/dgesv (single/double precision)
  965. function aocl_sgesv!(A::Matrix{Float32}, B::Matrix{Float32})
  966.     n = size(A, 1)
  967.     nrhs = size(B, 2)
  968.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  969.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  970.     lda = BlasIntType(n)
  971.     ldb = BlasIntType(n)
  972.     ipiv = Vector{Int32}(undef, n) # IPIV is typically Int32 even for ILP64
  973.     info = Ref{BlasIntType}(0)
  974.  
  975.     call_external_function(handle, :sgesv_, Cvoid,
  976.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  977.                             Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  978.                            BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
  979.     return A, B, ipiv, info[]
  980. end
  981.  
  982. function aocl_dgesv!(A::Matrix{Float64}, B::Matrix{Float64})
  983.     n = size(A, 1)
  984.     nrhs = size(B, 2)
  985.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  986.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  987.     lda = BlasIntType(n)
  988.     ldb = BlasIntType(n)
  989.     ipiv = Vector{Int32}(undef, n) # IPIV is typically Int32 even for ILP64
  990.     info = Ref{BlasIntType}(0)
  991.  
  992.     call_external_function(handle, :dgesv_, Cvoid,
  993.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  994.                             Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  995.                            BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
  996.     return A, B, ipiv, info[]
  997. end
  998.  
  999. # GETRF: LU factorization
  1000. # s/dgetrf (single/double precision)
  1001. function aocl_sgetrf!(A::Matrix{Float32})
  1002.     m, n = size(A)
  1003.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1004.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1005.     lda = BlasIntType(m)
  1006.     ipiv = Vector{Int32}(undef, min(m, n))
  1007.     info = Ref{BlasIntType}(0)
  1008.  
  1009.     call_external_function(handle, :sgetrf_, Cvoid,
  1010.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1011.                             Ptr{Int32}, Ref{BlasIntType}),
  1012.                            BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
  1013.     return A, ipiv, info[]
  1014. end
  1015.  
  1016. function aocl_dgetrf!(A::Matrix{Float64})
  1017.     m, n = size(A)
  1018.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1019.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1020.     lda = BlasIntType(m)
  1021.     ipiv = Vector{Int32}(undef, min(m, n))
  1022.     info = Ref{BlasIntType}(0)
  1023.  
  1024.     call_external_function(handle, :dgetrf_, Cvoid,
  1025.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1026.                             Ptr{Int32}, Ref{BlasIntType}),
  1027.                            BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
  1028.     return A, ipiv, info[]
  1029. end
  1030.  
  1031. # GETRI: Matrix inversion from LU factorization
  1032. # s/dgetri (single/double precision)
  1033. function aocl_sgetri!(A::Matrix{Float32}, ipiv::Vector{Int32})
  1034.     n = size(A, 1)
  1035.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1036.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1037.     lda = BlasIntType(n)
  1038.     info = Ref{BlasIntType}(0)
  1039.  
  1040.     work = Vector{Float32}(undef, 1) # Placeholder, will be resized by LAPACK if LWORK=-1
  1041.     lwork = BlasIntType(-1) # Query optimal workspace size
  1042.  
  1043.     call_external_function(handle, :sgetri_, Cvoid,
  1044.                            (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32},
  1045.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1046.                            BlasIntType(n), A, lda, ipiv, work, lwork, info)
  1047.  
  1048.     if info[] == 0 && lwork[] == -1 # Check if query was successful
  1049.         lwork = BlasIntType(round(Int, work[1]))
  1050.         work = Vector{Float32}(undef, lwork)
  1051.         call_external_function(handle, :sgetri_, Cvoid,
  1052.                                (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32},
  1053.                                 Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1054.                                BlasIntType(n), A, lda, ipiv, work, lwork, info)
  1055.     end
  1056.     return A, info[]
  1057. end
  1058.  
  1059. function aocl_dgetri!(A::Matrix{Float64}, ipiv::Vector{Int32})
  1060.     n = size(A, 1)
  1061.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1062.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1063.     lda = BlasIntType(n)
  1064.     info = Ref{BlasIntType}(0)
  1065.  
  1066.     work = Vector{Float64}(undef, 1) # Placeholder
  1067.     lwork = BlasIntType(-1) # Query optimal workspace size
  1068.  
  1069.     call_external_function(handle, :dgetri_, Cvoid,
  1070.                            (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32},
  1071.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1072.                            BlasIntType(n), A, lda, ipiv, work, lwork, info)
  1073.  
  1074.     if info[] == 0 && lwork[] == -1
  1075.         lwork = BlasIntType(round(Int, work[1]))
  1076.         work = Vector{Float64}(undef, lwork)
  1077.         call_external_function(handle, :dgetri_, Cvoid,
  1078.                                (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32},
  1079.                                 Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1080.                                BlasIntType(n), A, lda, ipiv, work, lwork, info)
  1081.     end
  1082.     return A, info[]
  1083. end
  1084.  
  1085. # GETRS: Solve system of linear equations using LU factorization
  1086. # s/dgetrs (single/double precision)
  1087. function aocl_sgetrs!(trans::Char, A::Matrix{Float32}, B::Matrix{Float32}, ipiv::Vector{Int32})
  1088.     n = size(A, 1)
  1089.     nrhs = size(B, 2)
  1090.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1091.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1092.     lda = BlasIntType(n)
  1093.     ldb = BlasIntType(n)
  1094.     info = Ref{BlasIntType}(0)
  1095.  
  1096.     call_external_function(handle, :sgetrs_, Cvoid,
  1097.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1098.                             Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1099.                            trans, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
  1100.     return B, info[]
  1101. end
  1102.  
  1103. function aocl_dgetrs!(trans::Char, A::Matrix{Float64}, B::Matrix{Float64}, ipiv::Vector{Int32})
  1104.     n = size(A, 1)
  1105.     nrhs = size(B, 2)
  1106.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1107.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1108.     lda = BlasIntType(n)
  1109.     ldb = BlasIntType(n)
  1110.     info = Ref{BlasIntType}(0)
  1111.  
  1112.     call_external_function(handle, :dgetrs_, Cvoid,
  1113.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1114.                             Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1115.                            trans, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
  1116.     return B, info[]
  1117. end
  1118.  
  1119.  
  1120. # GEQRF: QR factorization
  1121. # s/dgeqrf (single/double precision)
  1122. function aocl_sgeqrf!(A::Matrix{Float32})
  1123.     m, n = size(A)
  1124.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1125.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1126.     lda = BlasIntType(m)
  1127.     tau = Vector{Float32}(undef, min(m, n))
  1128.     info = Ref{BlasIntType}(0)
  1129.  
  1130.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1131.     lwork = BlasIntType(-1)
  1132.  
  1133.     call_external_function(handle, :sgeqrf_, Cvoid,
  1134.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1135.                             Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1136.                            BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
  1137.  
  1138.     if info[] == 0 && lwork[] == -1
  1139.         lwork = BlasIntType(round(Int, work[1]))
  1140.         work = Vector{Float32}(undef, lwork)
  1141.         call_external_function(handle, :sgeqrf_, Cvoid,
  1142.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1143.                                 Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1144.                                BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
  1145.     end
  1146.     return A, tau, info[]
  1147. end
  1148.  
  1149. function aocl_dgeqrf!(A::Matrix{Float64})
  1150.     m, n = size(A)
  1151.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1152.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1153.     lda = BlasIntType(m)
  1154.     tau = Vector{Float64}(undef, min(m, n))
  1155.     info = Ref{BlasIntType}(0)
  1156.  
  1157.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1158.     lwork = BlasIntType(-1)
  1159.  
  1160.     call_external_function(handle, :dgeqrf_, Cvoid,
  1161.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1162.                             Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1163.                            BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
  1164.  
  1165.     if info[] == 0 && lwork[] == -1
  1166.         lwork = BlasIntType(round(Int, work[1]))
  1167.         work = Vector{Float64}(undef, lwork)
  1168.         call_external_function(handle, :dgeqrf_, Cvoid,
  1169.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1170.                                 Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1171.                                BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
  1172.     end
  1173.     return A, tau, info[]
  1174. end
  1175.  
  1176. # SYEV: Eigenvalues and (optionally) eigenvectors of a real symmetric matrix
  1177. # s/dsyev (single/double precision)
  1178. function aocl_ssyev!(jobz::Char, uplo::Char, A::Matrix{Float32})
  1179.     n = size(A, 1)
  1180.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1181.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1182.     lda = BlasIntType(n)
  1183.     w = Vector{Float32}(undef, n) # Eigenvalues
  1184.     info = Ref{BlasIntType}(0)
  1185.  
  1186.     # Query optimal workspace size
  1187.     work = Vector{Float32}(undef, 1)
  1188.     lwork = BlasIntType(-1)
  1189.  
  1190.     call_external_function(handle, :ssyev_, Cvoid,
  1191.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1192.                             Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1193.                            jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
  1194.  
  1195.     if info[] == 0 && lwork[] == -1
  1196.         lwork = BlasIntType(round(Int, work[1]))
  1197.         work = Vector{Float32}(undef, lwork)
  1198.         call_external_function(handle, :ssyev_, Cvoid,
  1199.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1200.                                 Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1201.                                jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
  1202.     end
  1203.     return A, w, info[] # A contains eigenvectors if jobz='V'
  1204. end
  1205.  
  1206. function aocl_dsyev!(jobz::Char, uplo::Char, A::Matrix{Float64})
  1207.     n = size(A, 1)
  1208.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1209.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1210.     lda = BlasIntType(n)
  1211.     w = Vector{Float64}(undef, n) # Eigenvalues
  1212.     info = Ref{BlasIntType}(0)
  1213.  
  1214.     # Query optimal workspace size
  1215.     work = Vector{Float64}(undef, 1)
  1216.     lwork = BlasIntType(-1)
  1217.  
  1218.     call_external_function(handle, :dsyev_, Cvoid,
  1219.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1220.                             Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1221.                            jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
  1222.  
  1223.     if info[] == 0 && lwork[] == -1
  1224.         lwork = BlasIntType(round(Int, work[1]))
  1225.         work = Vector{Float64}(undef, lwork)
  1226.         call_external_function(handle, :dsyev_, Cvoid,
  1227.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1228.                                 Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1229.                                jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
  1230.     end
  1231.     return A, w, info[] # A contains eigenvectors if jobz='V'
  1232. end
  1233.  
  1234. # POTRF: Cholesky factorization of a symmetric positive definite matrix
  1235. # s/dpotrf (single/double precision)
  1236. function aocl_spotrf!(uplo::Char, A::Matrix{Float32})
  1237.     n = size(A, 1)
  1238.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1239.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1240.     lda = BlasIntType(n)
  1241.     info = Ref{BlasIntType}(0)
  1242.  
  1243.     call_external_function(handle, :spotrf_, Cvoid,
  1244.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1245.                            uplo, BlasIntType(n), A, lda, info)
  1246.     return A, info[]
  1247. end
  1248.  
  1249. function aocl_dpotrf!(uplo::Char, A::Matrix{Float64})
  1250.     n = size(A, 1)
  1251.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1252.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1253.     lda = BlasIntType(n)
  1254.     info = Ref{BlasIntType}(0)
  1255.  
  1256.     call_external_function(handle, :dpotrf_, Cvoid,
  1257.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1258.                            uplo, BlasIntType(n), A, lda, info)
  1259.     return A, info[]
  1260. end
  1261.  
  1262. # POTRS: Solve A*X = B for symmetric positive definite A using Cholesky factorization
  1263. # s/dpotrs (single/double precision)
  1264. function aocl_spotrs!(uplo::Char, A::Matrix{Float32}, B::Matrix{Float32})
  1265.     n = size(A, 1)
  1266.     nrhs = size(B, 2)
  1267.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1268.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1269.     lda = BlasIntType(n)
  1270.     ldb = BlasIntType(n)
  1271.     info = Ref{BlasIntType}(0)
  1272.  
  1273.     call_external_function(handle, :spotrs_, Cvoid,
  1274.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1275.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1276.                            uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
  1277.     return B, info[]
  1278. end
  1279.  
  1280. function aocl_dpotrs!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
  1281.     n = size(A, 1)
  1282.     nrhs = size(B, 2)
  1283.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1284.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1285.     lda = BlasIntType(n)
  1286.     ldb = BlasIntType(n)
  1287.     info = Ref{BlasIntType}(0)
  1288.  
  1289.     call_external_function(handle, :dpotrs_, Cvoid,
  1290.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1291.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1292.                            uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
  1293.     return B, info[]
  1294. end
  1295.  
  1296. # POTRI: Compute the inverse of a symmetric positive definite matrix using Cholesky factorization
  1297. # s/dpotri (single/double precision)
  1298. function aocl_spotri!(uplo::Char, A::Matrix{Float32})
  1299.     n = size(A, 1)
  1300.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1301.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1302.     lda = BlasIntType(n)
  1303.     info = Ref{BlasIntType}(0)
  1304.  
  1305.     call_external_function(handle, :spotri_, Cvoid,
  1306.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1307.                            uplo, BlasIntType(n), A, lda, info)
  1308.     return A, info[]
  1309. end
  1310.  
  1311. function aocl_dpotri!(uplo::Char, A::Matrix{Float64})
  1312.     n = size(A, 1)
  1313.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1314.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1315.     lda = BlasIntType(n)
  1316.     info = Ref{BlasIntType}(0)
  1317.  
  1318.     call_external_function(handle, :dpotri_, Cvoid,
  1319.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1320.                            uplo, BlasIntType(n), A, lda, info)
  1321.     return A, info[]
  1322. end
  1323.  
  1324. # ORGQR: Generate Q from QR factorization
  1325. # s/dorgqr (single/double precision)
  1326. function aocl_sorgqr!(A::Matrix{Float32}, tau::Vector{Float32})
  1327.     m, n = size(A)
  1328.     k = length(tau)
  1329.     handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1330.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
  1331.     lda = BlasIntType(m)
  1332.     info = Ref{BlasIntType}(0)
  1333.  
  1334.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1335.     lwork = BlasIntType(-1)
  1336.  
  1337.     call_external_function(handle, :sorgqr_, Cvoid,
  1338.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1339.                             Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1340.                            BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
  1341.  
  1342.     if info[] == 0 && lwork[] == -1
  1343.         lwork = BlasIntType(round(Int, work[1]))
  1344.         work = Vector{Float32}(undef, lwork)
  1345.         call_external_function(handle, :sorgqr_, Cvoid,
  1346.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1347.                                 Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1348.                                BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
  1349.     end
  1350.     return A, info[]
  1351. end
  1352.  
  1353. function aocl_dorgqr!(A::Matrix{Float64}, tau::Vector{Float64})
  1354.     m, n = size(A)
  1355.     k = length(tau)
  1356.     handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1357.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
  1358.     lda = BlasIntType(m)
  1359.     info = Ref{BlasIntType}(0)
  1360.  
  1361.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1362.     lwork = BlasIntType(-1)
  1363.  
  1364.     call_external_function(handle, :dorgqr_, Cvoid,
  1365.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1366.                             Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1367.                            BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
  1368.  
  1369.     if info[] == 0 && lwork[] == -1
  1370.         lwork = BlasIntType(round(Int, work[1]))
  1371.         work = Vector{Float64}(undef, lwork)
  1372.         call_external_function(handle, :dorgqr_, Cvoid,
  1373.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1374.                                 Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1375.                                BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
  1376.     end
  1377.     return A, info[]
  1378. end
  1379.  
  1380. # UNMQR: Apply Q from QR factorization to a matrix
  1381. # s/dunmqr (single/double precision)
  1382. function aocl_sunmqr!(side::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, tau::Vector{Float32})
  1383.     m, n = size(C)
  1384.     k = length(tau)
  1385.     handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1386.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
  1387.     lda = BlasIntType(size(A, 1))
  1388.     ldc = BlasIntType(m)
  1389.     info = Ref{BlasIntType}(0)
  1390.  
  1391.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1392.     lwork = BlasIntType(-1)
  1393.  
  1394.     call_external_function(handle, :sunmqr_, Cvoid,
  1395.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  1396.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1397.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1398.                            side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
  1399.                            A, lda, tau, C, ldc, work, lwork, info)
  1400.  
  1401.     if info[] == 0 && lwork[] == -1
  1402.         lwork = BlasIntType(round(Int, work[1]))
  1403.         work = Vector{Float32}(undef, lwork)
  1404.         call_external_function(handle, :sunmqr_, Cvoid,
  1405.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  1406.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1407.                                 Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1408.                                side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
  1409.                                A, lda, tau, C, ldc, work, lwork, info)
  1410.     end
  1411.     return C, info[]
  1412. end
  1413.  
  1414. function aocl_dunmqr!(side::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, tau::Vector{Float64})
  1415.     m, n = size(C)
  1416.     k = length(tau)
  1417.     handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1418.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
  1419.     lda = BlasIntType(size(A, 1))
  1420.     ldc = BlasIntType(m)
  1421.     info = Ref{BlasIntType}(0)
  1422.  
  1423.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1424.     lwork = BlasIntType(-1)
  1425.  
  1426.     call_external_function(handle, :dunmqr_, Cvoid,
  1427.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  1428.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1429.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1430.                            side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
  1431.                            A, lda, tau, C, ldc, work, lwork, info)
  1432.  
  1433.     if info[] == 0 && lwork[] == -1
  1434.         lwork = BlasIntType(round(Int, work[1]))
  1435.         work = Vector{Float64}(undef, lwork)
  1436.         call_external_function(handle, :dunmqr_, Cvoid,
  1437.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  1438.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1439.                                 Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1440.                                side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
  1441.                                A, lda, tau, C, ldc, work, lwork, info)
  1442.     end
  1443.     return C, info[]
  1444. end
  1445.  
  1446. # GESVD: Singular Value Decomposition
  1447. # s/dgesvd (single/double precision)
  1448. function aocl_sgesvd!(jobu::Char, jobvt::Char, A::Matrix{Float32})
  1449.     m, n = size(A)
  1450.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1451.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1452.     lda = BlasIntType(m)
  1453.     s = Vector{Float32}(undef, min(m, n)) # Singular values
  1454.    
  1455.     # Determine dimensions for U and VT based on jobu/jobvt
  1456.     ldu = BlasIntType(1)
  1457.     if jobu == 'A' || jobu == 'a'
  1458.         ldu = BlasIntType(m)
  1459.     elseif jobu == 'S' || jobu == 's'
  1460.         ldu = BlasIntType(m)
  1461.     end
  1462.     U = Matrix{Float32}(undef, ldu, (jobu == 'A' || jobu == 'a') ? m : (jobu == 'S' || jobu == 's' ? min(m, n) : 1))
  1463.  
  1464.     ldvt = BlasIntType(1)
  1465.     if jobvt == 'A' || jobvt == 'a'
  1466.         ldvt = BlasIntType(n)
  1467.     elseif jobvt == 'O' || jobvt == 'o'
  1468.         ldvt = BlasIntType(min(m, n))
  1469.     end
  1470.     VT = Matrix{Float32}(undef, ldvt, n)
  1471.  
  1472.     info = Ref{BlasIntType}(0)
  1473.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1474.     lwork = BlasIntType(-1)
  1475.  
  1476.     call_external_function(handle, :sgesvd_, Cvoid,
  1477.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  1478.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1479.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1480.                            jobu, jobvt, BlasIntType(m), BlasIntType(n),
  1481.                            A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
  1482.  
  1483.     if info[] == 0 && lwork[] == -1
  1484.         lwork = BlasIntType(round(Int, work[1]))
  1485.         work = Vector{Float32}(undef, lwork)
  1486.         call_external_function(handle, :sgesvd_, Cvoid,
  1487.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  1488.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1489.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1490.                                jobu, jobvt, BlasIntType(m), BlasIntType(n),
  1491.                                A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
  1492.     end
  1493.     return U, s, VT, info[]
  1494. end
  1495.  
  1496. function aocl_dgesvd!(jobu::Char, jobvt::Char, A::Matrix{Float64})
  1497.     m, n = size(A)
  1498.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1499.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1500.     lda = BlasIntType(m)
  1501.     s = Vector{Float64}(undef, min(m, n)) # Singular values
  1502.  
  1503.     ldu = BlasIntType(1)
  1504.     if jobu == 'A' || jobu == 'a'
  1505.         ldu = BlasIntType(m)
  1506.     elseif jobu == 'S' || jobu == 's'
  1507.         ldu = BlasIntType(m)
  1508.     end
  1509.     U = Matrix{Float64}(undef, ldu, (jobu == 'A' || jobu == 'a') ? m : (jobu == 'S' || jobu == 's' ? min(m, n) : 1))
  1510.  
  1511.     ldvt = BlasIntType(1)
  1512.     if jobvt == 'A' || jobvt == 'a'
  1513.         ldvt = BlasIntType(n)
  1514.     elseif jobvt == 'O' || jobvt == 'o'
  1515.         ldvt = BlasIntType(min(m, n))
  1516.     end
  1517.     VT = Matrix{Float64}(undef, ldvt, n)
  1518.  
  1519.     info = Ref{BlasIntType}(0)
  1520.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1521.     lwork = BlasIntType(-1)
  1522.  
  1523.     call_external_function(handle, :dgesvd_, Cvoid,
  1524.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  1525.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1526.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1527.                            jobu, jobvt, BlasIntType(m), BlasIntType(n),
  1528.                            A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
  1529.  
  1530.     if info[] == 0 && lwork[] == -1
  1531.         lwork = BlasIntType(round(Int, work[1]))
  1532.         work = Vector{Float64}(undef, lwork)
  1533.         call_external_function(handle, :dgesvd_, Cvoid,
  1534.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
  1535.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1536.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1537.                                jobu, jobvt, BlasIntType(m), BlasIntType(n),
  1538.                                A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
  1539.     end
  1540.     return U, s, VT, info[]
  1541. end
  1542.  
  1543. # LASET: Initializes a matrix to zero or to a constant
  1544. # s/dlaset (single/double precision)
  1545. function aocl_slaset!(uplo::Char, m::Int, n::Int, alpha::Float32, beta::Float32, A::Matrix{Float32})
  1546.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1547.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1548.     lda = BlasIntType(size(A, 1))
  1549.     call_external_function(handle, :slaset_, Cvoid,
  1550.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32}, Ref{Float32},
  1551.                             Ptr{Float32}, Ref{BlasIntType}),
  1552.                            uplo, BlasIntType(m), BlasIntType(n), alpha, beta, A, lda)
  1553.     return A
  1554. end
  1555.  
  1556. function aocl_dlaset!(uplo::Char, m::Int, n::Int, alpha::Float64, beta::Float64, A::Matrix{Float64})
  1557.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1558.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1559.     lda = BlasIntType(size(A, 1))
  1560.     call_external_function(handle, :dlaset_, Cvoid,
  1561.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64}, Ref{Float64},
  1562.                             Ptr{Float64}, Ref{BlasIntType}),
  1563.                            uplo, BlasIntType(m), BlasIntType(n), alpha, beta, A, lda)
  1564.     return A
  1565. end
  1566.  
  1567. # GEEV: Compute eigenvalues and, optionally, eigenvectors of a general matrix
  1568. # s/dgeev (single/double precision)
  1569. function aocl_sgeev!(jobvl::Char, jobvr::Char, A::Matrix{Float32})
  1570.     n = size(A, 1)
  1571.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1572.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1573.     lda = BlasIntType(n)
  1574.     wr = Vector{Float32}(undef, n)
  1575.     wi = Vector{Float32}(undef, n)
  1576.  
  1577.     ldvl = BlasIntType(1)
  1578.     if jobvl == 'V' || jobvl == 'v'
  1579.         ldvl = BlasIntType(n)
  1580.     end
  1581.     VL = Matrix{Float32}(undef, ldvl, n)
  1582.  
  1583.     ldvr = BlasIntType(1)
  1584.     if jobvr == 'V' || jobvr == 'v'
  1585.         ldvr = BlasIntType(n)
  1586.     end
  1587.     VR = Matrix{Float32}(undef, ldvr, n)
  1588.  
  1589.     info = Ref{BlasIntType}(0)
  1590.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1591.     lwork = BlasIntType(-1)
  1592.  
  1593.     call_external_function(handle, :sgeev_, Cvoid,
  1594.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1595.                             Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1596.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1597.                            jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
  1598.  
  1599.     if info[] == 0 && lwork[] == -1
  1600.         lwork = BlasIntType(round(Int, work[1]))
  1601.         work = Vector{Float32}(undef, lwork)
  1602.         call_external_function(handle, :sgeev_, Cvoid,
  1603.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1604.                                 Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
  1605.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1606.                                jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
  1607.     end
  1608.     return wr, wi, VL, VR, info[]
  1609. end
  1610.  
  1611. function aocl_dgeev!(jobvl::Char, jobvr::Char, A::Matrix{Float64})
  1612.     n = size(A, 1)
  1613.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1614.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1615.     lda = BlasIntType(n)
  1616.     wr = Vector{Float64}(undef, n)
  1617.     wi = Vector{Float64}(undef, n)
  1618.  
  1619.     ldvl = BlasIntType(1)
  1620.     if jobvl == 'V' || jobvl == 'v'
  1621.         ldvl = BlasIntType(n)
  1622.     end
  1623.     VL = Matrix{Float64}(undef, ldvl, n)
  1624.  
  1625.     ldvr = BlasIntType(1)
  1626.     if jobvr == 'V' || jobvr == 'v'
  1627.         ldvr = BlasIntType(n)
  1628.     end
  1629.     VR = Matrix{Float64}(undef, ldvr, n)
  1630.  
  1631.     info = Ref{BlasIntType}(0)
  1632.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1633.     lwork = BlasIntType(-1)
  1634.  
  1635.     call_external_function(handle, :dgeev_, Cvoid,
  1636.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1637.                             Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1638.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1639.                            jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
  1640.  
  1641.     if info[] == 0 && lwork[] == -1
  1642.         lwork = BlasIntType(round(Int, work[1]))
  1643.         work = Vector{Float64}(undef, lwork)
  1644.         call_external_function(handle, :dgeev_, Cvoid,
  1645.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1646.                                 Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
  1647.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1648.                                jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
  1649.     end
  1650.     return wr, wi, VL, VR, info[]
  1651. end
  1652.  
  1653. # GELSS: Least Squares solution using SVD
  1654. # s/dgelss (single/double precision)
  1655. function aocl_sgelss!(A::Matrix{Float32}, B::Matrix{Float32}, rcond::Float32)
  1656.     m, n = size(A)
  1657.     nrhs = size(B, 2)
  1658.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1659.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1660.     lda = BlasIntType(m)
  1661.     ldb = BlasIntType(max(m, n))
  1662.     s = Vector{Float32}(undef, min(m, n)) # Singular values
  1663.     rank = Ref{BlasIntType}(0)
  1664.     info = Ref{BlasIntType}(0)
  1665.  
  1666.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1667.     lwork = BlasIntType(-1)
  1668.  
  1669.     call_external_function(handle, :sgelss_, Cvoid,
  1670.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1671.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
  1672.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1673.                            BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1674.                            s, rcond, rank, work, lwork, info)
  1675.  
  1676.     if info[] == 0 && lwork[] == -1
  1677.         lwork = BlasIntType(round(Int, work[1]))
  1678.         work = Vector{Float32}(undef, lwork)
  1679.         call_external_function(handle, :sgelss_, Cvoid,
  1680.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1681.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
  1682.                                 Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1683.                                BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1684.                                s, rcond, rank, work, lwork, info)
  1685.     end
  1686.     return B, s, rank[], info[]
  1687. end
  1688.  
  1689. function aocl_dgelss!(A::Matrix{Float64}, B::Matrix{Float64}, rcond::Float64)
  1690.     m, n = size(A)
  1691.     nrhs = size(B, 2)
  1692.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1693.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1694.     lda = BlasIntType(m)
  1695.     ldb = BlasIntType(max(m, n))
  1696.     s = Vector{Float64}(undef, min(m, n)) # Singular values
  1697.     rank = Ref{BlasIntType}(0)
  1698.     info = Ref{BlasIntType}(0)
  1699.  
  1700.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1701.     lwork = BlasIntType(-1)
  1702.  
  1703.     call_external_function(handle, :dgelss_, Cvoid,
  1704.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1705.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
  1706.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1707.                            BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1708.                            s, rcond, rank, work, lwork, info)
  1709.  
  1710.     if info[] == 0 && lwork[] == -1
  1711.         lwork = BlasIntType(round(Int, work[1]))
  1712.         work = Vector{Float64}(undef, lwork)
  1713.         call_external_function(handle, :dgelss_, Cvoid,
  1714.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1715.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
  1716.                                 Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1717.                                BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1718.                                s, rcond, rank, work, lwork, info)
  1719.     end
  1720.     return B, s, rank[], info[]
  1721. end
  1722.  
  1723. # GELSD: Least Squares solution using SVD with divide and conquer
  1724. # s/dgelsd (single/double precision)
  1725. function aocl_sgelsd!(A::Matrix{Float32}, B::Matrix{Float32}, rcond::Float32)
  1726.     m, n = size(A)
  1727.     nrhs = size(B, 2)
  1728.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1729.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1730.     lda = BlasIntType(m)
  1731.     ldb = BlasIntType(max(m, n))
  1732.     s = Vector{Float32}(undef, min(m, n)) # Singular values
  1733.     rank = Ref{BlasIntType}(0)
  1734.     info = Ref{BlasIntType}(0)
  1735.  
  1736.     # Query optimal workspace sizes
  1737.     work = Vector{Float32}(undef, 1)
  1738.     lwork = BlasIntType(-1)
  1739.     iwork = Vector{BlasIntType}(undef, 1) # IWORK size depends on M, N, NRHS
  1740.  
  1741.     call_external_function(handle, :sgelsd_, Cvoid,
  1742.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1743.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
  1744.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1745.                            BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1746.                            s, rcond, rank, work, lwork, iwork, info)
  1747.  
  1748.     if info[] == 0 && lwork[] == -1
  1749.         lwork = BlasIntType(round(Int, work[1]))
  1750.         # The optimal IWORK size is not returned by query, it's typically fixed or derived.
  1751.         # For GELSD, IWORK size is typically max(1, 3*MIN(M,N)*SMALSIZ + 11*MIN(M,N)) where SMALSIZ is a small constant.
  1752.         # For simplicity, we'll use a heuristic or a sufficiently large size if not querying.
  1753.         # For a proper wrapper, one might need to calculate this based on LAPACK's formula.
  1754.         # For now, we'll assume a reasonable fixed size or rely on the query for LWORK.
  1755.         # Since IWORK is not queried for size with LWORK=-1, we'll use a common safe upper bound.
  1756.         iwork_size = max(1, 3 * min(m,n) * 10 + 11 * min(m,n)) # Heuristic based on common LAPACK impl.
  1757.         iwork = Vector{BlasIntType}(undef, iwork_size)
  1758.         work = Vector{Float32}(undef, lwork)
  1759.  
  1760.         call_external_function(handle, :sgelsd_, Cvoid,
  1761.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1762.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
  1763.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1764.                                BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1765.                                s, rcond, rank, work, lwork, iwork, info)
  1766.     end
  1767.     return B, s, rank[], info[]
  1768. end
  1769.  
  1770. function aocl_dgelsd!(A::Matrix{Float64}, B::Matrix{Float64}, rcond::Float64)
  1771.     m, n = size(A)
  1772.     nrhs = size(B, 2)
  1773.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1774.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1775.     lda = BlasIntType(m)
  1776.     ldb = BlasIntType(max(m, n))
  1777.     s = Vector{Float64}(undef, min(m, n)) # Singular values
  1778.     rank = Ref{BlasIntType}(0)
  1779.     info = Ref{BlasIntType}(0)
  1780.  
  1781.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1782.     lwork = BlasIntType(-1)
  1783.     iwork = Vector{BlasIntType}(undef, 1)
  1784.  
  1785.     call_external_function(handle, :dgelsd_, Cvoid,
  1786.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1787.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
  1788.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1789.                            BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1790.                            s, rcond, rank, work, lwork, iwork, info)
  1791.  
  1792.     if info[] == 0 && lwork[] == -1
  1793.         lwork = BlasIntType(round(Int, work[1]))
  1794.         iwork_size = max(1, 3 * min(m,n) * 10 + 11 * min(m,n)) # Heuristic
  1795.         iwork = Vector{BlasIntType}(undef, iwork_size)
  1796.         work = Vector{Float64}(undef, lwork)
  1797.  
  1798.         call_external_function(handle, :dgelsd_, Cvoid,
  1799.                                (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1800.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
  1801.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1802.                                BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1803.                                s, rcond, rank, work, lwork, iwork, info)
  1804.     end
  1805.     return B, s, rank[], info[]
  1806. end
  1807.  
  1808. # GELS: Least Squares solution using QR or LU factorization
  1809. # s/dgels (single/double precision)
  1810. function aocl_sgels!(trans::Char, A::Matrix{Float32}, B::Matrix{Float32})
  1811.     m, n = size(A)
  1812.     nrhs = size(B, 2)
  1813.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1814.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1815.     lda = BlasIntType(m)
  1816.     ldb = BlasIntType(max(m, n)) # LDB must be at least max(M,N) for GELS
  1817.     info = Ref{BlasIntType}(0)
  1818.  
  1819.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1820.     lwork = BlasIntType(-1)
  1821.  
  1822.     call_external_function(handle, :sgels_, Cvoid,
  1823.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1824.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1825.                            trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1826.                            work, lwork, info)
  1827.  
  1828.     if info[] == 0 && lwork[] == -1
  1829.         lwork = BlasIntType(round(Int, work[1]))
  1830.         work = Vector{Float32}(undef, lwork)
  1831.         call_external_function(handle, :sgels_, Cvoid,
  1832.                                (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1833.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  1834.                                trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1835.                                work, lwork, info)
  1836.     end
  1837.     return A, B, info[]
  1838. end
  1839.  
  1840. function aocl_dgels!(trans::Char, A::Matrix{Float64}, B::Matrix{Float64})
  1841.     m, n = size(A)
  1842.     nrhs = size(B, 2)
  1843.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1844.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  1845.     lda = BlasIntType(m)
  1846.     ldb = BlasIntType(max(m, n)) # LDB must be at least max(M,N) for GELS
  1847.     info = Ref{BlasIntType}(0)
  1848.  
  1849.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  1850.     lwork = BlasIntType(-1)
  1851.  
  1852.     call_external_function(handle, :dgels_, Cvoid,
  1853.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1854.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1855.                            trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1856.                            work, lwork, info)
  1857.  
  1858.     if info[] == 0 && lwork[] == -1
  1859.         lwork = BlasIntType(round(Int, work[1]))
  1860.         work = Vector{Float64}(undef, lwork)
  1861.         call_external_function(handle, :dgels_, Cvoid,
  1862.                                (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1863.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  1864.                                trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
  1865.                                work, lwork, info)
  1866.     end
  1867.     return A, B, info[]
  1868. end
  1869.  
  1870. # GEEQU: Estimate the condition number of a general matrix
  1871. # s/dgeequ (single/double precision)
  1872. function aocl_sgeequ!(A::Matrix{Float32})
  1873.     m, n = size(A)
  1874.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1875.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1876.     lda = BlasIntType(m)
  1877.     r = Vector{Float32}(undef, m)
  1878.     c = Vector{Float32}(undef, n)
  1879.     row_cond = Ref{Float32}(0.0)
  1880.     col_cond = Ref{Float32}(0.0)
  1881.     amax = Ref{Float32}(0.0)
  1882.     info = Ref{BlasIntType}(0)
  1883.  
  1884.     call_external_function(handle, :sgeequ_, Cvoid,
  1885.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1886.                             Ptr{Float32}, Ptr{Float32}, Ref{Float32}, Ref{Float32}, Ref{Float32}, Ref{BlasIntType}),
  1887.                            BlasIntType(m), BlasIntType(n), A, lda, r, c, row_cond, col_cond, amax, info)
  1888.     return r, c, row_cond[], col_cond[], amax[], info[]
  1889. end
  1890.  
  1891. function aocl_dgeequ!(A::Matrix{Float64})
  1892.     m, n = size(A)
  1893.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1894.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1895.     lda = BlasIntType(m)
  1896.     r = Vector{Float64}(undef, m)
  1897.     c = Vector{Float64}(undef, n)
  1898.     row_cond = Ref{Float64}(0.0)
  1899.     col_cond = Ref{Float64}(0.0)
  1900.     amax = Ref{Float64}(0.0)
  1901.     info = Ref{BlasIntType}(0)
  1902.  
  1903.     call_external_function(handle, :dgeequ_, Cvoid,
  1904.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  1905.                             Ptr{Float64}, Ptr{Float64}, Ref{Float64}, Ref{Float64}, Ref{Float64}, Ref{BlasIntType}),
  1906.                            BlasIntType(m), BlasIntType(n), A, lda, r, c, row_cond, col_cond, amax, info)
  1907.     return r, c, row_cond[], col_cond[], amax[], info[]
  1908. end
  1909.  
  1910. # GECON: Estimate the reciprocal of the condition number of a general matrix
  1911. # s/dgecon (single/double precision)
  1912. function aocl_sgecon!(norm_type::Char, A::Matrix{Float32}, anorm::Float32)
  1913.     n = size(A, 1)
  1914.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1915.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1916.     lda = BlasIntType(n)
  1917.     rcond = Ref{Float32}(0.0)
  1918.     info = Ref{BlasIntType}(0)
  1919.  
  1920.     work = Vector{Float32}(undef, 4*n) # Workspace size
  1921.     iwork = Vector{BlasIntType}(undef, n) # Integer workspace
  1922.  
  1923.     call_external_function(handle, :sgecon_, Cvoid,
  1924.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32},
  1925.                             Ref{Float32}, Ptr{Float32}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1926.                            norm_type, BlasIntType(n), A, lda, anorm, rcond, work, iwork, info)
  1927.     return rcond[], info[]
  1928. end
  1929.  
  1930. function aocl_dgecon!(norm_type::Char, A::Matrix{Float64}, anorm::Float64)
  1931.     n = size(A, 1)
  1932.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1933.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  1934.     lda = BlasIntType(n)
  1935.     rcond = Ref{Float64}(0.0)
  1936.     info = Ref{BlasIntType}(0)
  1937.  
  1938.     work = Vector{Float64}(undef, 4*n) # Workspace size
  1939.     iwork = Vector{BlasIntType}(undef, n) # Integer workspace
  1940.  
  1941.     call_external_function(handle, :dgecon_, Cvoid,
  1942.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64},
  1943.                             Ref{Float64}, Ptr{Float64}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1944.                            norm_type, BlasIntType(n), A, lda, anorm, rcond, work, iwork, info)
  1945.     return rcond[], info[]
  1946. end
  1947.  
  1948. # GESDD: Singular Value Decomposition using divide and conquer
  1949. # s/dgesdd (single/double precision)
  1950. function aocl_sgesdd!(jobz::Char, A::Matrix{Float32})
  1951.     m, n = size(A)
  1952.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  1953.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  1954.     lda = BlasIntType(m)
  1955.     s = Vector{Float32}(undef, min(m, n)) # Singular values
  1956.    
  1957.     # Determine dimensions for U and VT based on jobz
  1958.     ldu = BlasIntType(1)
  1959.     if jobz == 'A' || jobz == 'a'
  1960.         ldu = BlasIntType(m)
  1961.     elseif jobz == 'S' || jobz == 's' || jobz == 'O' || jobz == 'o'
  1962.         ldu = BlasIntType(m)
  1963.     end
  1964.     U = Matrix{Float32}(undef, ldu, (jobz == 'A' || jobz == 'a') ? m : (jobz == 'S' || jobz == 's' ? min(m, n) : 1))
  1965.  
  1966.     ldvt = BlasIntType(1)
  1967.     if jobz == 'A' || jobz == 'a'
  1968.         ldvt = BlasIntType(n)
  1969.     elseif jobz == 'O' || jobz == 'o'
  1970.         ldvt = BlasIntType(min(m, n))
  1971.     elseif jobz == 'S' || jobz == 's'
  1972.         ldvt = BlasIntType(min(m, n))
  1973.     end
  1974.     VT = Matrix{Float32}(undef, ldvt, n)
  1975.  
  1976.     info = Ref{BlasIntType}(0)
  1977.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  1978.     lwork = BlasIntType(-1)
  1979.     iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size
  1980.  
  1981.     call_external_function(handle, :sgesdd_, Cvoid,
  1982.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1983.                             Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  1984.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  1985.                            jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
  1986.  
  1987.     if info[] == 0 && lwork[] == -1
  1988.         lwork = BlasIntType(round(Int, work[1]))
  1989.         iwork_size = BlasIntType(round(Int, iwork[1])) # If iwork is also queried
  1990.         # If iwork is not queried, a heuristic is needed. For GESDD, it's typically 8*min(M,N) for 'A' or 'S', 5*min(M,N) for 'O'.
  1991.         if iwork_size == 1 && (jobz == 'A' || jobz == 'S' || jobz == 'a' || jobz == 's')
  1992.             iwork_size = BlasIntType(8 * min(m,n))
  1993.         elseif iwork_size == 1 && (jobz == 'O' || jobz == 'o')
  1994.             iwork_size = BlasIntType(5 * min(m,n))
  1995.         end
  1996.         work = Vector{Float32}(undef, lwork)
  1997.         iwork = Vector{BlasIntType}(undef, iwork_size)
  1998.  
  1999.         call_external_function(handle, :sgesdd_, Cvoid,
  2000.                                (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2001.                                 Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2002.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2003.                                jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
  2004.     end
  2005.     return U, s, VT, info[]
  2006. end
  2007.  
  2008. function aocl_dgesdd!(jobz::Char, A::Matrix{Float64})
  2009.     m, n = size(A)
  2010.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2011.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  2012.     lda = BlasIntType(m)
  2013.     s = Vector{Float64}(undef, min(m, n)) # Singular values
  2014.  
  2015.     ldu = BlasIntType(1)
  2016.     if jobz == 'A' || jobz == 'a'
  2017.         ldu = BlasIntType(m)
  2018.     elseif jobz == 'S' || jobz == 's' || jobz == 'O' || jobz == 'o'
  2019.         ldu = BlasIntType(m)
  2020.     end
  2021.     U = Matrix{Float64}(undef, ldu, (jobz == 'A' || jobz == 'a') ? m : (jobz == 'S' || jobz == 's' ? min(m, n) : 1))
  2022.  
  2023.     ldvt = BlasIntType(1)
  2024.     if jobz == 'A' || jobz == 'a'
  2025.         ldvt = BlasIntType(n)
  2026.     elseif jobz == 'O' || jobz == 'o'
  2027.         ldvt = BlasIntType(min(m, n))
  2028.     elseif jobz == 'S' || jobz == 's'
  2029.         ldvt = BlasIntType(min(m, n))
  2030.     end
  2031.     VT = Matrix{Float64}(undef, ldvt, n)
  2032.  
  2033.     info = Ref{BlasIntType}(0)
  2034.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  2035.     lwork = BlasIntType(-1)
  2036.     iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size
  2037.  
  2038.     call_external_function(handle, :dgesdd_, Cvoid,
  2039.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2040.                             Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2041.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2042.                            jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
  2043.  
  2044.     if info[] == 0 && lwork[] == -1
  2045.         lwork = BlasIntType(round(Int, work[1]))
  2046.         iwork_size = BlasIntType(round(Int, iwork[1]))
  2047.         if iwork_size == 1 && (jobz == 'A' || jobz == 'S' || jobz == 'a' || jobz == 's')
  2048.             iwork_size = BlasIntType(8 * min(m,n))
  2049.         elseif iwork_size == 1 && (jobz == 'O' || jobz == 'o')
  2050.             iwork_size = BlasIntType(5 * min(m,n))
  2051.         end
  2052.         work = Vector{Float64}(undef, lwork)
  2053.         iwork = Vector{BlasIntType}(undef, iwork_size)
  2054.  
  2055.         call_external_function(handle, :dgesdd_, Cvoid,
  2056.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2057.                                 Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2058.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2059.                                jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
  2060.     end
  2061.     return U, s, VT, info[]
  2062. end
  2063.  
  2064. # GEEVX: Compute eigenvalues and, optionally, eigenvectors of a general matrix
  2065. # s/dgeevx (single/double precision)
  2066. function aocl_sgeevx!(balanc::Char, jobvl::Char, jobvr::Char, sense::Char, A::Matrix{Float32})
  2067.     n = size(A, 1)
  2068.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2069.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2070.     lda = BlasIntType(n)
  2071.     wr = Vector{Float32}(undef, n)
  2072.     wi = Vector{Float32}(undef, n)
  2073.  
  2074.     ldvl = BlasIntType(1)
  2075.     if jobvl == 'V' || jobvl == 'v'
  2076.         ldvl = BlasIntType(n)
  2077.     end
  2078.     VL = Matrix{Float32}(undef, ldvl, n)
  2079.  
  2080.     ldvr = BlasIntType(1)
  2081.     if jobvr == 'V' || jobvr == 'v'
  2082.         ldvr = BlasIntType(n)
  2083.     end
  2084.     VR = Matrix{Float32}(undef, ldvr, n)
  2085.  
  2086.     ilo = Ref{BlasIntType}(0)
  2087.     ihi = Ref{BlasIntType}(0)
  2088.     scale = Vector{Float32}(undef, n)
  2089.     abnrm = Ref{Float32}(0.0)
  2090.     rconde = Vector{Float32}(undef, n)
  2091.     rcondv = Vector{Float32}(undef, n)
  2092.     info = Ref{BlasIntType}(0)
  2093.  
  2094.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  2095.     lwork = BlasIntType(-1)
  2096.     iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size
  2097.  
  2098.     call_external_function(handle, :sgeevx_, Cvoid,
  2099.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2100.                             Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2101.                             Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32},
  2102.                             Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2103.                            balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
  2104.                            ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
  2105.  
  2106.     if info[] == 0 && lwork[] == -1
  2107.         lwork = BlasIntType(round(Int, work[1]))
  2108.         iwork_size = BlasIntType(round(Int, iwork[1]))
  2109.         work = Vector{Float32}(undef, lwork)
  2110.         iwork = Vector{BlasIntType}(undef, iwork_size)
  2111.  
  2112.         call_external_function(handle, :sgeevx_, Cvoid,
  2113.                                (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2114.                                 Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2115.                                 Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32},
  2116.                                 Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2117.                                balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
  2118.                                ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
  2119.     end
  2120.     return wr, wi, VL, VR, ilo[], ihi[], scale, abnrm[], rconde, rcondv, info[]
  2121. end
  2122.  
  2123. function aocl_dgeevx!(balanc::Char, jobvl::Char, jobvr::Char, sense::Char, A::Matrix{Float64})
  2124.     n = size(A, 1)
  2125.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2126.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2127.     lda = BlasIntType(n)
  2128.     wr = Vector{Float64}(undef, n)
  2129.     wi = Vector{Float64}(undef, n)
  2130.  
  2131.     ldvl = BlasIntType(1)
  2132.     if jobvl == 'V' || jobvl == 'v'
  2133.         ldvl = BlasIntType(n)
  2134.     end
  2135.     VL = Matrix{Float64}(undef, ldvl, n)
  2136.  
  2137.     ldvr = BlasIntType(1)
  2138.     if jobvr == 'V' || jobvr == 'v'
  2139.         ldvr = BlasIntType(n)
  2140.     end
  2141.     VR = Matrix{Float64}(undef, ldvr, n)
  2142.  
  2143.     ilo = Ref{BlasIntType}(0)
  2144.     ihi = Ref{BlasIntType}(0)
  2145.     scale = Vector{Float64}(undef, n)
  2146.     abnrm = Ref{Float64}(0.0)
  2147.     rconde = Vector{Float64}(undef, n)
  2148.     rcondv = Vector{Float64}(undef, n)
  2149.     info = Ref{BlasIntType}(0)
  2150.  
  2151.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  2152.     lwork = BlasIntType(-1)
  2153.     iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size
  2154.  
  2155.     call_external_function(handle, :dgeevx_, Cvoid,
  2156.                            (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2157.                             Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2158.                             Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64},
  2159.                             Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2160.                            balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
  2161.                            ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
  2162.  
  2163.     if info[] == 0 && lwork[] == -1
  2164.         lwork = BlasIntType(round(Int, work[1]))
  2165.         iwork_size = BlasIntType(round(Int, iwork[1]))
  2166.         work = Vector{Float64}(undef, lwork)
  2167.         iwork = Vector{BlasIntType}(undef, iwork_size)
  2168.  
  2169.         call_external_function(handle, :dgeevx_, Cvoid,
  2170.                                (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2171.                                 Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2172.                                 Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64},
  2173.                                 Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
  2174.                                balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
  2175.                                ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
  2176.     end
  2177.     return wr, wi, VL, VR, ilo[], ihi[], scale, abnrm[], rconde, rcondv, info[]
  2178. end
  2179.  
  2180. # GETF2: LU factorization (unblocked algorithm)
  2181. # s/dgetf2 (single/double precision)
  2182. function aocl_sgetf2!(A::Matrix{Float32})
  2183.     m, n = size(A)
  2184.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2185.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  2186.     lda = BlasIntType(m)
  2187.     ipiv = Vector{Int32}(undef, min(m, n))
  2188.     info = Ref{BlasIntType}(0)
  2189.  
  2190.     call_external_function(handle, :sgetf2_, Cvoid,
  2191.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2192.                             Ptr{Int32}, Ref{BlasIntType}),
  2193.                            BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
  2194.     return A, ipiv, info[]
  2195. end
  2196.  
  2197. function aocl_dgetf2!(A::Matrix{Float64})
  2198.     m, n = size(A)
  2199.     handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2200.     BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
  2201.     lda = BlasIntType(m)
  2202.     ipiv = Vector{Int32}(undef, min(m, n))
  2203.     info = Ref{BlasIntType}(0)
  2204.  
  2205.     call_external_function(handle, :dgetf2_, Cvoid,
  2206.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2207.                             Ptr{Int32}, Ref{BlasIntType}),
  2208.                            BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
  2209.     return A, ipiv, info[]
  2210. end
  2211.  
  2212. # GBSV: Solve general band system of linear equations
  2213. # s/dgbsv (single/double precision)
  2214. function aocl_sgbsv!(A::Matrix{Float32}, B::Matrix{Float32}, kl::Int, ku::Int)
  2215.     n = size(A, 2) # N is the number of columns in A (matrix is N x N)
  2216.     nrhs = size(B, 2)
  2217.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2218.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2219.     lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
  2220.     ldb = BlasIntType(n)
  2221.     ipiv = Vector{Int32}(undef, n)
  2222.     info = Ref{BlasIntType}(0)
  2223.  
  2224.     call_external_function(handle, :sgbsv_, Cvoid,
  2225.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  2226.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  2227.                            BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
  2228.                            A, lda, ipiv, B, ldb, info)
  2229.     return A, B, ipiv, info[]
  2230. end
  2231.  
  2232. function aocl_dgbsv!(A::Matrix{Float64}, B::Matrix{Float64}, kl::Int, ku::Int)
  2233.     n = size(A, 2) # N is the number of columns in A (matrix is N x N)
  2234.     nrhs = size(B, 2)
  2235.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2236.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2237.     lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
  2238.     ldb = BlasIntType(n)
  2239.     ipiv = Vector{Int32}(undef, n)
  2240.     info = Ref{BlasIntType}(0)
  2241.  
  2242.     call_external_function(handle, :dgbsv_, Cvoid,
  2243.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  2244.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2245.                            BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
  2246.                            A, lda, ipiv, B, ldb, info)
  2247.     return A, B, ipiv, info[]
  2248. end
  2249.  
  2250. # GTSV: Solve general tridiagonal system of linear equations
  2251. # s/dgtsv (single/double precision)
  2252. function aocl_sgtsv!(dl::Vector{Float32}, d::Vector{Float32}, du::Vector{Float32}, B::Matrix{Float32})
  2253.     n = length(d)
  2254.     nrhs = size(B, 2)
  2255.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2256.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2257.     ldb = BlasIntType(n)
  2258.     info = Ref{BlasIntType}(0)
  2259.  
  2260.     call_external_function(handle, :sgtsv_, Cvoid,
  2261.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
  2262.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  2263.                            BlasIntType(n), BlasIntType(nrhs), dl, d, du, B, ldb, info)
  2264.     return dl, d, du, B, info[]
  2265. end
  2266.  
  2267. function aocl_dgtsv!(dl::Vector{Float64}, d::Vector{Float64}, du::Vector{Float64}, B::Matrix{Float64})
  2268.     n = length(d)
  2269.     nrhs = size(B, 2)
  2270.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2271.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2272.     ldb = BlasIntType(n)
  2273.     info = Ref{BlasIntType}(0)
  2274.  
  2275.     call_external_function(handle, :dgtsv_, Cvoid,
  2276.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
  2277.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2278.                            BlasIntType(n), BlasIntType(nrhs), dl, d, du, B, ldb, info)
  2279.     return dl, d, du, B, info[]
  2280. end
  2281.  
  2282. # GGEV: Compute eigenvalues and, optionally, eigenvectors of a general matrix pencil (A,B)
  2283. # s/dggev (single/double precision)
  2284. function aocl_sggev!(jobvl::Char, jobvr::Char, A::Matrix{Float32}, B::Matrix{Float32})
  2285.     n = size(A, 1)
  2286.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2287.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2288.     lda = BlasIntType(n)
  2289.     ldb = BlasIntType(n)
  2290.     alphar = Vector{Float32}(undef, n)
  2291.     alphai = Vector{Float32}(undef, n)
  2292.     beta = Vector{Float32}(undef, n)
  2293.  
  2294.     ldvl = BlasIntType(1)
  2295.     if jobvl == 'V' || jobvl == 'v'
  2296.         ldvl = BlasIntType(n)
  2297.     end
  2298.     VL = Matrix{Float32}(undef, ldvl, n)
  2299.  
  2300.     ldvr = BlasIntType(1)
  2301.     if jobvr == 'V' || jobvr == 'v'
  2302.         ldvr = BlasIntType(n)
  2303.     end
  2304.     VR = Matrix{Float32}(undef, ldvr, n)
  2305.  
  2306.     info = Ref{BlasIntType}(0)
  2307.     work = Vector{Float32}(undef, 1) # Query optimal workspace size
  2308.     lwork = BlasIntType(-1)
  2309.  
  2310.     call_external_function(handle, :sggev_, Cvoid,
  2311.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2312.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
  2313.                             Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2314.                             Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  2315.                            jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
  2316.                            VL, ldvl, VR, ldvr, work, lwork, info)
  2317.  
  2318.     if info[] == 0 && lwork[] == -1
  2319.         lwork = BlasIntType(round(Int, work[1]))
  2320.         work = Vector{Float32}(undef, lwork)
  2321.         call_external_function(handle, :sggev_, Cvoid,
  2322.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2323.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
  2324.                                 Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
  2325.                                 Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
  2326.                                jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
  2327.                                VL, ldvl, VR, ldvr, work, lwork, info)
  2328.     end
  2329.     return alphar, alphai, beta, VL, VR, info[]
  2330. end
  2331.  
  2332. function aocl_dggev!(jobvl::Char, jobvr::Char, A::Matrix{Float64}, B::Matrix{Float64})
  2333.     n = size(A, 1)
  2334.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2335.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2336.     lda = BlasIntType(n)
  2337.     ldb = BlasIntType(n)
  2338.     alphar = Vector{Float64}(undef, n)
  2339.     alphai = Vector{Float64}(undef, n)
  2340.     beta = Vector{Float64}(undef, n)
  2341.  
  2342.     ldvl = BlasIntType(1)
  2343.     if jobvl == 'V' || jobvl == 'v'
  2344.         ldvl = BlasIntType(n)
  2345.     end
  2346.     VL = Matrix{Float64}(undef, ldvl, n)
  2347.  
  2348.     ldvr = BlasIntType(1)
  2349.     if jobvr == 'V' || jobvr == 'v'
  2350.         ldvr = BlasIntType(n)
  2351.     end
  2352.     VR = Matrix{Float64}(undef, ldvr, n)
  2353.  
  2354.     info = Ref{BlasIntType}(0)
  2355.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  2356.     lwork = BlasIntType(-1)
  2357.  
  2358.     call_external_function(handle, :dggev_, Cvoid,
  2359.                            (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2360.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
  2361.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2362.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2363.                            jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
  2364.                            VL, ldvl, VR, ldvr, work, lwork, info)
  2365.  
  2366.     if info[] == 0 && lwork[] == -1
  2367.         lwork = BlasIntType(round(Int, work[1]))
  2368.         work = Vector{Float64}(undef, lwork)
  2369.         call_external_function(handle, :dggev_, Cvoid,
  2370.                                (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2371.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
  2372.                                 Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2373.                                 Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2374.                                jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
  2375.                                VL, ldvl, VR, ldvr, work, lwork, info)
  2376.     end
  2377.     return alphar, alphai, beta, VL, VR, info[]
  2378. end
  2379.  
  2380. # POSV: Solves AX = B for symmetric positive definite A
  2381. # dposv (double precision)
  2382. function aocl_dposv!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
  2383.     n = size(A, 1)
  2384.     nrhs = size(B, 2)
  2385.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2386.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2387.     lda = BlasIntType(n)
  2388.     ldb = BlasIntType(n)
  2389.     info = Ref{BlasIntType}(0)
  2390.  
  2391.     call_external_function(handle, :dposv_, Cvoid,
  2392.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2393.                             Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2394.                            uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
  2395.     return A, B, info[]
  2396. end
  2397.  
  2398. # SYTRF: Computes the Bunch-Kaufman factorization of a real symmetric indefinite matrix
  2399. # dsytrf (double precision)
  2400. function aocl_dsytrf!(uplo::Char, A::Matrix{Float64})
  2401.     n = size(A, 1)
  2402.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2403.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2404.     lda = BlasIntType(n)
  2405.     ipiv = Vector{BlasIntType}(undef, n) # IPIV for SYTRF is BlasIntType
  2406.     info = Ref{BlasIntType}(0)
  2407.  
  2408.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  2409.     lwork = BlasIntType(-1)
  2410.  
  2411.     call_external_function(handle, :dsytrf_, Cvoid,
  2412.                            (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2413.                             Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2414.                            uplo, BlasIntType(n), A, lda, ipiv, work, lwork, info)
  2415.  
  2416.     if info[] == 0 && lwork[] == -1
  2417.         lwork = BlasIntType(round(Int, work[1]))
  2418.         work = Vector{Float64}(undef, lwork)
  2419.         call_external_function(handle, :dsytrf_, Cvoid,
  2420.                                (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2421.                                 Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2422.                                uplo, BlasIntType(n), A, lda, ipiv, work, lwork, info)
  2423.     end
  2424.     return A, ipiv, info[]
  2425. end
  2426.  
  2427. # SYTRS: Solves a system of linear equations using the Bunch-Kaufman factorization
  2428. # dsytrs (double precision)
  2429. function aocl_dsytrs!(uplo::Char, A::Matrix{Float64}, ipiv::Vector{BlasInt}, B::Matrix{Float64})
  2430.     n = size(A, 1)
  2431.     nrhs = size(B, 2)
  2432.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2433.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2434.     lda = BlasIntType(n)
  2435.     ldb = BlasIntType(n)
  2436.     info = Ref{BlasIntType}(0)
  2437.  
  2438.     call_external_function(handle, :dsytrs_, Cvoid,
  2439.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2440.                             Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2441.                            uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
  2442.     return B, info[]
  2443. end
  2444.  
  2445. # SYSV: Solves AX = B for symmetric indefinite A
  2446. # dsysv (double precision)
  2447. function aocl_dsysv!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
  2448.     n = size(A, 1)
  2449.     nrhs = size(B, 2)
  2450.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2451.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2452.     lda = BlasIntType(n)
  2453.     ldb = BlasIntType(n)
  2454.     ipiv = Vector{BlasIntType}(undef, n)
  2455.     info = Ref{BlasIntType}(0)
  2456.  
  2457.     work = Vector{Float64}(undef, 1) # Query optimal workspace size
  2458.     lwork = BlasIntType(-1)
  2459.  
  2460.     call_external_function(handle, :dsysv_, Cvoid,
  2461.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2462.                             Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2463.                            uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, work, lwork, info)
  2464.  
  2465.     if info[] == 0 && lwork[] == -1
  2466.         lwork = BlasIntType(round(Int, work[1]))
  2467.         work = Vector{Float64}(undef, lwork)
  2468.         call_external_function(handle, :dsysv_, Cvoid,
  2469.                                (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
  2470.                                 Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2471.                                uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, work, lwork, info)
  2472.     end
  2473.     return A, B, ipiv, info[]
  2474. end
  2475.  
  2476. # GBTRF: Computes the LU factorization of a general band matrix
  2477. # dgbtrf (double precision)
  2478. function aocl_dgbtrf!(A::Matrix{Float64}, m::Int, n::Int, kl::Int, ku::Int)
  2479.     handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2480.     BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
  2481.     lda = BlasIntType(size(A, 1)) # LDA for band matrix is KL + KU + 1
  2482.     ipiv = Vector{Int32}(undef, min(m, n))
  2483.     info = Ref{BlasIntType}(0)
  2484.  
  2485.     call_external_function(handle, :dgbtrf_, Cvoid,
  2486.                            (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  2487.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ref{BlasIntType}),
  2488.                            BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku),
  2489.                            A, lda, ipiv, info)
  2490.     return A, ipiv, info[]
  2491. end
  2492.  
  2493. # GBTRS: Solves a system of linear equations using the LU factorization of a general band matrix
  2494. # dgbtrs (double precision)
  2495. function aocl_dgbtrs!(trans::Char, A::Matrix{Float64}, ipiv::Vector{Int32}, B::Matrix{Float64}, kl::Int, ku::Int)
  2496.     n = size(A, 2) # A is N x N, stored in band format
  2497.     nrhs = size(B, 2)
  2498.     handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
  2499.     BlasIntType = n > typemax(Int32) ? Int64 : Int32
  2500.     lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
  2501.     ldb = BlasIntType(n)
  2502.     info = Ref{BlasIntType}(0)
  2503.  
  2504.     call_external_function(handle, :dgbtrs_, Cvoid,
  2505.                            (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
  2506.                             Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
  2507.                            trans, BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
  2508.                            A, lda, ipiv, B, ldb, info)
  2509.     return B, info[]
  2510. end
  2511.  
Tags: aocl wrapper
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement