Untitled

using Libdl
using LinearAlgebra # For BlasInt, I, and general linear algebra types

# --- Define Library Paths ---
# IMPORTANT: Ensure these paths are correct for your system.
# Replace with your actual AOCL installation paths.
const LIBBLIS_ILP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-blis\\lib\\ILP64\\AOCL-LibBlis-Win-MT-dll.dll"
const LIBBLIS_LP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-blis\\lib\\LP64\\AOCL-LibBlis-Win-MT-dll.dll"

const LIBFLAME_ILP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-libflame\\lib\\ILP64\\AOCL-LibFlame-Win-MT-dll.dll"
const LIBFLAME_LP64_PATH = "C:\\Program Files\\AMD\\AOCL-Windows\\amd-libflame\\lib\\LP64\\AOCL-LibFlame-Win-MT-dll.dll"

# --- Global Library Handles ---
# These will hold the handles to the loaded DLLs.
# Using `global` allows them to be modified within `try-catch` blocks.
global blis_ilp64_handle = C_NULL
global blis_lp64_handle = C_NULL
global flame_ilp64_handle = C_NULL
global flame_lp64_handle = C_NULL

# Function to load a library safely
function load_library(path::String)
    handle = C_NULL
    try
        handle = Libdl.dlopen(path)
        println("Successfully loaded library: $path") # Keep this for loading feedback
    catch e
        @error "Could not load library: $path. Please ensure the path is correct and dependencies are met." exception=(e, catch_backtrace())
    end
    return handle
end

# Load all necessary libraries at startup
blis_ilp64_handle = load_library(LIBBLIS_ILP64_PATH)
blis_lp64_handle = load_library(LIBBLIS_LP64_PATH)
flame_ilp64_handle = load_library(LIBFLAME_ILP64_PATH)
flame_lp64_handle = load_library(LIBFLAME_LP64_PATH)

# --- Helper for BLAS/LAPACK function calls ---
# This function is a generic dispatcher for ccall.
# It checks if the library handle is valid before attempting to call the function.
function call_external_function(lib_handle, func_name::Symbol, ret_type::DataType, arg_types::Tuple, args...)
    if lib_handle === C_NULL
        error("Library for function :$func_name is not loaded.")
    end
    func_ptr = Libdl.dlsym(lib_handle, func_name)
    if func_ptr == C_NULL
        error("Function :$func_name not found in library.")
    end
    return ccall(func_ptr, ret_type, arg_types, args...)
end

# --- BLAS Level 1 Functions (Vector-Vector Operations) ---

# AXPY: y = alpha*x + y
# s/daxpy (single/double precision)
function aocl_saxpy!(y::Vector{Float32}, alpha::Float32, x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :saxpy_, Cvoid,
                           (Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1))
    return y
end

function aocl_daxpy!(y::Vector{Float64}, alpha::Float64, x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :daxpy_, Cvoid,
                           (Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1))
    return y
end

# DOT: dot product of two vectors
# s/ddot (single/double precision)
function aocl_sdot(x::Vector{Float32}, y::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    result = Ref{Float32}(0.0)
    call_external_function(handle, :sdot_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), result)
    return result[]
end

function aocl_ddot(x::Vector{Float64}, y::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    result = Ref{Float64}(0.0)
    call_external_function(handle, :ddot_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), result)
    return result[]
end

# SCAL: x = alpha*x
# s/dscal (single/double precision)
function aocl_sscal!(x::Vector{Float32}, alpha::Float32)
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :sscal_, Cvoid,
                           (Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           BlasIntType(n), alpha, x, BlasIntType(1))
    return x
end

function aocl_dscal!(x::Vector{Float64}, alpha::Float64)
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :dscal_, Cvoid,
                           (Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           BlasIntType(n), alpha, x, BlasIntType(1))
    return x
end

# SWAP: swap two vectors
# s/dswap (single/double precision)
function aocl_sswap!(x::Vector{Float32}, y::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :sswap_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
    return x, y
end

function aocl_dswap!(x::Vector{Float64}, y::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :dswap_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
    return x, y
end

# NRM2: Euclidean norm of a vector
# s/dnrm2 (single/double precision)
function aocl_snrm2(x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    return call_external_function(handle, :snrm2_, Float32,
                                  (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

function aocl_dnrm2(x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    return call_external_function(handle, :dnrm2_, Float64,
                                  (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

# ASUM: sum of absolute values
# s/dasum (single/double precision)
function aocl_sasum(x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    return call_external_function(handle, :sasum_, Float32,
                                  (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

function aocl_dasum(x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    return call_external_function(handle, :dasum_, Float64,
                                  (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

# IAMIN: index of element with smallest absolute value
# is/idamin (single/double precision)
function aocl_isamin(x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    # BLAS functions return 1-based index
    return call_external_function(handle, :isamin_, BlasIntType,
                                  (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

function aocl_idamin(x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    # BLAS functions return 1-based index
    return call_external_function(handle, :idamin_, BlasIntType,
                                  (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

# IAMAX: index of element with largest absolute value
# is/idmax (single/double precision)
function aocl_isamax(x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    # BLAS functions return 1-based index
    return call_external_function(handle, :isamax_, BlasIntType,
                                  (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

function aocl_idmax(x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    # BLAS functions return 1-based index
    return call_external_function(handle, :idmax_, BlasIntType,
                                  (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                                  BlasIntType(n), x, BlasIntType(1))
end

# ROTG: Generate Givens rotation
# s/drotg (single/double precision)
function aocl_srotg!(a::Ref{Float32}, b::Ref{Float32}, c::Ref{Float32}, s::Ref{Float32})
    handle = blis_lp64_handle # ROTG is typically LP64
    call_external_function(handle, :srotg_, Cvoid,
                           (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}),
                           a, b, c, s)
    return a[], b[], c[], s[]
end

function aocl_drotg!(a::Ref{Float64}, b::Ref{Float64}, c::Ref{Float64}, s::Ref{Float64})
    handle = blis_ilp64_handle # ROTG is typically LP64, but using ILP64 handle for consistency
    call_external_function(handle, :drotg_, Cvoid,
                           (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}),
                           a, b, c, s)
    return a[], b[], c[], s[]
end

# ROTMG: Generate modified Givens rotation matrix
# s/drotmg (single/double precision)
function aocl_srotmg!(d1::Ref{Float32}, d2::Ref{Float32}, x1::Ref{Float32}, y1::Float32, param::Vector{Float32})
    handle = blis_lp64_handle # ROTMG is typically LP64
    call_external_function(handle, :srotmg_, Cvoid,
                           (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{Float32}, Ptr{Float32}),
                           d1, d2, x1, y1, param)
    return d1[], d2[], x1[], y1, param
end

function aocl_drotmg!(d1::Ref{Float64}, d2::Ref{Float64}, x1::Ref{Float64}, y1::Float64, param::Vector{Float64})
    handle = blis_ilp64_handle # ROTMG is typically LP64, but using ILP64 handle for consistency
    call_external_function(handle, :drotmg_, Cvoid,
                           (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Float64}, Ptr{Float64}),
                           d1, d2, x1, y1, param)
    return d1[], d2[], x1[], y1, param
end

# ROT: Apply Givens rotation
# s/drot (single/double precision)
function aocl_srot!(x::Vector{Float32}, y::Vector{Float32}, c::Float32, s::Float32)
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :srot_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ref{Float32}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), c, s)
    return x, y
end

function aocl_drot!(x::Vector{Float64}, y::Vector{Float64}, c::Float64, s::Float64)
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :drot_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ref{Float64}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1), c, s)
    return x, y
end

# COPY: copy one vector to another
# s/dcopy (single/double precision)
function aocl_scopy!(y::Vector{Float32}, x::Vector{Float32})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :scopy_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
    return y
end

function aocl_dcopy!(y::Vector{Float64}, x::Vector{Float64})
    n = length(x)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    call_external_function(handle, :dcopy_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           BlasIntType(n), x, BlasIntType(1), y, BlasIntType(1))
    return y
end


# --- BLAS Level 2 Functions (Matrix-Vector Operations) ---

# GEMV: y = alpha*A*x + beta*y
# s/dgemv (single/double precision)
function aocl_sgemv!(trans::Char, A::Matrix{Float32}, x::Vector{Float32}, y::Vector{Float32}, alpha::Float32, beta::Float32)
    m, n = size(A)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :sgemv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
    return y
end

function aocl_dgemv!(trans::Char, A::Matrix{Float64}, x::Vector{Float64}, y::Vector{Float64}, alpha::Float64, beta::Float64)
    m, n = size(A)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dgemv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
    return y
end

# GER: A = alpha*x*y' + A (Rank-1 update)
# s/dger (single/double precision)
function aocl_sger!(A::Matrix{Float32}, alpha::Float32, x::Vector{Float32}, y::Vector{Float32})
    m, n = size(A)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :sger_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
    return A
end

function aocl_dger!(A::Matrix{Float64}, alpha::Float64, x::Vector{Float64}, y::Vector{Float64})
    m, n = size(A)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dger_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
    return A
end

# TRMV: x = A*x or x = A'*x (Triangular Matrix-Vector Multiply)
# s/dtrmv (single/double precision)
function aocl_strmv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float32}, x::Vector{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :strmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
    return x
end

function aocl_dtrmv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float64}, x::Vector{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dtrmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
    return x
end

# TRSV: Solve A*x = b or A'*x = b (Triangular Solve)
# s/dtrsv (single/double precision)
function aocl_strsv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float32}, x::Vector{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :strsv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
    return x
end

function aocl_dtrsv!(uplo::Char, trans::Char, diag::Char, A::Matrix{Float64}, x::Vector{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dtrsv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), A, lda, x, BlasIntType(1))
    return x
end

# SYMV: y = alpha*A*x + beta*y (Symmetric Matrix-Vector Multiply)
# s/dsymv (single/double precision)
function aocl_ssymv!(uplo::Char, A::Matrix{Float32}, x::Vector{Float32}, y::Vector{Float32}, alpha::Float32, beta::Float32)
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :ssymv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
    return y
end

function aocl_dsymv!(uplo::Char, A::Matrix{Float64}, x::Vector{Float64}, y::Vector{Float64}, alpha::Float64, beta::Float64)
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dsymv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, A, lda, x, BlasIntType(1), beta, y, BlasIntType(1))
    return y
end

# SYR: A = alpha*x*x' + A (Rank-1 update of a symmetric matrix)
# s/dsyr (single/double precision)
function aocl_ssyr!(uplo::Char, A::Matrix{Float32}, alpha::Float32, x::Vector{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :ssyr_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, x, BlasIntType(1), A, lda)
    return A
end

function aocl_dsyr!(uplo::Char, A::Matrix{Float64}, alpha::Float64, x::Vector{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dsyr_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, x, BlasIntType(1), A, lda)
    return A
end

# SYR2: A = alpha*x*y' + alpha*y*x' + A (Rank-2 update of a symmetric matrix)
# s/dsyr2 (single/double precision)
function aocl_ssyr2!(uplo::Char, A::Matrix{Float32}, alpha::Float32, x::Vector{Float32}, y::Vector{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :ssyr2_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
    return A
end

function aocl_dsyr2!(uplo::Char, A::Matrix{Float64}, alpha::Float64, x::Vector{Float64}, y::Vector{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    call_external_function(handle, :dsyr2_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, x, BlasIntType(1), y, BlasIntType(1), A, lda)
    return A
end

# GBMV: y = alpha*A*x + beta*y (General Band Matrix-Vector product)
# s/dgbmv (single/double precision)
function aocl_sgbmv!(trans::Char, m::Int, n::Int, kl::Int, ku::Int, alpha::Float32, A::Matrix{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
    handle = max(m, n) > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for banded matrix is KL + KU + 1
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :sgbmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku), alpha,
                           A, lda, x, incx, beta, y, incy)
    return y
end

function aocl_dgbmv!(trans::Char, m::Int, n::Int, kl::Int, ku::Int, alpha::Float64, A::Matrix{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
    handle = max(m, n) > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for banded matrix is KL + KU + 1
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :dgbmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku), alpha,
                           A, lda, x, incx, beta, y, incy)
    return y
end

# SBMV: y = alpha*A*x + beta*y (Symmetric Band Matrix-Vector product)
# s/dsbmv (single/double precision)
function aocl_ssbmv!(uplo::Char, n::Int, k::Int, alpha::Float32, A::Matrix{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for symmetric band matrix is K + 1
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :ssbmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(k), alpha, A, lda, x, incx, beta, y, incy)
    return y
end

function aocl_dsbmv!(uplo::Char, n::Int, k::Int, alpha::Float64, A::Matrix{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for symmetric band matrix is K + 1
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :dsbmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(k), alpha, A, lda, x, incx, beta, y, incy)
    return y
end

# SPMV: y = alpha*A*x + beta*y (Symmetric Packed Matrix-Vector product)
# s/dspmv (single/double precision)
function aocl_sspmv!(uplo::Char, n::Int, alpha::Float32, AP::Vector{Float32}, x::Vector{Float32}, beta::Float32, y::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :sspmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, AP, x, incx, beta, y, incy)
    return y
end

function aocl_dspmv!(uplo::Char, n::Int, alpha::Float64, AP::Vector{Float64}, x::Vector{Float64}, beta::Float64, y::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :dspmv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), alpha, AP, x, incx, beta, y, incy)
    return y
end

# TBMV: x = A*x or x = A'*x (Triangular Band Matrix-Vector Multiply)
# s/dtbmv (single/double precision)
function aocl_stbmv!(uplo::Char, trans::Char, diag::Char, n::Int, k::Int, A::Matrix{Float32}, x::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for triangular band matrix is K + 1
    incx = BlasIntType(1)
    call_external_function(handle, :stbmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), BlasIntType(k), A, lda, x, incx)
    return x
end

function aocl_dtbmv!(uplo::Char, trans::Char, diag::Char, n::Int, k::Int, A::Matrix{Float64}, x::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for triangular band matrix is K + 1
    incx = BlasIntType(1)
    call_external_function(handle, :dtbmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), BlasIntType(k), A, lda, x, incx)
    return x
end

# TPMV: x = A*x or x = A'*x (Triangular Packed Matrix-Vector Multiply)
# s/dtpmv (single/double precision)
function aocl_stpmv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float32}, x::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :stpmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), AP, x, incx)
    return x
end

function aocl_dtpmv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float64}, x::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :dtpmv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), AP, x, incx)
    return x
end

# TPSV: Solve A*x = b or A'*x = b (Triangular Packed Solve)
# s/dtpsv (single/double precision)
function aocl_stpsv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float32}, x::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :stpsv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), AP, x, incx)
    return x
end

function aocl_dtpsv!(uplo::Char, trans::Char, diag::Char, n::Int, AP::Vector{Float64}, x::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :dtpsv_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, diag, BlasIntType(n), AP, x, incx)
    return x
end

# SPR: A = alpha*x*x' + A (Rank-1 update of a symmetric packed matrix)
# s/dspr (single/double precision)
function aocl_sspr!(uplo::Char, n::Int, alpha::Float32, x::Vector{Float32}, AP::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :sspr_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}),
                           uplo, BlasIntType(n), alpha, x, incx, AP)
    return AP
end

function aocl_dspr!(uplo::Char, n::Int, alpha::Float64, x::Vector{Float64}, AP::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    call_external_function(handle, :dspr_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}),
                           uplo, BlasIntType(n), alpha, x, incx, AP)
    return AP
end

# SPR2: A = alpha*x*y' + alpha*y*x' + A (Rank-2 update of a symmetric packed matrix)
# s/dspr2 (single/double precision)
function aocl_sspr2!(uplo::Char, n::Int, alpha::Float32, x::Vector{Float32}, y::Vector{Float32}, AP::Vector{Float32})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :sspr2_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}),
                           uplo, BlasIntType(n), alpha, x, incx, y, incy, AP)
    return AP
end

function aocl_dspr2!(uplo::Char, n::Int, alpha::Float64, x::Vector{Float64}, y::Vector{Float64}, AP::Vector{Float64})
    handle = n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    incx = BlasIntType(1)
    incy = BlasIntType(1)
    call_external_function(handle, :dspr2_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}),
                           uplo, BlasIntType(n), alpha, x, incx, y, incy, AP)
    return AP
end


# --- BLAS Level 3 Functions (Matrix-Matrix Operations) ---

# GEMM: C = alpha*A*B + beta*C
# s/dgemm (single/double precision)
function aocl_sgemm!(transa::Char, transb::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
    M, N_ = size(C)
    K = (transa == 'N' || transa == 'n') ? size(A, 2) : size(A, 1)
    handle = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? Int64 : Int32
    m = BlasIntType(M)
    n = BlasIntType(N_)
    k = BlasIntType(K)
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))

    call_external_function(handle, :sgemm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           transa, transb, m, n, k,
                           alpha, A, lda, B, ldb,
                           beta, C, ldc)
    return C
end

function aocl_dgemm!(transa::Char, transb::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
    M, N_ = size(C)
    K = (transa == 'N' || transa == 'n') ? size(A, 2) : size(A, 1)
    handle = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = M > typemax(Int32) || N_ > typemax(Int32) || K > typemax(Int32) ? Int64 : Int32
    m = BlasIntType(M)
    n = BlasIntType(N_)
    k = BlasIntType(K)
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))

    call_external_function(handle, :dgemm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           transa, transb, m, n, k,
                           alpha, A, lda, B, ldb,
                           beta, C, ldc)
    return C
end

# TRMM: B = alpha*A*B or B = alpha*B*A (Triangular Matrix-Matrix Multiply)
# s/dtrmm (single/double precision)
function aocl_strmm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32)
    m, n = size(B)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    call_external_function(handle, :strmm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
                           alpha, A, lda, B, ldb)
    return B
end

function aocl_dtrmm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64)
    m, n = size(B)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    call_external_function(handle, :dtrmm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
                           alpha, A, lda, B, ldb)
    return B
end

# TRSM: Solve A*X = B or X*A = B (Triangular Solve for multiple right-hand sides)
# s/dtrsm (single/double precision)
function aocl_strsm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32)
    m, n = size(B)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    call_external_function(handle, :strsm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}),
                           side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
                           alpha, A, lda, B, ldb)
    return B
end

function aocl_dtrsm!(side::Char, uplo::Char, transa::Char, diag::Char, B::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64)
    m, n = size(B)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    call_external_function(handle, :dtrsm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}),
                           side, uplo, transa, diag, BlasIntType(m), BlasIntType(n),
                           alpha, A, lda, B, ldb)
    return B
end

# SYMM: C = alpha*A*B + beta*C (Symmetric Matrix-Matrix Multiply)
# s/dsymm (single/double precision)
function aocl_ssymm!(side::Char, uplo::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
    m, n = size(C)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :ssymm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end

function aocl_dsymm!(side::Char, uplo::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
    m, n = size(C)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :dsymm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end

# SYRK: C = alpha*A*A' + beta*C (Symmetric Rank-k update)
# s/dsyrk (single/double precision)
function aocl_ssyrk!(uplo::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, alpha::Float32, beta::Float32)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :ssyrk_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
    return C
end

function aocl_dsyrk!(uplo::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, alpha::Float64, beta::Float64)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :dsyrk_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
    return C
end

# SYR2K: C = alpha*A*B' + alpha*B*A' + beta*C (Symmetric Rank-2k update)
# s/dsyr2k (single/double precision)
function aocl_ssyr2k!(uplo::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, B::Matrix{Float32}, alpha::Float32, beta::Float32)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :ssyr2k_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{Float32}, Ptr{Float32}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end

function aocl_dsyr2k!(uplo::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}, alpha::Float64, beta::Float64)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :dsyr2k_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{Float64}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end

# HEMM: C = alpha*A*B + beta*C (Hermitian Matrix-Matrix Multiply)
# zhemm (double precision complex)
function aocl_zhemm!(side::Char, uplo::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, B::Matrix{ComplexF64}, alpha::ComplexF64, beta::ComplexF64)
    m, n = size(C)
    handle = m > typemax(Int32) || n > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :zhemm_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{ComplexF64},
                            Ptr{ComplexF64}, Ref{BlasIntType}, Ptr{ComplexF64}, Ref{BlasIntType},
                            Ref{ComplexF64}, Ptr{ComplexF64}, Ref{BlasIntType}),
                           side, uplo, BlasIntType(m), BlasIntType(n), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end

# HERK: C = alpha*A*A' + beta*C (Hermitian Rank-k update)
# zherk (double precision complex)
function aocl_zherk!(uplo::Char, trans::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, alpha::Float64, beta::Float64)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :zherk_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64},
                            Ptr{ComplexF64}, Ref{BlasIntType}, Ref{Float64}, Ptr{ComplexF64}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, beta, C, ldc)
    return C
end

# HER2K: C = alpha*A*B' + alpha*B*A' + beta*C (Hermitian Rank-2k update)
# zher2k (double precision complex)
function aocl_zher2k!(uplo::Char, trans::Char, C::Matrix{ComplexF64}, A::Matrix{ComplexF64}, B::Matrix{ComplexF64}, alpha::ComplexF64, beta::Float64)
    n = size(C, 1)
    k = (trans == 'N' || trans == 'n') ? size(A, 2) : size(A, 1)
    handle = n > typemax(Int32) || k > typemax(Int32) ? blis_ilp64_handle : blis_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(max(1, stride(A,2)))
    ldb = BlasIntType(max(1, stride(B,2)))
    ldc = BlasIntType(max(1, stride(C,2)))
    call_external_function(handle, :zher2k_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{ComplexF64},
                            Ptr{ComplexF64}, Ref{BlasIntType}, Ptr{ComplexF64}, Ref{BlasIntType},
                            Ref{Float64}, Ptr{ComplexF64}, Ref{BlasIntType}),
                           uplo, trans, BlasIntType(n), BlasIntType(k), alpha, A, lda, B, ldb, beta, C, ldc)
    return C
end


# --- LAPACK Functions ---

# GESV: Solve linear system of equations AX = B
# s/dgesv (single/double precision)
function aocl_sgesv!(A::Matrix{Float32}, B::Matrix{Float32})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    ipiv = Vector{Int32}(undef, n) # IPIV is typically Int32 even for ILP64
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgesv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
    return A, B, ipiv, info[]
end

function aocl_dgesv!(A::Matrix{Float64}, B::Matrix{Float64})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    ipiv = Vector{Int32}(undef, n) # IPIV is typically Int32 even for ILP64
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgesv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
    return A, B, ipiv, info[]
end

# GETRF: LU factorization
# s/dgetrf (single/double precision)
function aocl_sgetrf!(A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ipiv = Vector{Int32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgetrf_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Int32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
    return A, ipiv, info[]
end

function aocl_dgetrf!(A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ipiv = Vector{Int32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgetrf_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Int32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
    return A, ipiv, info[]
end

# GETRI: Matrix inversion from LU factorization
# s/dgetri (single/double precision)
function aocl_sgetri!(A::Matrix{Float32}, ipiv::Vector{Int32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Placeholder, will be resized by LAPACK if LWORK=-1
    lwork = BlasIntType(-1) # Query optimal workspace size

    call_external_function(handle, :sgetri_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), A, lda, ipiv, work, lwork, info)

    if info[] == 0 && lwork[] == -1 # Check if query was successful
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgetri_, Cvoid,
                               (Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32},
                                Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(n), A, lda, ipiv, work, lwork, info)
    end
    return A, info[]
end

function aocl_dgetri!(A::Matrix{Float64}, ipiv::Vector{Int32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Placeholder
    lwork = BlasIntType(-1) # Query optimal workspace size

    call_external_function(handle, :dgetri_, Cvoid,
                           (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), A, lda, ipiv, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgetri_, Cvoid,
                               (Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32},
                                Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(n), A, lda, ipiv, work, lwork, info)
    end
    return A, info[]
end

# GETRS: Solve system of linear equations using LU factorization
# s/dgetrs (single/double precision)
function aocl_sgetrs!(trans::Char, A::Matrix{Float32}, B::Matrix{Float32}, ipiv::Vector{Int32})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgetrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           trans, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
    return B, info[]
end

function aocl_dgetrs!(trans::Char, A::Matrix{Float64}, B::Matrix{Float64}, ipiv::Vector{Int32})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgetrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           trans, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
    return B, info[]
end


# GEQRF: QR factorization
# s/dgeqrf (single/double precision)
function aocl_sgeqrf!(A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    tau = Vector{Float32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sgeqrf_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgeqrf_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
    end
    return A, tau, info[]
end

function aocl_dgeqrf!(A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    tau = Vector{Float64}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dgeqrf_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgeqrf_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), A, lda, tau, work, lwork, info)
    end
    return A, tau, info[]
end

# SYEV: Eigenvalues and (optionally) eigenvectors of a real symmetric matrix
# s/dsyev (single/double precision)
function aocl_ssyev!(jobz::Char, uplo::Char, A::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    w = Vector{Float32}(undef, n) # Eigenvalues
    info = Ref{BlasIntType}(0)

    # Query optimal workspace size
    work = Vector{Float32}(undef, 1)
    lwork = BlasIntType(-1)

    call_external_function(handle, :ssyev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :ssyev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
    end
    return A, w, info[] # A contains eigenvectors if jobz='V'
end

function aocl_dsyev!(jobz::Char, uplo::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    w = Vector{Float64}(undef, n) # Eigenvalues
    info = Ref{BlasIntType}(0)

    # Query optimal workspace size
    work = Vector{Float64}(undef, 1)
    lwork = BlasIntType(-1)

    call_external_function(handle, :dsyev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dsyev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobz, uplo, BlasIntType(n), A, lda, w, work, lwork, info)
    end
    return A, w, info[] # A contains eigenvectors if jobz='V'
end

# POTRF: Cholesky factorization of a symmetric positive definite matrix
# s/dpotrf (single/double precision)
function aocl_spotrf!(uplo::Char, A::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :spotrf_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), A, lda, info)
    return A, info[]
end

function aocl_dpotrf!(uplo::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dpotrf_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), A, lda, info)
    return A, info[]
end

# POTRS: Solve A*X = B for symmetric positive definite A using Cholesky factorization
# s/dpotrs (single/double precision)
function aocl_spotrs!(uplo::Char, A::Matrix{Float32}, B::Matrix{Float32})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :spotrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
    return B, info[]
end

function aocl_dpotrs!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dpotrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
    return B, info[]
end

# POTRI: Compute the inverse of a symmetric positive definite matrix using Cholesky factorization
# s/dpotri (single/double precision)
function aocl_spotri!(uplo::Char, A::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :spotri_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), A, lda, info)
    return A, info[]
end

function aocl_dpotri!(uplo::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dpotri_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), A, lda, info)
    return A, info[]
end

# ORGQR: Generate Q from QR factorization
# s/dorgqr (single/double precision)
function aocl_sorgqr!(A::Matrix{Float32}, tau::Vector{Float32})
    m, n = size(A)
    k = length(tau)
    handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sorgqr_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sorgqr_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
    end
    return A, info[]
end

function aocl_dorgqr!(A::Matrix{Float64}, tau::Vector{Float64})
    m, n = size(A)
    k = length(tau)
    handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dorgqr_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dorgqr_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(k), A, lda, tau, work, lwork, info)
    end
    return A, info[]
end

# UNMQR: Apply Q from QR factorization to a matrix
# s/dunmqr (single/double precision)
function aocl_sunmqr!(side::Char, trans::Char, C::Matrix{Float32}, A::Matrix{Float32}, tau::Vector{Float32})
    m, n = size(C)
    k = length(tau)
    handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1))
    ldc = BlasIntType(m)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sunmqr_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
                           A, lda, tau, C, ldc, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sunmqr_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
                               A, lda, tau, C, ldc, work, lwork, info)
    end
    return C, info[]
end

function aocl_dunmqr!(side::Char, trans::Char, C::Matrix{Float64}, A::Matrix{Float64}, tau::Vector{Float64})
    m, n = size(C)
    k = length(tau)
    handle = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) || k > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1))
    ldc = BlasIntType(m)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dunmqr_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
                           A, lda, tau, C, ldc, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dunmqr_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               side, trans, BlasIntType(m), BlasIntType(n), BlasIntType(k),
                               A, lda, tau, C, ldc, work, lwork, info)
    end
    return C, info[]
end

# GESVD: Singular Value Decomposition
# s/dgesvd (single/double precision)
function aocl_sgesvd!(jobu::Char, jobvt::Char, A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    s = Vector{Float32}(undef, min(m, n)) # Singular values

    # Determine dimensions for U and VT based on jobu/jobvt
    ldu = BlasIntType(1)
    if jobu == 'A' || jobu == 'a'
        ldu = BlasIntType(m)
    elseif jobu == 'S' || jobu == 's'
        ldu = BlasIntType(m)
    end
    U = Matrix{Float32}(undef, ldu, (jobu == 'A' || jobu == 'a') ? m : (jobu == 'S' || jobu == 's' ? min(m, n) : 1))

    ldvt = BlasIntType(1)
    if jobvt == 'A' || jobvt == 'a'
        ldvt = BlasIntType(n)
    elseif jobvt == 'O' || jobvt == 'o'
        ldvt = BlasIntType(min(m, n))
    end
    VT = Matrix{Float32}(undef, ldvt, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sgesvd_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobu, jobvt, BlasIntType(m), BlasIntType(n),
                           A, lda, s, U, ldu, VT, ldvt, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgesvd_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobu, jobvt, BlasIntType(m), BlasIntType(n),
                               A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
    end
    return U, s, VT, info[]
end

function aocl_dgesvd!(jobu::Char, jobvt::Char, A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    s = Vector{Float64}(undef, min(m, n)) # Singular values

    ldu = BlasIntType(1)
    if jobu == 'A' || jobu == 'a'
        ldu = BlasIntType(m)
    elseif jobu == 'S' || jobu == 's'
        ldu = BlasIntType(m)
    end
    U = Matrix{Float64}(undef, ldu, (jobu == 'A' || jobu == 'a') ? m : (jobu == 'S' || jobu == 's' ? min(m, n) : 1))

    ldvt = BlasIntType(1)
    if jobvt == 'A' || jobvt == 'a'
        ldvt = BlasIntType(n)
    elseif jobvt == 'O' || jobvt == 'o'
        ldvt = BlasIntType(min(m, n))
    end
    VT = Matrix{Float64}(undef, ldvt, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dgesvd_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobu, jobvt, BlasIntType(m), BlasIntType(n),
                           A, lda, s, U, ldu, VT, ldvt, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgesvd_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobu, jobvt, BlasIntType(m), BlasIntType(n),
                               A, lda, s, U, ldu, VT, ldvt, work, lwork, info)
    end
    return U, s, VT, info[]
end

# LASET: Initializes a matrix to zero or to a constant
# s/dlaset (single/double precision)
function aocl_slaset!(uplo::Char, m::Int, n::Int, alpha::Float32, beta::Float32, A::Matrix{Float32})
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1))
    call_external_function(handle, :slaset_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float32}, Ref{Float32},
                            Ptr{Float32}, Ref{BlasIntType}),
                           uplo, BlasIntType(m), BlasIntType(n), alpha, beta, A, lda)
    return A
end

function aocl_dlaset!(uplo::Char, m::Int, n::Int, alpha::Float64, beta::Float64, A::Matrix{Float64})
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1))
    call_external_function(handle, :dlaset_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{Float64}, Ref{Float64},
                            Ptr{Float64}, Ref{BlasIntType}),
                           uplo, BlasIntType(m), BlasIntType(n), alpha, beta, A, lda)
    return A
end

# GEEV: Compute eigenvalues and, optionally, eigenvectors of a general matrix
# s/dgeev (single/double precision)
function aocl_sgeev!(jobvl::Char, jobvr::Char, A::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    wr = Vector{Float32}(undef, n)
    wi = Vector{Float32}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float32}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float32}(undef, ldvr, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sgeev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgeev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
    end
    return wr, wi, VL, VR, info[]
end

function aocl_dgeev!(jobvl::Char, jobvr::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    wr = Vector{Float64}(undef, n)
    wi = Vector{Float64}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float64}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float64}(undef, ldvr, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dgeev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgeev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobvl, jobvr, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr, work, lwork, info)
    end
    return wr, wi, VL, VR, info[]
end

# GELSS: Least Squares solution using SVD
# s/dgelss (single/double precision)
function aocl_sgelss!(A::Matrix{Float32}, B::Matrix{Float32}, rcond::Float32)
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n))
    s = Vector{Float32}(undef, min(m, n)) # Singular values
    rank = Ref{BlasIntType}(0)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sgelss_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           s, rcond, rank, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgelss_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               s, rcond, rank, work, lwork, info)
    end
    return B, s, rank[], info[]
end

function aocl_dgelss!(A::Matrix{Float64}, B::Matrix{Float64}, rcond::Float64)
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n))
    s = Vector{Float64}(undef, min(m, n)) # Singular values
    rank = Ref{BlasIntType}(0)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dgelss_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           s, rcond, rank, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgelss_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               s, rcond, rank, work, lwork, info)
    end
    return B, s, rank[], info[]
end

# GELSD: Least Squares solution using SVD with divide and conquer
# s/dgelsd (single/double precision)
function aocl_sgelsd!(A::Matrix{Float32}, B::Matrix{Float32}, rcond::Float32)
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n))
    s = Vector{Float32}(undef, min(m, n)) # Singular values
    rank = Ref{BlasIntType}(0)
    info = Ref{BlasIntType}(0)

    # Query optimal workspace sizes
    work = Vector{Float32}(undef, 1)
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1) # IWORK size depends on M, N, NRHS

    call_external_function(handle, :sgelsd_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           s, rcond, rank, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        # The optimal IWORK size is not returned by query, it's typically fixed or derived.
        # For GELSD, IWORK size is typically max(1, 3*MIN(M,N)*SMALSIZ + 11*MIN(M,N)) where SMALSIZ is a small constant.
        # For simplicity, we'll use a heuristic or a sufficiently large size if not querying.
        # For a proper wrapper, one might need to calculate this based on LAPACK's formula.
        # For now, we'll assume a reasonable fixed size or rely on the query for LWORK.
        # Since IWORK is not queried for size with LWORK=-1, we'll use a common safe upper bound.
        iwork_size = max(1, 3 * min(m,n) * 10 + 11 * min(m,n)) # Heuristic based on common LAPACK impl.
        iwork = Vector{BlasIntType}(undef, iwork_size)
        work = Vector{Float32}(undef, lwork)

        call_external_function(handle, :sgelsd_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               s, rcond, rank, work, lwork, iwork, info)
    end
    return B, s, rank[], info[]
end

function aocl_dgelsd!(A::Matrix{Float64}, B::Matrix{Float64}, rcond::Float64)
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n))
    s = Vector{Float64}(undef, min(m, n)) # Singular values
    rank = Ref{BlasIntType}(0)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1)

    call_external_function(handle, :dgelsd_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           s, rcond, rank, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        iwork_size = max(1, 3 * min(m,n) * 10 + 11 * min(m,n)) # Heuristic
        iwork = Vector{BlasIntType}(undef, iwork_size)
        work = Vector{Float64}(undef, lwork)

        call_external_function(handle, :dgelsd_, Cvoid,
                               (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               s, rcond, rank, work, lwork, iwork, info)
    end
    return B, s, rank[], info[]
end

# GELS: Least Squares solution using QR or LU factorization
# s/dgels (single/double precision)
function aocl_sgels!(trans::Char, A::Matrix{Float32}, B::Matrix{Float32})
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n)) # LDB must be at least max(M,N) for GELS
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sgels_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sgels_, Cvoid,
                               (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               work, lwork, info)
    end
    return A, B, info[]
end

function aocl_dgels!(trans::Char, A::Matrix{Float64}, B::Matrix{Float64})
    m, n = size(A)
    nrhs = size(B, 2)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ldb = BlasIntType(max(m, n)) # LDB must be at least max(M,N) for GELS
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dgels_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                           work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dgels_, Cvoid,
                               (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               trans, BlasIntType(m), BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb,
                               work, lwork, info)
    end
    return A, B, info[]
end

# GEEQU: Estimate the condition number of a general matrix
# s/dgeequ (single/double precision)
function aocl_sgeequ!(A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    r = Vector{Float32}(undef, m)
    c = Vector{Float32}(undef, n)
    row_cond = Ref{Float32}(0.0)
    col_cond = Ref{Float32}(0.0)
    amax = Ref{Float32}(0.0)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgeequ_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ref{Float32}, Ref{Float32}, Ref{Float32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, r, c, row_cond, col_cond, amax, info)
    return r, c, row_cond[], col_cond[], amax[], info[]
end

function aocl_dgeequ!(A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    r = Vector{Float64}(undef, m)
    c = Vector{Float64}(undef, n)
    row_cond = Ref{Float64}(0.0)
    col_cond = Ref{Float64}(0.0)
    amax = Ref{Float64}(0.0)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgeequ_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ref{Float64}, Ref{Float64}, Ref{Float64}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, r, c, row_cond, col_cond, amax, info)
    return r, c, row_cond[], col_cond[], amax[], info[]
end

# GECON: Estimate the reciprocal of the condition number of a general matrix
# s/dgecon (single/double precision)
function aocl_sgecon!(norm_type::Char, A::Matrix{Float32}, anorm::Float32)
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    rcond = Ref{Float32}(0.0)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 4*n) # Workspace size
    iwork = Vector{BlasIntType}(undef, n) # Integer workspace

    call_external_function(handle, :sgecon_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType}, Ref{Float32},
                            Ref{Float32}, Ptr{Float32}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           norm_type, BlasIntType(n), A, lda, anorm, rcond, work, iwork, info)
    return rcond[], info[]
end

function aocl_dgecon!(norm_type::Char, A::Matrix{Float64}, anorm::Float64)
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    rcond = Ref{Float64}(0.0)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 4*n) # Workspace size
    iwork = Vector{BlasIntType}(undef, n) # Integer workspace

    call_external_function(handle, :dgecon_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{Float64},
                            Ref{Float64}, Ptr{Float64}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           norm_type, BlasIntType(n), A, lda, anorm, rcond, work, iwork, info)
    return rcond[], info[]
end

# GESDD: Singular Value Decomposition using divide and conquer
# s/dgesdd (single/double precision)
function aocl_sgesdd!(jobz::Char, A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    s = Vector{Float32}(undef, min(m, n)) # Singular values

    # Determine dimensions for U and VT based on jobz
    ldu = BlasIntType(1)
    if jobz == 'A' || jobz == 'a'
        ldu = BlasIntType(m)
    elseif jobz == 'S' || jobz == 's' || jobz == 'O' || jobz == 'o'
        ldu = BlasIntType(m)
    end
    U = Matrix{Float32}(undef, ldu, (jobz == 'A' || jobz == 'a') ? m : (jobz == 'S' || jobz == 's' ? min(m, n) : 1))

    ldvt = BlasIntType(1)
    if jobz == 'A' || jobz == 'a'
        ldvt = BlasIntType(n)
    elseif jobz == 'O' || jobz == 'o'
        ldvt = BlasIntType(min(m, n))
    elseif jobz == 'S' || jobz == 's'
        ldvt = BlasIntType(min(m, n))
    end
    VT = Matrix{Float32}(undef, ldvt, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size

    call_external_function(handle, :sgesdd_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        iwork_size = BlasIntType(round(Int, iwork[1])) # If iwork is also queried
        # If iwork is not queried, a heuristic is needed. For GESDD, it's typically 8*min(M,N) for 'A' or 'S', 5*min(M,N) for 'O'.
        if iwork_size == 1 && (jobz == 'A' || jobz == 'S' || jobz == 'a' || jobz == 's')
            iwork_size = BlasIntType(8 * min(m,n))
        elseif iwork_size == 1 && (jobz == 'O' || jobz == 'o')
            iwork_size = BlasIntType(5 * min(m,n))
        end
        work = Vector{Float32}(undef, lwork)
        iwork = Vector{BlasIntType}(undef, iwork_size)

        call_external_function(handle, :sgesdd_, Cvoid,
                               (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
    end
    return U, s, VT, info[]
end

function aocl_dgesdd!(jobz::Char, A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    s = Vector{Float64}(undef, min(m, n)) # Singular values

    ldu = BlasIntType(1)
    if jobz == 'A' || jobz == 'a'
        ldu = BlasIntType(m)
    elseif jobz == 'S' || jobz == 's' || jobz == 'O' || jobz == 'o'
        ldu = BlasIntType(m)
    end
    U = Matrix{Float64}(undef, ldu, (jobz == 'A' || jobz == 'a') ? m : (jobz == 'S' || jobz == 's' ? min(m, n) : 1))

    ldvt = BlasIntType(1)
    if jobz == 'A' || jobz == 'a'
        ldvt = BlasIntType(n)
    elseif jobz == 'O' || jobz == 'o'
        ldvt = BlasIntType(min(m, n))
    elseif jobz == 'S' || jobz == 's'
        ldvt = BlasIntType(min(m, n))
    end
    VT = Matrix{Float64}(undef, ldvt, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size

    call_external_function(handle, :dgesdd_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        iwork_size = BlasIntType(round(Int, iwork[1]))
        if iwork_size == 1 && (jobz == 'A' || jobz == 'S' || jobz == 'a' || jobz == 's')
            iwork_size = BlasIntType(8 * min(m,n))
        elseif iwork_size == 1 && (jobz == 'O' || jobz == 'o')
            iwork_size = BlasIntType(5 * min(m,n))
        end
        work = Vector{Float64}(undef, lwork)
        iwork = Vector{BlasIntType}(undef, iwork_size)

        call_external_function(handle, :dgesdd_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               jobz, BlasIntType(m), BlasIntType(n), A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info)
    end
    return U, s, VT, info[]
end

# GEEVX: Compute eigenvalues and, optionally, eigenvectors of a general matrix
# s/dgeevx (single/double precision)
function aocl_sgeevx!(balanc::Char, jobvl::Char, jobvr::Char, sense::Char, A::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    wr = Vector{Float32}(undef, n)
    wi = Vector{Float32}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float32}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float32}(undef, ldvr, n)

    ilo = Ref{BlasIntType}(0)
    ihi = Ref{BlasIntType}(0)
    scale = Vector{Float32}(undef, n)
    abnrm = Ref{Float32}(0.0)
    rconde = Vector{Float32}(undef, n)
    rcondv = Vector{Float32}(undef, n)
    info = Ref{BlasIntType}(0)

    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size

    call_external_function(handle, :sgeevx_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32},
                            Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
                           ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        iwork_size = BlasIntType(round(Int, iwork[1]))
        work = Vector{Float32}(undef, lwork)
        iwork = Vector{BlasIntType}(undef, iwork_size)

        call_external_function(handle, :sgeevx_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{Float32},
                                Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
                               ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
    end
    return wr, wi, VL, VR, ilo[], ihi[], scale, abnrm[], rconde, rcondv, info[]
end

function aocl_dgeevx!(balanc::Char, jobvl::Char, jobvr::Char, sense::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    wr = Vector{Float64}(undef, n)
    wi = Vector{Float64}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float64}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float64}(undef, ldvr, n)

    ilo = Ref{BlasIntType}(0)
    ihi = Ref{BlasIntType}(0)
    scale = Vector{Float64}(undef, n)
    abnrm = Ref{Float64}(0.0)
    rconde = Vector{Float64}(undef, n)
    rcondv = Vector{Float64}(undef, n)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)
    iwork = Vector{BlasIntType}(undef, 1) # Query optimal integer workspace size

    call_external_function(handle, :dgeevx_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64},
                            Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                           balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
                           ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        iwork_size = BlasIntType(round(Int, iwork[1]))
        work = Vector{Float64}(undef, lwork)
        iwork = Vector{BlasIntType}(undef, iwork_size)

        call_external_function(handle, :dgeevx_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{Float64},
                                Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{BlasIntType}, Ptr{BlasIntType}, Ref{BlasIntType}),
                               balanc, jobvl, jobvr, sense, BlasIntType(n), A, lda, wr, wi, VL, ldvl, VR, ldvr,
                               ilo, ihi, scale, abnrm, rconde, rcondv, work, lwork, iwork, info)
    end
    return wr, wi, VL, VR, ilo[], ihi[], scale, abnrm[], rconde, rcondv, info[]
end

# GETF2: LU factorization (unblocked algorithm)
# s/dgetf2 (single/double precision)
function aocl_sgetf2!(A::Matrix{Float32})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ipiv = Vector{Int32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgetf2_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Int32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
    return A, ipiv, info[]
end

function aocl_dgetf2!(A::Matrix{Float64})
    m, n = size(A)
    handle = m > typemax(Int32) || n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = m > typemax(Int32) || n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(m)
    ipiv = Vector{Int32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgetf2_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Int32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), A, lda, ipiv, info)
    return A, ipiv, info[]
end

# GBSV: Solve general band system of linear equations
# s/dgbsv (single/double precision)
function aocl_sgbsv!(A::Matrix{Float32}, B::Matrix{Float32}, kl::Int, ku::Int)
    n = size(A, 2) # N is the number of columns in A (matrix is N x N)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
    ldb = BlasIntType(n)
    ipiv = Vector{Int32}(undef, n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgbsv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
                           A, lda, ipiv, B, ldb, info)
    return A, B, ipiv, info[]
end

function aocl_dgbsv!(A::Matrix{Float64}, B::Matrix{Float64}, kl::Int, ku::Int)
    n = size(A, 2) # N is the number of columns in A (matrix is N x N)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
    ldb = BlasIntType(n)
    ipiv = Vector{Int32}(undef, n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgbsv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
                           A, lda, ipiv, B, ldb, info)
    return A, B, ipiv, info[]
end

# GTSV: Solve general tridiagonal system of linear equations
# s/dgtsv (single/double precision)
function aocl_sgtsv!(dl::Vector{Float32}, d::Vector{Float32}, du::Vector{Float32}, B::Matrix{Float32})
    n = length(d)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :sgtsv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(nrhs), dl, d, du, B, ldb, info)
    return dl, d, du, B, info[]
end

function aocl_dgtsv!(dl::Vector{Float64}, d::Vector{Float64}, du::Vector{Float64}, B::Matrix{Float64})
    n = length(d)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgtsv_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           BlasIntType(n), BlasIntType(nrhs), dl, d, du, B, ldb, info)
    return dl, d, du, B, info[]
end

# GGEV: Compute eigenvalues and, optionally, eigenvectors of a general matrix pencil (A,B)
# s/dggev (single/double precision)
function aocl_sggev!(jobvl::Char, jobvr::Char, A::Matrix{Float32}, B::Matrix{Float32})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    alphar = Vector{Float32}(undef, n)
    alphai = Vector{Float32}(undef, n)
    beta = Vector{Float32}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float32}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float32}(undef, ldvr, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float32}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :sggev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
                            Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                            Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
                           VL, ldvl, VR, ldvr, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float32}(undef, lwork)
        call_external_function(handle, :sggev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ptr{Float32}, Ptr{Float32},
                                Ptr{Float32}, Ref{BlasIntType}, Ptr{Float32}, Ref{BlasIntType},
                                Ptr{Float32}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
                               VL, ldvl, VR, ldvr, work, lwork, info)
    end
    return alphar, alphai, beta, VL, VR, info[]
end

function aocl_dggev!(jobvl::Char, jobvr::Char, A::Matrix{Float64}, B::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    alphar = Vector{Float64}(undef, n)
    alphai = Vector{Float64}(undef, n)
    beta = Vector{Float64}(undef, n)

    ldvl = BlasIntType(1)
    if jobvl == 'V' || jobvl == 'v'
        ldvl = BlasIntType(n)
    end
    VL = Matrix{Float64}(undef, ldvl, n)

    ldvr = BlasIntType(1)
    if jobvr == 'V' || jobvr == 'v'
        ldvr = BlasIntType(n)
    end
    VR = Matrix{Float64}(undef, ldvr, n)

    info = Ref{BlasIntType}(0)
    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dggev_, Cvoid,
                           (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
                           VL, ldvl, VR, ldvr, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dggev_, Cvoid,
                               (Ref{UInt8}, Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
                                Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               jobvl, jobvr, BlasIntType(n), A, lda, B, ldb, alphar, alphai, beta,
                               VL, ldvl, VR, ldvr, work, lwork, info)
    end
    return alphar, alphai, beta, VL, VR, info[]
end

# POSV: Solves AX = B for symmetric positive definite A
# dposv (double precision)
function aocl_dposv!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dposv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, B, ldb, info)
    return A, B, info[]
end

# SYTRF: Computes the Bunch-Kaufman factorization of a real symmetric indefinite matrix
# dsytrf (double precision)
function aocl_dsytrf!(uplo::Char, A::Matrix{Float64})
    n = size(A, 1)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ipiv = Vector{BlasIntType}(undef, n) # IPIV for SYTRF is BlasIntType
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dsytrf_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), A, lda, ipiv, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dsytrf_, Cvoid,
                               (Ref{UInt8}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               uplo, BlasIntType(n), A, lda, ipiv, work, lwork, info)
    end
    return A, ipiv, info[]
end

# SYTRS: Solves a system of linear equations using the Bunch-Kaufman factorization
# dsytrs (double precision)
function aocl_dsytrs!(uplo::Char, A::Matrix{Float64}, ipiv::Vector{BlasInt}, B::Matrix{Float64})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dsytrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, info)
    return B, info[]
end

# SYSV: Solves AX = B for symmetric indefinite A
# dsysv (double precision)
function aocl_dsysv!(uplo::Char, A::Matrix{Float64}, B::Matrix{Float64})
    n = size(A, 1)
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(n)
    ldb = BlasIntType(n)
    ipiv = Vector{BlasIntType}(undef, n)
    info = Ref{BlasIntType}(0)

    work = Vector{Float64}(undef, 1) # Query optimal workspace size
    lwork = BlasIntType(-1)

    call_external_function(handle, :dsysv_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                            Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, work, lwork, info)

    if info[] == 0 && lwork[] == -1
        lwork = BlasIntType(round(Int, work[1]))
        work = Vector{Float64}(undef, lwork)
        call_external_function(handle, :dsysv_, Cvoid,
                               (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType},
                                Ptr{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                               uplo, BlasIntType(n), BlasIntType(nrhs), A, lda, ipiv, B, ldb, work, lwork, info)
    end
    return A, B, ipiv, info[]
end

# GBTRF: Computes the LU factorization of a general band matrix
# dgbtrf (double precision)
function aocl_dgbtrf!(A::Matrix{Float64}, m::Int, n::Int, kl::Int, ku::Int)
    handle = max(m, n) > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = max(m, n) > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA for band matrix is KL + KU + 1
    ipiv = Vector{Int32}(undef, min(m, n))
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgbtrf_, Cvoid,
                           (Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ref{BlasIntType}),
                           BlasIntType(m), BlasIntType(n), BlasIntType(kl), BlasIntType(ku),
                           A, lda, ipiv, info)
    return A, ipiv, info[]
end

# GBTRS: Solves a system of linear equations using the LU factorization of a general band matrix
# dgbtrs (double precision)
function aocl_dgbtrs!(trans::Char, A::Matrix{Float64}, ipiv::Vector{Int32}, B::Matrix{Float64}, kl::Int, ku::Int)
    n = size(A, 2) # A is N x N, stored in band format
    nrhs = size(B, 2)
    handle = n > typemax(Int32) ? flame_ilp64_handle : flame_lp64_handle
    BlasIntType = n > typemax(Int32) ? Int64 : Int32
    lda = BlasIntType(size(A, 1)) # LDA = KL + KU + 1
    ldb = BlasIntType(n)
    info = Ref{BlasIntType}(0)

    call_external_function(handle, :dgbtrs_, Cvoid,
                           (Ref{UInt8}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType}, Ref{BlasIntType},
                            Ptr{Float64}, Ref{BlasIntType}, Ptr{Int32}, Ptr{Float64}, Ref{BlasIntType}, Ref{BlasIntType}),
                           trans, BlasIntType(n), BlasIntType(kl), BlasIntType(ku), BlasIntType(nrhs),
                           A, lda, ipiv, B, ldb, info)
    return B, info[]
end