Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import absolute_import, print_function, division
- import numpy as np
- import theano
- from theano import Op
- import theano.tensor as T
- from theano.gpuarray.basic_ops import (gpu_contiguous, as_gpuarray_variable,
- infer_context_name)
- from theano.gpuarray.type import get_context
- try:
- import pygpu
- pygpu_available = True
- except ImportError:
- pygpu_available = False
- try:
- import pycuda.driver
- pycuda_available = True
- except ImportError:
- pycuda_available = False
- try:
- import skcuda
- import skcuda.cusolver as solver
- scikits_cuda_available = True
- except (ImportError, Exception):
- scikits_cuda_available = False
- try:
- import scipy.linalg
- imported_scipy = True
- except ImportError:
- # some ops (e.g. Cholesky, Solve, A_Xinv_b) won't work
- imported_scipy = False
- # def cusolver_solve(a, b, d, n, overwrite_a=False, overwrite_b=False):
- # a = a if overwrite_a else a.copy()
- # b = b if overwrite_b else b.copy()
- # h = solver.cusolverDnCreate()
- # # Set up work buffers:
- # Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, a.gpudata, d)
- # workspace_gpu = gpuarray.zeros(Lwork, np.float32)
- # devipiv_gpu = gpuarray.zeros(d, np.int32)
- # devinfo_gpu = gpuarray.zeros(1, np.int32)
- # solver.cusolverDnSgetrf(h, d, d, a.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata, devinfo_gpu.gpudata)
- # solver.cusolverDnSgetrs(h, 0, d, n, a.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d, devinfo_gpu.gpudata)
- # solver.cusolverDnDestroy(h)
- # return b
- class CuSolve(Op):
- __props__ = ('context_name', 'overwrite_A', 'overwrite_b')
- _f16_ok = False
- def __init__(self, context_name=None,
- overwrite_A=False,
- overwrite_b=False):
- self.context_name = context_name
- self.overwrite_A = overwrite_A
- self.overwrite_b = overwrite_b
- def __repr__(self):
- return 'CuChoSolve{%s}' % str(self._props())
- def get_params(self, node):
- return get_context(self.context_name), self.overwrite_A, self.overwrite_b
- def infer_shape(self, node, shapes):
- Ashape, Bshape = shapes
- rows = Ashape[0]
- if len(Bshape) == 1: # b is a Vector
- return [(rows,)]
- else:
- cols = Bshape[1] # b is a Matrix
- return [(rows, cols)]
- def make_node(self, a, b):
- if not scikits_cuda_available:
- raise RuntimeError("skcuda is needed for CuFFTOp")
- if not pygpu_available:
- raise RuntimeError("pygpu is needed for CuFFTOp")
- if not pycuda_available:
- raise RuntimeError("pycuda is needed for CuFFTOp")
- a = gpu_contiguous(as_gpuarray_variable(a, infer_context_name(a)))
- b = gpu_contiguous(as_gpuarray_variable(b, infer_context_name(b)))
- assert a.dtype == "float32"
- assert a.ndim == 2
- assert b.dtype == "float32"
- assert b.ndim == 2
- return theano.Apply(self, [a, b], [b.type()])
- def make_thunk(self, node, storage_map, _, _2, impl=None):
- inputs = [storage_map[v] for v in node.inputs]
- outputs = [storage_map[v] for v in node.outputs]
- # Initiliaze cuda context to the input's.
- with node.inputs[0].type.context:
- skcuda.misc.init()
- # Make a copy of b to outputs as cusolve is performed always in place
- def thunk():
- A = inputs[0][0].copy()
- b_init = inputs[1][0]
- b = outputs[0]
- # a_shape = A.shape
- # b_shape = b.shape
- # A must be square
- # assert a_shape[0] == a_shape[1]
- # First dimension of b must match A
- # assert b_shape[0] == a_shape[0]
- # d, n = b_shape
- # only allocate if there is no previous allocation of the
- # right size.
- output_shape = b_init.shape
- d, n = output_shape
- if b[0] is None or b[0].shape != output_shape:
- b[0] = pygpu.zeros(output_shape,
- context=A.context,
- dtype='float32')
- with A.context:
- # Sync GPU variables before computation
- A.sync()
- b_init.sync()
- b[0][:] = b_init
- b = b[0]
- b.sync()
- # Initialize handle
- h = solver.cusolverDnCreate()
- # Set up work buffers:
- work = solver.cusolverDnSgetrf_bufferSize(h, d, d, A.gpudata, d)
- workspace_gpu = pygpu.zeros(work, context=A.context, dtype=np.float32)
- # workspace_gpu = gpuarray.zeros(work, np.float32)
- devipiv_gpu = pygpu.zeros(d, context=A.context, dtype=np.float32)
- # devipiv_gpu = gpuarray.zeros(d, np.int32)
- devinfo_gpu = pygpu.zeros(1, context=A.context, dtype=np.float32)
- # devinfo_gpu = gpuarray.zeros(1, np.int32)
- # Do LU decomposition
- solver.cusolverDnSgetrf(h, d, d, A.gpudata, d, workspace_gpu.gpudata, devipiv_gpu.gpudata,
- devinfo_gpu.gpudata)
- # Do the actual linear system solve
- solver.cusolverDnSgetrs(h, 0, d, n, A.gpudata, d, devipiv_gpu.gpudata, b.gpudata, d,
- devinfo_gpu.gpudata)
- # Destroy the handle
- solver.cusolverDnDestroy(h)
- # Sync results to ensure output contains completed computation
- pycuda.driver.Context.synchronize()
- thunk.inputs = inputs
- thunk.outputs = outputs
- thunk.lazy = False
- return thunk
- def cu_solve(a, b):
- return CuSolve()(a, b)
- a_in = theano.tensor.fmatrix()
- b_in = theano.tensor.fmatrix()
- c = cu_solve(a_in, b_in)
- f = theano.function([a_in, b_in], c)
- a_np = np.random.randn(20, 20).astype(theano.config.floatX)
- a_np = np.dot(a_np, a_np.T)
- b_np = np.random.randn(20, 5).astype(theano.config.floatX)
- sp_x = scipy.linalg.solve(a_np, b_np)
- cu_x = f(a_np, b_np)
- print(np.linalg.norm(np.dot(a_np, sp_x) - b_np))
- print(np.linalg.norm(np.dot(a_np, cu_x) - b_np))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement