from __future__ import absolute_import, print_function, division

import numpy as np
import theano
from theano import Op
import theano.tensor as T
from theano.gpuarray.basic_ops import (gpu_contiguous, as_gpuarray_variable,
                                       infer_context_name)
from theano.gpuarray.type import get_context

try:
    import pygpu
    pygpu_available = True
except ImportError:
    pygpu_available = False

try:
    import pycuda.driver
    pycuda_available = True
except ImportError:
    pycuda_available = False

try:
    import skcuda
    import skcuda.misc
    import skcuda.cusolver as solver
    scikits_cuda_available = True
except (ImportError, Exception):
    scikits_cuda_available = False

try:
    import scipy.linalg
    imported_scipy = True
except ImportError:
    # some ops (e.g. Cholesky, Solve, A_Xinv_b) won't work
    imported_scipy = False


# Earlier standalone draft of the same routine, kept commented for reference:
# def cusolver_solve(a, b, d, n, overwrite_a=False, overwrite_b=False):
#     a = a if overwrite_a else a.copy()
#     b = b if overwrite_b else b.copy()
#     h = solver.cusolverDnCreate()
#     # Set up work buffers:
#     Lwork = solver.cusolverDnSgetrf_bufferSize(h, d, d, a.gpudata, d)
#     workspace_gpu = gpuarray.zeros(Lwork, np.float32)
#     devipiv_gpu = gpuarray.zeros(d, np.int32)
#     devinfo_gpu = gpuarray.zeros(1, np.int32)
#     solver.cusolverDnSgetrf(h, d, d, a.gpudata, d, workspace_gpu.gpudata,
#                             devipiv_gpu.gpudata, devinfo_gpu.gpudata)
#     solver.cusolverDnSgetrs(h, 0, d, n, a.gpudata, d, devipiv_gpu.gpudata,
#                             b.gpudata, d, devinfo_gpu.gpudata)
#     solver.cusolverDnDestroy(h)
#     return b


class CuSolve(Op):
    __props__ = ('context_name', 'overwrite_A', 'overwrite_b')
    _f16_ok = False

    def __init__(self, context_name=None,
                 overwrite_A=False,
                 overwrite_b=False):
        self.context_name = context_name
        self.overwrite_A = overwrite_A
        self.overwrite_b = overwrite_b

    def __repr__(self):
        return 'CuSolve{%s}' % str(self._props())

    def get_params(self, node):
        return (get_context(self.context_name),
                self.overwrite_A, self.overwrite_b)

    def infer_shape(self, node, shapes):
        Ashape, Bshape = shapes
        rows = Ashape[0]
        if len(Bshape) == 1:  # b is a vector
            return [(rows,)]
        else:
            cols = Bshape[1]  # b is a matrix
            return [(rows, cols)]

    def make_node(self, a, b):
        if not scikits_cuda_available:
            raise RuntimeError("skcuda is needed for CuSolve")

        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuSolve")

        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuSolve")

        a = gpu_contiguous(as_gpuarray_variable(a, infer_context_name(a)))
        b = gpu_contiguous(as_gpuarray_variable(b, infer_context_name(b)))

        assert a.dtype == "float32"
        assert a.ndim == 2
        assert b.dtype == "float32"
        assert b.ndim == 2

        return theano.Apply(self, [a, b], [b.type()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize the CUDA context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()

        def thunk():
            # getrf factorizes in place, so work on a copy of A.
            A = inputs[0][0].copy()
            b_init = inputs[1][0]
            b = outputs[0]
            # A must be square and b's first dimension must match A's:
            # assert A.shape[0] == A.shape[1]
            # assert b_init.shape[0] == A.shape[0]
            # Only allocate if there is no previous allocation of the
            # right size.
            output_shape = b_init.shape
            d, n = output_shape
            if b[0] is None or b[0].shape != output_shape:
                b[0] = pygpu.zeros(output_shape,
                                   context=A.context,
                                   dtype='float32')
            with A.context:
                # Sync GPU variables before computation
                A.sync()
                b_init.sync()
                # Copy b into the output buffer, since cuSOLVER always
                # solves in place.
                b[0][:] = b_init
                b = b[0]
                b.sync()
                # Initialize the cuSOLVER handle
                h = solver.cusolverDnCreate()
                # Set up work buffers (ipiv and info are int32 arrays):
                work = solver.cusolverDnSgetrf_bufferSize(h, d, d,
                                                          A.gpudata, d)
                workspace_gpu = pygpu.zeros(work, context=A.context,
                                            dtype=np.float32)
                devipiv_gpu = pygpu.zeros(d, context=A.context,
                                          dtype=np.int32)
                devinfo_gpu = pygpu.zeros(1, context=A.context,
                                          dtype=np.int32)
                # Do the LU decomposition
                solver.cusolverDnSgetrf(h, d, d, A.gpudata, d,
                                        workspace_gpu.gpudata,
                                        devipiv_gpu.gpudata,
                                        devinfo_gpu.gpudata)
                # Do the actual linear-system solve
                solver.cusolverDnSgetrs(h, 0, d, n, A.gpudata, d,
                                        devipiv_gpu.gpudata, b.gpudata, d,
                                        devinfo_gpu.gpudata)
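                # Added sanity check, not in the original paste: cuSOLVER
                # reports problems through devInfo; a nonzero value after
                # getrs flags an invalid argument, and the same check right
                # after getrf would catch a singular matrix. np.asarray
                # copies the single int32 back to the host.
                info = int(np.asarray(devinfo_gpu)[0])
                assert info == 0, "cuSOLVER returned devInfo=%d" % info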
                # Destroy the handle
                solver.cusolverDnDestroy(h)
                # Sync results to ensure the output contains the completed
                # computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


def cu_solve(a, b):
    return CuSolve()(a, b)

a_in = theano.tensor.fmatrix()
b_in = theano.tensor.fmatrix()
c = cu_solve(a_in, b_in)
f = theano.function([a_in, b_in], c)
# fmatrix inputs are float32 regardless of theano.config.floatX.
a_np = np.random.randn(20, 20).astype(np.float32)
a_np = np.dot(a_np, a_np.T)  # symmetric, almost surely positive definite
b_np = np.random.randn(20, 5).astype(np.float32)
sp_x = scipy.linalg.solve(a_np, b_np)
cu_x = f(a_np, b_np)
print(np.linalg.norm(np.dot(a_np, sp_x) - b_np))
print(np.linalg.norm(np.dot(a_np, cu_x) - b_np))
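
# Added check, not in the original paste: the two solutions should also
# agree elementwise; the single-precision tolerance below is a guess.
print(np.allclose(np.asarray(cu_x), sp_x, atol=1e-3))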