  1. """Computation of reductions on vectors."""
  2.  
  3. from __future__ import division
  4.  
  5. __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
  6.  
  7. __license__ = """
  8. Permission is hereby granted, free of charge, to any person
  9. obtaining a copy of this software and associated documentation
  10. files (the "Software"), to deal in the Software without
  11. restriction, including without limitation the rights to use,
  12. copy, modify, merge, publish, distribute, sublicense, and/or sell
  13. copies of the Software, and to permit persons to whom the
  14. Software is furnished to do so, subject to the following
  15. conditions:
  16.  
  17. The above copyright notice and this permission notice shall be
  18. included in all copies or substantial portions of the Software.
  19.  
  20. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  22. OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  24. HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  25. WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  26. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  27. OTHER DEALINGS IN THE SOFTWARE.
  28.  
  29. Based on code/ideas by Mark Harris <mharris@nvidia.com>.
  30.  
  31. Original License:
  32.  
  33. Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
  34.  
  35. NOTICE TO USER:
  36.  
  37. This source code is subject to NVIDIA ownership rights under U.S. and
  38. international Copyright laws.
  39.  
  40. NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
  41. CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
  42. IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
  43. REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
  44. MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  45. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
  46. OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  47. OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  48. OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  49. OR PERFORMANCE OF THIS SOURCE CODE.
  50.  
  51. U.S. Government End Users. This source code is a "commercial item" as
  52. that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
  53. "commercial computer software" and "commercial computer software
  54. documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
  55. and is provided to the U.S. Government only as a commercial end item.
  56. Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
  57. 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
  58. source code with only those rights set forth herein.
  59. """
  60.  
  61.  
  62.  
  63.  
  64. from pycuda.tools import context_dependent_memoize
  65. from pycuda.tools import dtype_to_ctype
  66. import numpy
  67.  
  68.  
  69.  
  70.  
def get_reduction_module(out_type, block_size,
        neutral, reduce_expr, map_expr, arguments,
        name="reduce_kernel", keep=False, options=[], preamble=""):

    from pycuda.compiler import SourceModule
    src = """
    #define BLOCK_SIZE %(block_size)d
    #define READ_AND_MAP(i) (%(map_expr)s)
    #define REDUCE(a, b) (%(reduce_expr)s)

    typedef %(out_type)s out_type;

    %(preamble)s

    __global__ void %(name)s(out_type *out, %(arguments)s,
      unsigned int seq_count, unsigned int n)
    {
      // one partial result per thread
      __shared__ out_type sdata[BLOCK_SIZE];

      unsigned int tid = threadIdx.x;

      unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;

      // each thread first reduces seq_count strided elements sequentially
      out_type acc = %(neutral)s;
      for (unsigned s = 0; s < seq_count; ++s)
      {
        if (i >= n)
          break;
        acc = REDUCE(acc, READ_AND_MAP(i));

        i += BLOCK_SIZE;
      }

      sdata[tid] = acc;

      __syncthreads();

      // then the block's partial results are combined by a tree reduction
      // in shared memory
      #if (BLOCK_SIZE >= 512)
        if (tid < 256) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 256]); }
        __syncthreads();
      #endif

      #if (BLOCK_SIZE >= 256)
        if (tid < 128) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 128]); }
        __syncthreads();
      #endif

      #if (BLOCK_SIZE >= 128)
        if (tid < 64) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 64]); }
        __syncthreads();
      #endif

      if (tid < 32)
      {
        // 'volatile' required according to Fermi compatibility guide 1.2.2
        volatile out_type *smem = sdata;
        if (BLOCK_SIZE >= 64) smem[tid] = REDUCE(smem[tid], smem[tid + 32]);
        if (BLOCK_SIZE >= 32) smem[tid] = REDUCE(smem[tid], smem[tid + 16]);
        if (BLOCK_SIZE >= 16) smem[tid] = REDUCE(smem[tid], smem[tid + 8]);
        if (BLOCK_SIZE >= 8)  smem[tid] = REDUCE(smem[tid], smem[tid + 4]);
        if (BLOCK_SIZE >= 4)  smem[tid] = REDUCE(smem[tid], smem[tid + 2]);
        if (BLOCK_SIZE >= 2)  smem[tid] = REDUCE(smem[tid], smem[tid + 1]);
      }

      if (tid == 0) out[blockIdx.x] = sdata[0];
    }
    """ % {
        "out_type": out_type,
        "arguments": arguments,
        "block_size": block_size,
        "neutral": neutral,
        "reduce_expr": reduce_expr,
        "map_expr": map_expr,
        "name": name,
        "preamble": preamble,
        }

    return SourceModule(src, options=options, keep=keep)


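# Illustration (added here, not part of the original module): with
# out_type="float", block_size=512, neutral="0", reduce_expr="a+b",
# map_expr="in[i]" and arguments="const float *in", the %-substitution
# above produces a kernel whose head expands to roughly:
#
#     #define BLOCK_SIZE 512
#     #define READ_AND_MAP(i) (in[i])
#     #define REDUCE(a, b) (a+b)
#
#     typedef float out_type;
#
#     __global__ void reduce_kernel(out_type *out, const float *in,
#       unsigned int seq_count, unsigned int n)
#     { ... }

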
def get_reduction_kernel_and_types(out_type, block_size,
        neutral, reduce_expr, map_expr=None, arguments=None,
        name="reduce_kernel", keep=False, options=[], preamble=""):
    if map_expr is None:
        map_expr = "in[i]"

    if arguments is None:
        arguments = "const %s *in" % out_type

    mod = get_reduction_module(out_type, block_size,
            neutral, reduce_expr, map_expr, arguments,
            name, keep, options, preamble)

    from pycuda.tools import get_arg_type
    func = mod.get_function(name)
    arg_types = [get_arg_type(arg) for arg in arguments.split(",")]
    # argument layout: out pointer, user arguments, seq_count, n
    func.prepare("P%sII" % "".join(arg_types), (block_size, 1, 1))

    return func, arg_types


class ReductionKernel:
    def __init__(self, dtype_out,
            neutral, reduce_expr, map_expr=None, arguments=None,
            name="reduce_kernel", keep=False, options=[], preamble="", hack=False):
        self.hack = hack
        self.dtype_out = dtype_out

        self.block_size = 512

        s1_func, self.stage1_arg_types = get_reduction_kernel_and_types(
                dtype_to_ctype(dtype_out), self.block_size,
                neutral, reduce_expr, map_expr,
                arguments, name=name+"_stage1", keep=keep, options=options,
                preamble=preamble)
        self.stage1_func = s1_func.prepared_async_call

        # stage 2 re-reduces the per-block partial results; it normally has
        # only one input and no map expression
        if hack:
            # note: hack=True requires an explicit `arguments` string, since
            # the stage-1 arguments are appended after the partial-result input
            s2_func, self.stage2_arg_types = get_reduction_kernel_and_types(
                    dtype_to_ctype(dtype_out), self.block_size,
                    neutral, reduce_expr, arguments="float *in, " + arguments,
                    name=name+"_stage2", keep=keep, options=options,
                    preamble=preamble)
            self.stage2_func = s2_func.prepared_async_call
        else:
            s2_func, self.stage2_arg_types = get_reduction_kernel_and_types(
                    dtype_to_ctype(dtype_out), self.block_size,
                    neutral, reduce_expr,
                    name=name+"_stage2", keep=keep, options=options,
                    preamble=preamble)
            self.stage2_func = s2_func.prepared_async_call

        assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \
                "ReductionKernel can only be used with functions that have at least one " \
                "vector argument"
    def __call__(self, *args, **kwargs):
        MAX_BLOCK_COUNT = 1024
        SMALL_SEQ_COUNT = 4

        s1_func = self.stage1_func
        s2_func = self.stage2_func

        kernel_wrapper = kwargs.get("kernel_wrapper")
        if kernel_wrapper is not None:
            s1_func = kernel_wrapper(s1_func)
            s2_func = kernel_wrapper(s2_func)

        stream = kwargs.get("stream")

        from pycuda.gpuarray import empty

        f = s1_func
        arg_types = self.stage1_arg_types

        while True:
            invocation_args = []
            vectors = []

            for arg, arg_tp in zip(args, arg_types):
                if arg_tp == "P":
                    vectors.append(arg)
                    invocation_args.append(arg.gpudata)
                else:
                    invocation_args.append(arg)

            # the first vector argument determines the length of the reduction
            repr_vec = vectors[0]
            sz = repr_vec.size

            if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT:
                # small input: fix seq_count and size the grid to match
                # (e.g. sz=4000 with block_size=512 gives block_count=2, seq_count=4)
                total_block_size = SMALL_SEQ_COUNT*self.block_size
                block_count = (sz + total_block_size - 1) // total_block_size
                seq_count = SMALL_SEQ_COUNT
            else:
                # large input: cap the grid and let seq_count grow instead
                block_count = MAX_BLOCK_COUNT
                macroblock_size = block_count*self.block_size
                seq_count = (sz + macroblock_size - 1) // macroblock_size

            if block_count == 1:
                result = empty((), self.dtype_out, repr_vec.allocator)
            else:
                result = empty((block_count,), self.dtype_out, repr_vec.allocator)

            #print block_count, seq_count, self.block_size
            f((block_count, 1), stream,
                    *([result.gpudata]+invocation_args+[seq_count, sz]))

            if block_count == 1:
                return result
            else:
                # feed the per-block partial results back through stage 2
                f = s2_func
                arg_types = self.stage2_arg_types
                if self.hack:
                    args = [result] + list(args)
                else:
                    args = [result]


@context_dependent_memoize
def get_sum_kernel(dtype_out, dtype_in):
    if dtype_out is None:
        dtype_out = dtype_in

    return ReductionKernel(dtype_out, "0", "a+b",
            arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)},
            keep=True)


@context_dependent_memoize
def get_dot_kernel(dtype_out, dtype_a=None, dtype_b=None):
    if dtype_out is None:
        dtype_out = dtype_a

    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_a is None:
        dtype_a = dtype_out

    return ReductionKernel(dtype_out, neutral="0",
            reduce_expr="a+b", map_expr="a[i]*b[i]",
            arguments="const %(tp_a)s *a, const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                }, keep=True)


@context_dependent_memoize
def get_subset_dot_kernel(dtype_out, dtype_a=None, dtype_b=None):
    if dtype_out is None:
        dtype_out = dtype_a

    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_a is None:
        dtype_a = dtype_out

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(dtype_out, neutral="0",
            reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]",
            arguments="const unsigned int *lookup_tbl, "
            "const %(tp_a)s *a, const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })


def get_minmax_neutral(what, dtype):
    # the neutral element of min is the largest representable value,
    # and that of max is the smallest
    dtype = numpy.dtype(dtype)
    if issubclass(dtype.type, numpy.inexact):
        if what == "min":
            return "MY_INFINITY"
        elif what == "max":
            return "-MY_INFINITY"
        else:
            raise ValueError("what is not min or max.")
    else:
        if what == "min":
            return str(numpy.iinfo(dtype).max)
        elif what == "max":
            return str(numpy.iinfo(dtype).min)
        else:
            raise ValueError("what is not min or max.")


@context_dependent_memoize
def get_minmax_kernel(what, dtype):
    if dtype == numpy.float64:
        reduce_expr = "f%s(a,b)" % what
    elif dtype == numpy.float32:
        reduce_expr = "f%sf(a,b)" % what
    elif dtype.kind in "iu":
        reduce_expr = "%s(a,b)" % what
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(dtype,
            neutral=get_minmax_neutral(what, dtype),
            reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
            arguments="const %(tp)s *in" % {
                "tp": dtype_to_ctype(dtype),
                }, preamble="#define MY_INFINITY (1./0)")


@context_dependent_memoize
def get_subset_minmax_kernel(what, dtype):
    if dtype == numpy.float64:
        reduce_expr = "f%s(a,b)" % what
    elif dtype == numpy.float32:
        reduce_expr = "f%sf(a,b)" % what
    elif dtype.kind in "iu":
        reduce_expr = "%s(a,b)" % what
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(dtype,
            neutral=get_minmax_neutral(what, dtype),
            reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
            map_expr="in[lookup_tbl[i]]",
            arguments="const unsigned int *lookup_tbl, "
            "const %(tp)s *in" % {
                "tp": dtype_to_ctype(dtype),
                }, preamble="#define MY_INFINITY (1./0)")
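

# Usage sketch (illustrative addition, not part of the original module):
# a minimal example of driving ReductionKernel directly to sum a vector,
# assuming a CUDA context from pycuda.autoinit and device arrays from
# pycuda.gpuarray; the names `a_gpu` and `sum_krnl` are made up here.
if __name__ == "__main__":
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a_gpu = gpuarray.to_gpu(numpy.arange(4000, dtype=numpy.float32))

    sum_krnl = ReductionKernel(numpy.float32, neutral="0", reduce_expr="a+b",
            map_expr="in[i]", arguments="const float *in")

    # 0 + 1 + ... + 3999 = 7998000
    print sum_krnl(a_gpu).get()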