Advertisement
EelcoHoogendoorn

numpycuda tests

Aug 28th, 2012
170
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.46 KB | None | 0 0
  1. """
  2. kernel test
  3.  
  4. """
  5.  
  6. from pycuda.driver import init, Device
  7. init()
  8. device = Device(0)
  9. context = device.make_context()
  10.  
  11. from pycuda import gpuarray
  12.  
  13. import numpy as np
  14.  
  15.  
  16.  
  17. class Kernel(object):
  18.     """wrapper for pycuda kernel; checks and binds arguments"""
  19.  
  20.     def __init__(self, decl, kernel, parallel_axes):
  21.         self.decl = decl                        #kernel declaration object; encodes function signature
  22.         self.kernel = kernel                    #pycuda kernel handle
  23.         self.parallel_axes = parallel_axes      #bool array to specify parallel axes; needed for thread blocking
  24.  
  25.  
  26. class CheckedKernel(Kernel):
  27.     """
  28.    quite a complex beast it appears
  29.    anyway, we can worry about optimizing later
  30.    perhaps a specialized subclass when all constants are know at compile time
  31.    which is the case most often encountered in practice
  32.    """
  33.     def __call__(self, *args, **kwargs):
  34.         arg_list = []
  35.  
  36.         dummies = {}
  37.         def handle_shape(decl, value):
  38.             """check a shape value versus a shape declaration. dummies encountered in the declaration are bound"""
  39.             assert(len(value) == len(decl))          #assert correct number of dimensions
  40.  
  41.             for dsize, vsize in zip(decl, value):
  42.                 if isinstance( dsize, np.uint32):
  43.                     assert(dsize == vsize)                  #compile-time specified shape; make sure it matches
  44.                 elif dsize==':':
  45.                     arg_list.append(np.uint32( vsize))          #this is a runtime argument; append it to argument list
  46.                 else:                                       #dummy arg; add it to dummy dict, and check consistency
  47.                     size = dummies.get(dsize, vsize)
  48.                     dummies[dsize] = vsize
  49.                     assert(size==vsize)
  50.  
  51.         #perform runtime checks on input args
  52.         for i, (value, decl) in enumerate( zip(args, self.decl.inputs)):
  53.             if decl.is_array:
  54.                 try:
  55.                     assert(value.dtype.name == decl.dtype)         #type check on string representation of dtype; is that safe?
  56.                     arg_list.append(value.gpudata)
  57.                     handle_shape(decl.shape, value.shape)
  58.                 except:
  59.                     raise Exception("argument {i} does not satisfy gpuarray interface".format(i=i))
  60.             else:   #scalar type
  61.                 try:
  62.                     cast = getattr(np, decl.dtype)
  63.                     arg_list.append(cast(value))
  64.                 except:
  65.                     raise Exception("argument {i} could not be cast to {dtype}".format(i=i,dtype=decl.dtype))
  66.  
  67.         #try if we can get dummies from optional kernel shape argument
  68.         shape = kwargs.pop('shape', None)
  69.         if shape: handle_shape(self.decl.shape, shape)
  70.  
  71.         #make sure all declared dummies are indeed bound
  72.         assert(all(d in dummies for d in self.decl.dummies))
  73.  
  74.         #transform a shape declaration to its bound form, by expanding its dummies
  75.         def complete_shape(shape):
  76.             return tuple(int(dummies.get(s, s)) for s in shape)
  77.  
  78.         #process output arguments; they are all internally allocated for now, but we should also allow for prealloced output args to be passed in to kernel
  79.         return_list = []    #list of things to return from kernel
  80.         for decl in self.decl.outputs:
  81.             value = gpuarray.GPUArray(complete_shape(decl.shape), decl.dtype)
  82.             if decl.default: value.fill(decl.default)
  83.             arg_list.append(value.gpudata)
  84.             return_list.append(value)
  85.  
  86.         #append dummies to argument list
  87.         for d in self.decl.dummies: arg_list.append(np.uint32(dummies[d]))
  88.  
  89.         #set thread grid; all shape dimensions are known at this point
  90.         shape = complete_shape(self.decl.shape)
  91.         axes = np.flatnonzero(self.parallel_axes)
  92.  
  93.         THREADS_PER_BLOCK = 512     #this should be pulled from the context.device, really
  94.         leftover = THREADS_PER_BLOCK
  95.         block = [1]*3
  96.         #rather simple thread block assignment; assign as much as possible to slowest stride, and overflow into the next
  97.         #but should work well under most circumstances.
  98.         #this needs a manual override
  99.         for i,a in enumerate( axes):
  100.             block[i] = min(leftover, shape[a])
  101.             leftover = leftover //  block[i]
  102.             if leftover == 0: break
  103.         block = tuple(block)
  104.  
  105.  
  106.         def iDivUp(a, b):
  107.             # Round a / b to nearest higher integer value
  108.             a = np.int32(a)
  109.             b = np.int32(b)
  110.             r = (a / b + 1) if (a % b != 0) else (a / b)
  111.             return int(r)
  112.  
  113.         grid = tuple(iDivUp(shape[a],b) for b,a in zip(block, axes))
  114.  
  115.         kwargs.update(dict(grid=grid, block=block) )
  116.  
  117.         self.kernel(*arg_list, **kwargs)
  118.         return tuple(return_list)
  119.  
  120.  
def nd_kernel(source, axes=None):
    """
    transform syntactic sugar to valid CUDA-C code

    source: kernel description in the mini-language
        '(outputs) << [kernel shape] << (inputs): body'
        (exact grammar is defined by numpycuda_parsing.parsing — not
        visible from this file).
    axes: optional per-axis flags; truthy marks an axis for mapping onto
        CUDA threads, falsy emits a serial C loop over it. Defaults to
        parallelizing the last (up to) three axes with the first kept
        serial.
    returns a CheckedKernel wrapping the freshly compiled function.
    """
    print 'ORIGINAL CODE:'
    print
    print source
    print

    from numpycuda_parsing import parsing
    decl, body = parsing(source)


    #create various string options for axes param? 'all_serial', 'all_parallel', 'max_parallel', 'first_serial'?
    if axes is None:
        axes = np.zeros(len(decl.shape), np.bool)
        axes[-3:] = 1       #do last three axes in parallel (needs to be set to two, for old cards
        axes[0] = 0         #first axis is serial, by default. having at least one looping axis can be beneficial, for shared mem or other reusable terms
        axes[-1] = 1        #unless it is also the last, which is always parallel; otherwise we have no coalesing
    axes = np.array(axes, np.bool)
    assert(axes[-1]==True)  #last (stride==1) axis should be parralel. maybe this should be a warning instead, but i cant think of any reason why youd want this


    #nd-array indexing over serial and parallel axes
    #each serial axis wraps the body in one more nested for-loop
    for axis in np.flatnonzero(axes==0):  #emit loops for serial axes
        loop = """
       //loop over serial axis
       unsigned i{n};
       for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
       """.format(n=axis)
        body = loop + '{\n' + body + '\n}'

    supported_axes = 'xyz'
    if np.count_nonzero(axes==1) > len(supported_axes):
        raise Exception('requested more parallel axes than are supported by your hardware!')
    for cuda_axis,nd_axis in zip(supported_axes, np.flatnonzero( axes==1)):    #do tread/block arithmetic for parralel axes, and map from x,y,(z) to nd-enumeration
        parallel_axes = """
       //launch threads for parallel axis
       unsigned const i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
       if (i{n} >= kernel_shape_{n}) return;
       """.format(n=nd_axis, a=cuda_axis)
        body = parallel_axes + body




    #macro-based substitutions, to assemble total kernel
    generic_template = """
   __global__ void ${funcname}(${arguments})
   {
       ${init}
       //kernel body
       ${body}
   }
   """
    from string import Template
    template = Template(generic_template)   #would be nice to have identation-aware template...
    funcname = 'nd_kernel'
    transform = template.substitute(
        funcname = funcname,
        arguments = decl.argument_string,
        init = decl.init_string,
        body = body,
        )

    print 'TRANSFORMED CODE:'
    print
    print transform
    print

    #JIT-compile the generated CUDA source and wrap it for checked calls
    def compile(source):
        from pycuda.compiler import SourceModule
        module = SourceModule(source)
        func = module.get_function(funcname)
        return CheckedKernel(decl, func, axes)

    return compile(transform)
  200.  
  201.  
  202.  
#2d meshgrid function
#no inputs: both dummy dims (n,m) must be bound via the 'shape' keyword
mesh_grid_2 = nd_kernel(
    """
   (int32[2,n,m] result) << [n,m] << ():
       result(0, i0,i1) = i0;
       result(1, i0,i1) = i1;
   """,
)
print mesh_grid_2(shape=(4,4))      #we have no input args; kernel shape needs to be explicitly specified

#outer product: the kernel shape [n,m] is bound from the input array shapes
outer_product = nd_kernel(
    """
   (float32[n,m] result = 0) << [n,m] << (float32[n] x0, float32[m] x1):
       result(i0, i1) = x0(i0) * x1(i1);
   """,
    )
x0 = gpuarray.empty((10), np.float32); x0.fill(1)
x1 = gpuarray.empty((20), np.float32); x1.fill(1)
print outer_product(x0, x1)         #kernel shape is deduced from input arguments


#exercises the remaining features: runtime (':') and fixed leading dims,
#a scalar input, a scalar output with default, and loop sugar in the body
funky_product = nd_kernel(
    """
   (float32[n,m] result, uint32 count=0) << [n,m] << (uint32[:,n,m] foo, uint32[8,n,m] bar, float32 scalar):
       float32 r = 0;
       for (i in 0:foo.shape[0])
           for (j in 0:bar.shape[0]:2)
               r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
       result(i0, i1) = r;
       atomicAdd(count, 1);
   """,
    axes = [1,1]            #override default axes parallism; gives the same result
    )

foo = gpuarray.zeros((3,100,99), np.uint32); foo.fill(1)
bar = gpuarray.zeros((8,100,99), np.uint32); bar.fill(1)
print funky_product(foo, bar, 1.1 )
quit()      #everything below this point is unfinished work-in-progress
  241.  
  242.  
  243.  
  244.  
  245. def mesh_grid_nd(ndim, dtype=np.int32):
  246.     """nd meshgrid factory function. parametrizes the function on dimensionality and type; you kinda want to know these at compile time"""
  247.     dummies = ','.join( alphas[:ndim] )
  248.     idx = ','.join('i%i'%n for n in xrange(ndim))
  249.     mesh_grid_kernel = nd_kernel(
  250.         decl = """<<<{dummies}>>>: {dtype}[{ndim},{dummies}] result""".format(dtype=dtype, ndim=ndim, dummies=dummies),
  251.         body = '\n'.join( """result({i}, {idx} = i{i};""".format(i=i,idx=idx) for i in xrange(ndim))
  252.         )
  253.     #wrapper func, to handle allocation and kernel launch
  254.     def mesh_grid(shape):
  255.         """returns a gpu mesh grid of shape:{shape}, dtype:{dtype}""".format(shape=shape, dtype=dtype)
  256.         result = gpuarray.zeros(shape, dtype)
  257.         mesh_grid_n(result)
  258.         return result
  259.     return mesh_grid
  260.  
  261.  
  262. def broadcasting_op(op, ndim, dtype):
  263.     """
  264.    general broadcasting requires smart strides
  265.    as a simple hack; any size == 1 dimension is broadcastable
  266.    all we need to do is set corresponding strides to zero; forces that index to stay at zero
  267.    """
  268.  
  269.     dummies = alphas[:ndim]
  270.     idx = ','.join('i%i'%i for i in xrange(ndim))
  271.  
  272.     broad_kernel = ()
  273.     source = """
  274.    ({dtype}[{dummies}] output) << [{dummies}] << ({dtype}[{colons}] l, {dtype}[{colons}] r):
  275.        output({idx}) = l({idx}) {operation} r({idx});
  276.    """
  277.  
  278.     def func(L, R):
  279.         broad_shape = tuple(max(l, r) for l,r in zip(L.shape, R.shape))
  280.         #bind kernel shape argument
  281.         return broad_kernel(L, R, shape = broad_shape)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement