- """
- kernel test
- """
- from pycuda.driver import init, Device
- init()
- device = Device(0)
- context = device.make_context()
- from pycuda import gpuarray
- import numpy as np
class Kernel(object):
    """wrapper for a pycuda kernel; checks and binds arguments"""
    def __init__(self, decl, kernel, parallel_axes):
        self.decl = decl                        #kernel declaration object; encodes the function signature
        self.kernel = kernel                    #pycuda kernel handle
        self.parallel_axes = parallel_axes      #bool array marking the parallel axes; needed for thread blocking

class CheckedKernel(Kernel):
    """
    quite a complex beast, it appears
    anyway, we can worry about optimizing later
    perhaps a specialized subclass for when all constants are known at compile time,
    which is the case most often encountered in practice
    """
    def __call__(self, *args, **kwargs):
        arg_list = []
        dummies = {}
        def handle_shape(decl, value):
            """check a shape value against a shape declaration; dummies encountered in the declaration are bound"""
            assert(len(value) == len(decl))         #assert the correct number of dimensions
            for dsize, vsize in zip(decl, value):
                if isinstance(dsize, np.uint32):
                    assert(dsize == vsize)          #compile-time specified shape; make sure it matches
                elif dsize == ':':
                    arg_list.append(np.uint32(vsize))   #this is a runtime argument; append it to the argument list
                else:   #dummy arg; add it to the dummy dict, and check consistency
                    size = dummies.get(dsize, vsize)
                    dummies[dsize] = vsize
                    assert(size == vsize)
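        #for example (hypothetical declaration): matching the decl (np.uint32(8), ':', 'n')
        #against the value (8, 5, 7) passes the fixed-size check on 8, appends uint32(5)
        #to arg_list as a runtime shape argument, and binds the dummy 'n' to 7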
        #perform runtime checks on the input args
        for i, (value, decl) in enumerate(zip(args, self.decl.inputs)):
            if decl.is_array:
                try:
                    assert(value.dtype.name == decl.dtype)  #type check on the string representation of the dtype; is that safe?
                    arg_list.append(value.gpudata)
                    handle_shape(decl.shape, value.shape)
                except Exception:
                    raise Exception("argument {i} does not satisfy the gpuarray interface".format(i=i))
            else:   #scalar type
                try:
                    cast = getattr(np, decl.dtype)
                    arg_list.append(cast(value))
                except Exception:
                    raise Exception("argument {i} could not be cast to {dtype}".format(i=i, dtype=decl.dtype))
        #see if we can get dummies from the optional kernel shape argument
        shape = kwargs.pop('shape', None)
        if shape: handle_shape(self.decl.shape, shape)
        #make sure all declared dummies are indeed bound
        assert(all(d in dummies for d in self.decl.dummies))
        #transform a shape declaration to its bound form, by expanding its dummies
        def complete_shape(shape):
            return tuple(int(dummies.get(s, s)) for s in shape)
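        #for instance, with dummies == {'n': 5}, the declaration ('n', np.uint32(3)) completes to (5, 3)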
        #process the output arguments; they are all internally allocated for now,
        #but we should also allow preallocated output args to be passed in to the kernel
        return_list = []    #list of things to return from the kernel
        for decl in self.decl.outputs:
            value = gpuarray.GPUArray(complete_shape(decl.shape), decl.dtype)
            if decl.default: value.fill(decl.default)
            arg_list.append(value.gpudata)
            return_list.append(value)
        #append the dummies to the argument list
        for d in self.decl.dummies: arg_list.append(np.uint32(dummies[d]))
        #set the thread grid; all shape dimensions are known at this point
        shape = complete_shape(self.decl.shape)
        axes = np.flatnonzero(self.parallel_axes)
        THREADS_PER_BLOCK = 512     #this should be pulled from context.device, really
        leftover = THREADS_PER_BLOCK
        block = [1]*3
        #rather simple thread block assignment: assign as many threads as possible to the slowest stride, and overflow into the next
        #it should work well under most circumstances, but it needs a manual override
        for i, a in enumerate(axes):
            block[i] = min(leftover, shape[a])
            leftover = leftover // block[i]
            if leftover == 0: break
        block = tuple(block)
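        #worked example: shape (100, 99) with both axes parallel gives
        #block[0] = min(512, 100) = 100, leftover = 512 // 100 = 5,
        #block[1] = min(5, 99) = 5, so block == (100, 5, 1)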
        def iDivUp(a, b):
            #round a / b up to the nearest higher integer value
            a = np.int32(a)
            b = np.int32(b)
            r = (a // b + 1) if (a % b != 0) else (a // b)
            return int(r)
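        #e.g. iDivUp(99, 5) == 20 and iDivUp(100, 100) == 1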
        grid = tuple(iDivUp(shape[a], b) for b, a in zip(block, axes))
        kwargs.update(dict(grid=grid, block=block))
        self.kernel(*arg_list, **kwargs)
        return tuple(return_list)

def nd_kernel(source, axes=None):
    """
    transform syntactic sugar into valid CUDA-C code
    """
    print 'ORIGINAL CODE:'
    print
    print source
    print
    from numpycuda_parsing import parsing
    decl, body = parsing(source)
    #create various string options for the axes param? 'all_serial', 'all_parallel', 'max_parallel', 'first_serial'?
    if axes is None:
        axes = np.zeros(len(decl.shape), np.bool)
        axes[-3:] = 1   #do the last three axes in parallel (needs to be set to two for old cards)
        axes[0] = 0     #the first axis is serial, by default; having at least one looping axis can be beneficial, for shared mem or other reusable terms
        axes[-1] = 1    #unless it is also the last axis, which is always parallel; otherwise we have no coalescing
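        #net effect of these defaults: 1d -> [parallel], 2d -> [serial, parallel], 3d -> [serial, parallel, parallel]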
    axes = np.array(axes, np.bool)
    assert(axes[-1] == True)    #the last (stride==1) axis should be parallel. maybe this should be a warning instead, but i can't think of any reason why you'd want otherwise
    #nd-array indexing over the serial and parallel axes
    for axis in np.flatnonzero(axes == 0):      #emit loops for the serial axes
        loop = """
        //loop over serial axis
        unsigned i{n};
        for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
        """.format(n=axis)
        body = loop + '{\n' + body + '\n}'
    supported_axes = 'xyz'
    if np.count_nonzero(axes == 1) > len(supported_axes):
        raise Exception('requested more parallel axes than are supported by your hardware!')
    for cuda_axis, nd_axis in zip(supported_axes, np.flatnonzero(axes == 1)):   #do the thread/block arithmetic for the parallel axes, and map from x,y,(z) to the nd-enumeration
        parallel_axes = """
        //launch threads for parallel axis
        unsigned const i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
        if (i{n} >= kernel_shape_{n}) return;
        """.format(n=nd_axis, a=cuda_axis)
        body = parallel_axes + body
    #macro-based substitutions, to assemble the total kernel
    generic_template = """
    __global__ void ${funcname}(${arguments})
    {
        ${init}
        //kernel body
        ${body}
    }
    """
    from string import Template
    template = Template(generic_template)   #would be nice to have an indentation-aware template...
    funcname = 'nd_kernel'
    transform = template.substitute(
        funcname  = funcname,
        arguments = decl.argument_string,
        init      = decl.init_string,
        body      = body,
    )
    print 'TRANSFORMED CODE:'
    print
    print transform
    print
    def compile(source):
        from pycuda.compiler import SourceModule
        module = SourceModule(source)
        func = module.get_function(funcname)
        return CheckedKernel(decl, func, axes)
    return compile(transform)
#2d meshgrid function
mesh_grid_2 = nd_kernel(
    """
    (int32[2,n,m] result) << [n,m] << ():
        result(0, i0,i1) = i0;
        result(1, i0,i1) = i1;
    """,
)
print mesh_grid_2(shape=(4,4))  #we have no input args; the kernel shape needs to be explicitly specified
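#for illustration: with the default axes for a 2d kernel ([serial, parallel]), nd_kernel
#wraps the body roughly as below. the signature and the kernel_shape_* / result(...)
#indexing macros come from numpycuda_parsing, so their exact form here is an assumption:
#
#   __global__ void nd_kernel(int *result, unsigned n, unsigned m)
#   {
#       //launch threads for parallel axis
#       unsigned const i1 = blockDim.x*blockIdx.x + threadIdx.x;
#       if (i1 >= kernel_shape_1) return;
#       //loop over serial axis
#       unsigned i0;
#       for (i0=0; i0 < kernel_shape_0; i0++ )
#       {
#           //kernel body
#           result(0, i0,i1) = i0;
#           result(1, i0,i1) = i1;
#       }
#   }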
outer_product = nd_kernel(
    """
    (float32[n,m] result = 0) << [n,m] << (float32[n] x0, float32[m] x1):
        result(i0, i1) = x0(i0) * x1(i1);
    """,
)
x0 = gpuarray.empty((10,), np.float32); x0.fill(1)
x1 = gpuarray.empty((20,), np.float32); x1.fill(1)
print outer_product(x0, x1)     #the kernel shape is deduced from the input arguments
funky_product = nd_kernel(
    """
    (float32[n,m] result, uint32 count=0) << [n,m] << (uint32[:,n,m] foo, uint32[8,n,m] bar, float32 scalar):
        float32 r = 0;
        for (i in 0:foo.shape[0])
            for (j in 0:bar.shape[0]:2)
                r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
        result(i0, i1) = r;
        atomicAdd(count, 1);
    """,
    axes = [1,1]    #override the default axes parallelism; gives the same result
)
foo = gpuarray.zeros((3,100,99), np.uint32); foo.fill(1)
bar = gpuarray.zeros((8,100,99), np.uint32); bar.fill(1)
print funky_product(foo, bar, 1.1)

quit()
def mesh_grid_nd(ndim, dtype=np.int32):
    """nd meshgrid factory function. parametrizes the function on dimensionality and type; you kinda want to know these at compile time"""
    alphas = 'nmkpqrs'  #pool of dummy axis names; assumed here, since alphas was left undefined
    dummies = ','.join(alphas[:ndim])
    idx = ','.join('i%i' % n for n in xrange(ndim))
    body = '\n'.join("""result({i}, {idx}) = i{i};""".format(i=i, idx=idx) for i in xrange(ndim))
    mesh_grid_kernel = nd_kernel(
        """({dtype}[{ndim},{dummies}] result) << [{dummies}] << ():\n{body}""".format(
            dtype=np.dtype(dtype).name, ndim=ndim, dummies=dummies, body=body)
    )
    #wrapper func, to handle allocation and kernel launch
    def mesh_grid(shape):
        """returns a gpu mesh grid of the given shape"""
        result, = mesh_grid_kernel(shape=shape)
        return result
    return mesh_grid
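#hypothetical usage of the factory above (unreached; the script quit()s earlier):
#   mesh_grid_3 = mesh_grid_nd(3)
#   print mesh_grid_3((4,4,4))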
def broadcasting_op(op, ndim, dtype):
    """
    general broadcasting requires smart strides
    as a simple hack, any size == 1 dimension is broadcastable:
    all we need to do is set the corresponding strides to zero, which forces that index to stay at zero
    """
    alphas = 'nmkpqrs'  #same assumed dummy-name pool as above
    dummies = ','.join(alphas[:ndim])
    idx = ','.join('i%i' % i for i in xrange(ndim))
    colons = ','.join([':'] * ndim)
    source = """
    ({dtype}[{dummies}] output) << [{dummies}] << ({dtype}[{colons}] l, {dtype}[{colons}] r):
        output({idx}) = l({idx}) {operation} r({idx});
    """.format(dtype=np.dtype(dtype).name, dummies=dummies, colons=colons, idx=idx, operation=op)
    broad_kernel = nd_kernel(source)
    def func(L, R):
        broad_shape = tuple(max(l, r) for l, r in zip(L.shape, R.shape))
        #bind the kernel shape argument
        return broad_kernel(L, R, shape=broad_shape)
    return func
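#hypothetical usage (also unreached by this script):
#   add = broadcasting_op('+', 1, np.float32)
#   s, = add(x0, x0)    #elementwise sum; equal shapes broadcast trivially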