- """
- some conceptual code to illustrate planned kernel syntax stuff
- this is just a small stub of code, for ease of mental access for others who want to look at it or play with it
- it is currently not compatible with my other messy code-base, but that will come
- the intent of this code is to extend elementwise-type pycuda functionality.
- specifically, i aim for a kernel factory that is ndarray-aware. this is taken to mean several things:
- runtime type and shape checking of input arrays
- macros to obtain shapes and strides of arrays within the kernel; both the compile time and runtime constants
- macros to obtain the current index being processed; nd generalization of the 'i' in elementwise
- or even better: arbitrary dereferencing of arrays ala weave.blitz:
- arrayidentifier(x,y,z) -> transformed into an appropriate inner product over strides
- this is all fairly easy to do for a fixed number of dimension (what my main code base is built on);
- but the obvious next step is to emit code that works for any dimension
- in order to do so, we should have some clear logic to decide which axes will
- be looped over (serial), and which axes will be handled by the thread launcher (parallel)
- we should always at least launch 1d kernels to have a minimum of parralelism;
- furthermore, the stride=1 axis should always be parralel for coalesing
- but exactly how many axes we should do parralel is highly problem (and somewhat hardware) dependent
- f.i., 2d deconvolution is typically best done by reading coalesed rows into shared mem,
- and then looping over the other axis
- a 3d kernel on older hardware must be force to loop over the biggest stride, and so on
- anyway, having decided upon a partition of parralel and serial axes:
- parralel axes should emit indexing code of the form:
- unsigned i0 = blockDim.x*blockIdx.x + threadIdx.x;
- and serial axes should (recursively) emit an outer loop of the form:
- unsigned in; for in = 0; in<kernel.shape[n]; in++) { $body}
- the most basis implementation that always works (though not optimally);
- simply always launch 1d threadblocks, and loop over the other axes
- we should probably allow for a manual override of axis paralelism, considering the problem-dependence
- another low priority thing, but which would be cool to have:
- can we make a general shared mem mechanism? upon marking an input arg with a keyword, emit code
- to defer reads of that array to a block of shared mem, with correct padding as decided by the supplied stencil
- shared mem only really comes into play for stencil operations. a simple nd-aware stencil like an outer product
- does not stand to gain much from shared mem; it is still 'elementwise', and needs NxM writes anyway, so there is not much
- point trying to optimize the technically unnecessary NxM reads to N+M
- what about class hierarchy?
- nd_kernel is essentially the base
- elementwise is simply an 1d nd_kernel; subclass with some constraints
- stencil_kernel is also a subclass of nd_kernel. it calls upon nd_kernel base to do most work
- all it does itself is pass in a pre-composed body, with the optionally unrolled inner stencil loop
- """
import numpy as np
from pyparsing import *

from pycuda.driver import init, Device
init()
device = Device(1)    #note: second gpu on this machine; adapt to your setup
context = device.make_context()

#importing pycuda.tools fills the compyte dtype registry as a side effect
import pycuda.tools
import pycuda.compyte.dtypes
##pycuda.compyte.dtypes._fill_dtype_registry(True)

#mapping from numpy dtype names to c type names; keep only the string keys of the registry
dtype_to_ctype = {k: v for k, v in pycuda.compyte.dtypes.DTYPE_TO_NAME.iteritems() if isinstance(k, str)}
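#for instance, with a typical registry, this gives mappings along the lines of:
#    dtype_to_ctype['float32'] -> 'float'
#    dtype_to_ctype['float64'] -> 'double'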
class Kernel(object):
    """wrapper for a compiled pycuda kernel; binds arguments"""
    def __init__(self, arguments, kernel):
        self.arguments = arguments
        self.kernel = kernel

    def __call__(self, *args, **kwargs):
        """
        launch kernel
        extract the runtime shape arguments from the passed arrays,
        and pass them on in the correct order to the compiled kernel
        note that no checking is performed here; see CheckedKernel for that
        """
        args = list(args) + [
            np.uint32(size)
            for value, arg in zip(args, self.arguments.itervalues()) if arg.is_array
            for size, fixed in zip(value.shape, arg.shape) if fixed is None]
        self.kernel(*args, **kwargs)
class CheckedKernel(Kernel):
    def __call__(self, *args, **kwargs):
        size_args = []
        normal_args = []
        arg_info = self.arguments
        #perform runtime checks
        for i, (value, (identifier, arg)) in enumerate(zip(args, arg_info.iteritems())):
            if arg.is_array:
                print value.dtype.name, arg.dtype
                assert value.dtype.name == arg.dtype    #type check on the string representation of the dtype; is that safe?
                assert len(arg.shape) == len(value.shape)    #assert the correct number of dimensions
                for asize, vsize in zip(arg.shape, value.shape):
                    if asize:
                        assert asize == vsize    #check all dimensions specified at compile time
                    else:
                        size_args.append(np.uint32(vsize))    #this is a runtime argument
                normal_args.append(value.gpudata)
            else:
                try:
                    cast = getattr(np, arg.dtype)
                    normal_args.append(cast(value))
                except Exception:
                    raise Exception("argument {i} could not be cast to {dtype}".format(i=i, dtype=arg.dtype))
        #grid/block are hardcoded to fit the demo below; they should be computed from the kernel shape and axes
        grid = (10, 1, 1)
        block = (10, 1, 1)
        kwargs.update(dict(grid=grid, block=block))
        args = normal_args + size_args
        self.kernel(*args, **kwargs)
##        super(CheckedKernel, self).__call__(*args, **kwargs)
class ArgumentInfo(object):
    def __init__(self, identifier, shape, dtype):
        self.identifier = identifier
        self.shape = shape
        self.dtype = dtype
    @property
    def ctype(self): return dtype_to_ctype[self.dtype]
    @property
    def is_scalar(self): return self.shape is None
    @property
    def is_array(self): return not self.is_scalar
    @property
    def ndim(self): return len(self.shape)
def arg_parse(arguments):
    """
    take a raw argument string of the specified syntax, and transform it
    into an OrderedDict of ArgumentInfo objects
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    dtype = oneOf(np_types).setResultsName('dtype')
    identifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')    #first character must not be a numeral
    positive_integer = Word(nums)
    dimension = Or([positive_integer, Literal(':')])
    shape = nestedExpr('[', ']', delimitedList(dimension)).setResultsName('shape')
    term = Group(dtype + Optional(shape) + identifier)
    grammar = delimitedList(term)

    #convert the parsing result to ArgumentInfo objects, for reasonably efficient runtime checks, and clean downstream code
    def shape_postparse(shape):
        if shape == '': return None    #scalar argument
        shape = tuple(None if size == ':' else int(size) for size in shape[0])
        if len(shape) == 0: raise Exception('arrays must have at least one dimension')
        return shape

    from collections import OrderedDict
    arg_info = OrderedDict()
    for argument in grammar.parseString(arguments):
        arg_info[argument.identifier] = ArgumentInfo(argument.identifier, shape_postparse(argument.shape), argument.dtype)
    return arg_info
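#for illustration, a hypothetical signature:
#    arg_parse('float32[:,10] img, uint32 n')
#would yield an OrderedDict along the lines of:
#    'img' -> ArgumentInfo(identifier='img', shape=(None, 10), dtype='float32')
#    'n'   -> ArgumentInfo(identifier='n',   shape=None,       dtype='uint32')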
def build_argument_strings(arg_info):
    """
    build c code to represent all the compile time and run time info
    that we seek to pass into the kernel
    """
    #add the restrict keyword to pointers by default; my preferred syntax would be to add an ALIAS keyword to suppress this
    c_arguments = ', '.join(
        '{type}{ptr} const {restrict}{identifier}'.format(
            type=arg.ctype, ptr='*'*arg.is_array, restrict='__restrict__ '*arg.is_array, identifier=arg.identifier)
        for arg in arg_info.itervalues()
        )
    #dimensions not known at compile time become extra kernel arguments
    variable_shape_arguments = ', '.join(
        'const unsigned {identifier}_shape_{dimension}'.format(identifier=arg.identifier, dimension=dimension)
        for arg in arg_info.itervalues() if arg.is_array
        for dimension, size in enumerate(arg.shape) if size is None
        )
    #dimensions known at compile time are hardcoded into the kernel
    constant_shape_arguments = '\n'.join(
        'const unsigned {identifier}_shape_{dimension} = {size};'.format(identifier=arg.identifier, dimension=dimension, size=size)
        for arg in arg_info.itervalues() if arg.is_array
        for dimension, size in enumerate(arg.shape) if not size is None
        )
    return c_arguments, variable_shape_arguments, constant_shape_arguments
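#for the hypothetical 'float32[:,10] img' argument above, this would produce roughly:
#    c_arguments:              '..., float* const __restrict__ img, ...'
#    variable_shape_arguments: 'const unsigned img_shape_0'
#    constant_shape_arguments: 'const unsigned img_shape_1 = 10;'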
def compile_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    this function hardcodes the shape into the kernel
    """
    return '\n'.join(
        'const unsigned kernel_shape_{dimension} = {size};'.format(dimension=dimension, size=size)
        for dimension, size in enumerate(shape)
        )
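#for instance, compile_kernel_shape((100, 100)) returns:
#    const unsigned kernel_shape_0 = 100;
#    const unsigned kernel_shape_1 = 100;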
def runtime_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    emits extra arguments to be supplied upon calling of the kernel
    """
    raise NotImplementedError()
def compute_strides(arg_info):
    """
    generate code to compute the strides of each array
    i am assuming in all this that the CUDA compiler has a clue about optimizing constant expressions;
    otherwise, we might have to use C macros for this kind of thing
    i do hope unused constants get eliminated, and so on
    """
    def strides(identifier, shape):
        """
        inner generator, yielding a sequence of stride definitions for a single array
        i haven't actually settled on a convention for what is the first and last axis;
        not having such a convention, it isn't applied either. in other words, this code
        is for illustrative purposes only
        """
        stride_template = '{identifier}_stride_{dimension}'
        shape_template = '{identifier}_shape_{dimension}'
        #the last axis gets stride one (c-contiguous convention)
        prev = stride_template.format(identifier=identifier, dimension=len(shape)-1)
        yield 'const unsigned {identifier} = {stride};'.format(identifier=prev, stride=1)
        #each preceding stride is the product of the following stride and shape
        for i, size in reversed(list(enumerate(shape[:-1]))):
            this = stride_template.format(identifier=identifier, dimension=i)
            size = shape_template.format(identifier=identifier, dimension=i+1)
            yield 'const unsigned {this} = {prev} * {size};'.format(this=this, prev=prev, size=size)
            prev = this
        #add the total element count as well, for good measure
        size = shape_template.format(identifier=identifier, dimension=0)
        yield 'const unsigned {identifier}_size = {prev} * {size};'.format(identifier=identifier, prev=prev, size=size)
    return '\n'.join(
        stride_expr
        for identifier, arg in arg_info.iteritems() if arg.is_array
        for stride_expr in strides(identifier, arg.shape))
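#for a 2d array 'img', the inner generator yields:
#    const unsigned img_stride_1 = 1;
#    const unsigned img_stride_0 = img_stride_1 * img_shape_1;
#    const unsigned img_size = img_stride_0 * img_shape_0;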
def replace_typing(source):
    """
    replace numpy types with c types. this could be more efficient and intelligent;
    we do not do any semantic analysis here, just a simple find and replace
    but usage is optional anyway; we are fully backwards compatible, and free to use c types in our code
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    type_grammar = oneOf(np_types)
    type_grammar.setParseAction(lambda s, l, t: dtype_to_ctype[t[0]])
    return type_grammar.transformString(source)
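#e.g. replace_typing('float64 r = 0;') returns 'double r = 0;'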
def replace_shape_syntax(source, arg_info):
    """
    replace arrayidentifier.shape[dim] syntax with named C variables
    this silently fails to replace some incorrect syntax, like a misspelled 'shape';
    dont worry, the cuda compiler is sure to complain about it :)
    would it be sufficient and correct to catch all instances of 'arrayidentifier.'+whatever
    that fail to match the whole syntax?
    """
    arrayidentifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')
    positive_integer = Word(nums)
    shape_expr = arrayidentifier + Suppress(Literal('.shape')) + nestedExpr('[', ']', positive_integer).setResultsName('dimension')
    def replace(s, l, t):    #string, location, parseresults
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        dimensions = t.dimension[0]
        if not len(dimensions) == 1: raise Exception('only simple shape indexing is allowed')
        dimension = dimensions[0]
        try:
            arg = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        try:
            size = arg.shape[int(dimension)]
        except Exception:
            raise ParseFatalException('{identifier}.shape[{dimension}] is invalid'.format(identifier=identifier, dimension=dimension))
        return '{identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)
    shape_expr.setParseAction(replace)
    return shape_expr.transformString(source)
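#e.g. with 'foo' declared as an array argument, replace_shape_syntax('foo.shape[0]', arg_info) returns 'foo_shape_0'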
def replace_array_syntax(source, arg_info):
    """
    replace weave.blitz style array indexing with an inner product over strides
    we could optionally insert bounds checking code here as well, as a debugging aid
    should we allow for partial indexing? not sure
    """
    arrayidentifier = oneOf(' '.join(arg_info.keys())).setResultsName('identifier')
##    arrayidentifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')
    identifier = Word(alphas+'_', alphanums+'_')
    positive_integer = Word(nums)
    index = Or([identifier, positive_integer])
    index_expr = arrayidentifier + nestedExpr('(', ')', delimitedList(index)).setResultsName('indices')
    def replace(s, l, t):    #string, location, parseresults
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        indices = t.indices[0]
        try:
            arg = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        if not len(indices) == arg.ndim:
            raise Exception("indexing '{identifier}' requires {ndim} arguments".format(identifier=identifier, ndim=arg.ndim))
        offset = '+'.join(
            '{identifier}_stride_{i}*{idx}'.format(identifier=identifier, i=i, idx=idx)
            for i, idx in enumerate(indices))
        return '{identifier}[{offset}]'.format(identifier=identifier, offset=offset)
    index_expr.setParseAction(replace)
    return index_expr.transformString(source)
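#e.g. with 'foo' a known 3d array, replace_array_syntax('foo(i,i0,i1)', arg_info) returns
#    'foo[foo_stride_0*i+foo_stride_1*i0+foo_stride_2*i1]'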
def replace_for_syntax(source, arg_info):
    """
    replace: 'for (id in start:stop:step)'
    with: 'for (int id=start; id<stop; id+=step)'
    rather trivial syntactic sugar indeed; just flexing my pyparsing knowledge a bit
    we could implement an unrolling mechanism here too, in case all params are known at compile time and the loop is small?
    """
    identifier = Word(alphas+'_', alphanums+'_')
    integer = Word(nums)
    index = Combine(Optional(Literal('-')) + Or([identifier, integer]))    #index, with an optional unary minus prepended
    colon = Suppress(Literal(':'))
    range = index.setResultsName('start') + colon + index.setResultsName('stop') + Optional(Combine(colon + index), '1').setResultsName('step')
    loop_expr = Literal('for') + '(' + identifier.setResultsName('index') + Literal('in') + range + ')'
    def replace(s, l, t):
        return 'for (int {index}={start}; {index} < {stop}; {index}+={step})'.format(**dict(t))
    loop_expr.setParseAction(replace)
    return loop_expr.transformString(source)
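#e.g. replace_for_syntax('for (j in 0:8:2)', arg_info) returns 'for (int j=0; j < 8; j+=2)'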
def kernel_parsing(arguments, body, axes=None, shape=None):
    """
    transform syntactic sugar into valid CUDA-C code
    """
    arg_info = arg_parse(arguments)

    if shape is None:
        shape = arg_info[arg_info.keys()[0]].shape    #if shape is none, the kernel takes the shape of the first argument
    shape = tuple(shape)

    #create various string options for the axes param? 'serial', 'parallel', 'max_parallel', 'first_serial'
    if axes is None:
        axes = np.zeros(len(shape), np.bool)
        axes[-3:] = 1    #do the last three axes in parallel (needs to be set to two for old cards)
        axes[0] = 0    #the first axis is serial, by default. having at least one looping axis can be beneficial, for shared mem or other reusable terms
        axes[-1] = 1    #unless it is also the last axis, which is always parallel; otherwise we have no coalescing
    axes = np.array(axes, np.bool)
    assert axes[-1]    #the last (stride==1) axis should be parallel. maybe this should be a warning instead, but i can't think of any reason why you'd want it otherwise

    print 'ORIGINAL CODE:'
    print
    print arguments
    print body
    print

    #simple nd-array indexing. probably full of bugs?
    for axis in np.flatnonzero(axes == 0):    #emit loops for the serial axes
        loop = """
unsigned i{n};
for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
""".format(n=axis)
        body = loop + '{\n' + body + '\n}'

    supported_axes = 'xyz'
    if np.count_nonzero(axes == 1) > len(supported_axes):
        raise Exception('requested more parallel axes than are supported by your hardware!')
    for cuda_axis, nd_axis in zip(supported_axes, np.flatnonzero(axes == 1)):    #do thread/block arithmetic for the parallel axes, and map from x,y,(z) to the nd-enumeration
        parallel_axes = """
const unsigned i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
if (i{n} >= kernel_shape_{n}) return;
""".format(n=nd_axis, a=cuda_axis)
        body = parallel_axes + body

    #determine thread blocking
    #warps of THREADS_PER_ROW; the other parallel axes are balanced out
    #just like supported_axes, these constants should be pulled from the surrounding context and its associated device
    parallel_axes = np.count_nonzero(axes == 1)
    THREADS_PER_BLOCK = 512
    THREADS_PER_ROW = 32    #we would like to coalesce 32 threads at the same time
    #WORK IN PROGRESS

    #parsing-based replacements
    body = replace_typing(body)
    body = replace_shape_syntax(body, arg_info)
    body = replace_array_syntax(body, arg_info)
    body = replace_for_syntax(body, arg_info)

    #macro-based substitutions
    generic_template = """
__global__ void ${funcname}(${arguments})
{
${body}
}
"""
    from string import Template
    template = Template(generic_template)    #would be nice to have an indentation-aware template...
    c_arguments, variable_shape_arguments, constant_shape_arguments = build_argument_strings(arg_info)
    #avoid a dangling comma in case there are no variable shape arguments
    all_arguments = ', '.join(s for s in [c_arguments, variable_shape_arguments] if s)

    print 'TRANSFORMED CODE:'
    transform = template.substitute(
        funcname = 'nd_kernel',
        arguments = all_arguments,
        body = '\n\n'.join([
            compile_kernel_shape(shape),
            constant_shape_arguments,
            compute_strides(arg_info),
            body
            ])
        )
    print transform

    def compile(source):
        from pycuda.compiler import SourceModule
        module = SourceModule(source)
        func = module.get_function('nd_kernel')
        return CheckedKernel(arg_info, func)
    return compile(transform)
funky_product = kernel_parsing(
    #the (virtual) ndarray to arrange the kernel call over; as the example demonstrates, we cannot generally deduce this from the arguments.
    #if not specified, the first argument is taken to set the kernel shape
##    shape = (100,100),
    #specify parallel and serial axes manually, if desired; the last (stride==1) axis is made parallel here, the other serial
    #if not specified, the default axis assignment gives the same result for this case
##    axes = [0, 1],
##    axes = [1, 1],
    #note the use of numpy dtypes, and the dimension and type constraints in the argument list
    arguments = """float64[100,100] result, uint32[:,100,100] foo, uint32[8,100,100] bar, float32 scalar""",
    #note the numpythonic shape accessing, and the weave.blitz style array indexing. plus simple looping helpers,
    #and the blissful absence of boilerplate in general :)
    body = """
    float64 r = 0;
    for (i in 0:foo.shape[0])
        for (j in 0:bar.shape[0]:2)
            r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
    result(i0, i1) = r;
    """
    )

from pycuda import gpuarray
out = gpuarray.zeros((100,100), np.float64)
foo = gpuarray.zeros((3,100,100), np.uint32)
foo.fill(1)
bar = gpuarray.zeros((8,100,100), np.uint32)
bar.fill(1)

#attempt to launch this thing, and see if it actually works
funky_product(out, foo, bar, 1.1)
#seems like it!
print out