Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- some conceptual code to illustrate planned kernel syntax stuff
- this is just a small stub of code, for ease of mental access for others who want to look at it or play with it
- it is currently not compatible with my other messy code-base, but that will come
- the intent of this code is to extend elementwise-type pycuda functionality.
- specifically, i aim for a kernel factory that is ndarray-aware. this is taken to mean several things:
- runtime type and shape checking of input arrays
- macros to obtain shapes and strides of arrays within the kernel; both the compile time and runtime constants
- macros to obtain the current index being processed; nd generalization of the 'i' in elementwise
- or even better: arbitrary dereferencing of arrays ala weave.blitz:
- arrayidentifier(x,y,z) -> transformed into an appropriate inner product over strides
- this is all fairly easy to do for a fixed number of dimension (what my main code base is built on);
- but the obvious next step is to emit code that works for any dimension
- in order to do so, we should have some clear logic to decide which axes will
- be looped over (serial), and which axes will be handled by the thread launcher (parallel)
- we should always at least launch 1d kernels to have a minimum of parralelism;
- furthermore, the stride=1 axis should always be parralel for coalesing
- but exactly how many axes we should do parralel is highly problem (and somewhat hardware) dependent
- f.i., 2d deconvolution is typically best done by reading coalesed rows into shared mem,
- and then looping over the other axis
- a 3d kernel on older hardware must be force to loop over the biggest stride, and so on
- anyway, having decided upon a partition of parralel and serial axes:
- parralel axes should emit indexing code of the form:
- unsigned i0 = blockDim.x*blockIdx.x + threadIdx.x;
- and serial axes should (recursively) emit an outer loop of the form:
- unsigned in; for in = 0; in<kernel.shape[n]; in++) { $body}
- the most basis implementation that always works (though not optimally);
- simply always launch 1d threadblocks, and loop over the other axes
- we should probably allow for a manual override of axis paralelism, considering the problem-dependence
- another low priority thing, but which would be cool to have:
- can we make a general shared mem mechanism? upon marking an input arg with a keyword, emit code
- to defer reads of that array to a block of shared mem, with correct padding as decided by the supplied stencil
- shared mem only really comes into play for stencil operations. a simple nd-aware stencil like an outer product
- does not stand to gain much from shared mem; it is still 'elementwise', and needs NxM writes anyway, so there is not much
- point trying to optimize the technically unnecessary NxM reads to N+M
- """
import numpy as np
from pyparsing import *
import pycuda.compyte.dtypes

# populate pycuda's dtype registry before we read it
# NOTE(review): the meaning of the boolean flag is taken on faith from pycuda's API; confirm
pycuda.compyte.dtypes._fill_dtype_registry(True)
# map numpy type-name strings to their C type names; keep only the string keys
# of the registry. '.items()' instead of '.iteritems()': works on python 2 and 3
dtype_to_ctype = {k: v for k, v in pycuda.compyte.dtypes.DTYPE_TO_NAME.items() if isinstance(k, str)}
def arg_parse(arguments):
    """
    Take a raw argument string of the specified syntax, and transform it into
    an OrderedDict of identifier -> (dtype, shape), for further processing.

    The syntax per argument is 'dtype[d0,d1,...] identifier', where each
    dimension is either a positive integer (compile-time size) or ':'
    (size only known at runtime); the '[...]' part is absent for scalars.

    Returns
    -------
    OrderedDict mapping identifier to (dtype, shape); shape is None for
    scalar arguments, otherwise a tuple with None for runtime-sized axes.
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    dtype = oneOf(np_types).setResultsName('dtype')
    # not fully correct; also matches names starting with a numeral. whatever
    identifier = Word(alphanums + '_').setResultsName('identifier')
    positive_integer = Word(nums)
    dimension = Or([positive_integer, Literal(':')])
    shape = nestedExpr('[', ']', delimitedList(dimension)).setResultsName('shape')
    term = Group(dtype + Optional(shape) + identifier)
    grammar = delimitedList(term)

    # convert parsing result to typeinfo tuple, for reasonably efficient
    # runtime checks, and clean downstream code
    def shape_parse(shape):
        # '==' rather than 'is': string identity is a CPython interning
        # accident and must not be relied upon for comparisons
        if shape == '':
            return None  # scalar argument
        shape = tuple(None if size == ':' else int(size) for size in shape[0])
        if len(shape) == 0:
            raise Exception('arrays must have at least one dimension')
        return shape

    from collections import OrderedDict
    arg_info = OrderedDict()
    for argument in grammar.parseString(arguments):
        arg_info[argument.identifier] = argument.dtype, shape_parse(argument.shape)
    return arg_info
def build_argument_strings(arg_info):
    """
    Build C code to represent all the compile time and run time info
    that we seek to pass into the kernel.

    Parameters
    ----------
    arg_info : OrderedDict
        identifier -> (dtype, shape), as produced by arg_parse

    Returns
    -------
    (c_arguments, variable_shape_arguments, constant_shape_arguments):
        the kernel's formal argument list, extra formal arguments for
        runtime-sized axes, and constant definitions for fixed-size axes.
    """
    # add restrict keyword to ptrs by default; my preferred syntax would be
    # to add an ALIAS keyword to suppress this
    c_arguments = ', '.join(
        '{restrict}{type}{ptr} {identifier}'.format(
            type=dtype_to_ctype[dtype],
            ptr='' if shape is None else '*',
            restrict='' if shape is None else '__restrict__ ',
            identifier=identifier)
        for identifier, (dtype, shape) in arg_info.items()
    )
    # one extra kernel argument per axis whose size is only known at runtime
    variable_shape_arguments = ', '.join(
        'constant unsigned {identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)
        for identifier, (dtype, shape) in arg_info.items() if shape is not None
        for dimension, size in enumerate(shape) if size is None
    )
    # one constant definition per axis whose size is fixed at compile time
    constant_shape_arguments = '\n'.join(
        'constant unsigned {identifier}_shape_{dimension} = {size};'.format(identifier=identifier, dimension=dimension, size=size)
        for identifier, (dtype, shape) in arg_info.items() if shape is not None
        for dimension, size in enumerate(shape) if size is not None
    )
    return c_arguments, variable_shape_arguments, constant_shape_arguments
def compile_kernel_shape(shape):
    """
    Emit code that determines the shape of the virtual ndarray over which we
    evaluate the kernel; this function hard-codes the shape into the kernel
    as one C constant per axis.
    """
    lines = []
    for axis, extent in enumerate(shape):
        lines.append('constant unsigned kernel_shape_{dimension} = {size};'.format(dimension=axis, size=extent))
    return '\n'.join(lines)
def runtime_kernel_shape(shape):
    """
    Emit code that determines the shape of the virtual ndarray over which we
    evaluate the kernel, passing the sizes in as extra arguments supplied upon
    calling of the kernel.

    Not implemented yet; see compile_kernel_shape for the hard-coded variant.
    """
    raise NotImplementedError()
def compute_strides(arg_info):
    """
    Generate C code to compute the strides of each array argument.

    I am assuming in all this that the CUDA compiler has a clue about
    optimizing constant expressions; otherwise, might have to use C macros
    for this kind of thing? I do hope unused constants get eliminated.

    Parameters
    ----------
    arg_info : OrderedDict
        identifier -> (dtype, shape); scalar entries (shape None) are skipped.

    Returns
    -------
    str: newline-joined 'constant unsigned ..._stride_n = ...;' definitions,
    plus a total element count '..._size' per array.
    """
    def strides(identifier, shape):
        """
        Inner generator, yielding a sequence of stride definitions for one array.
        NOTE(review): no convention for first/last axis has been settled on, so
        none is applied; this code is for illustrative purposes only.
        """
        stride_template = '{identifier}_stride_{dimension}'
        shape_template = '{identifier}_shape_{dimension}'
        # the last axis gets unit stride
        prev = stride_template.format(identifier=identifier, dimension=len(shape) - 1)
        yield 'constant unsigned {identifier} = {stride};'.format(identifier=prev, stride=1)
        # each preceding stride is the next stride times the next axis extent
        for i, _ in reversed(list(enumerate(shape[:-1]))):
            this = stride_template.format(identifier=identifier, dimension=i)
            size = shape_template.format(identifier=identifier, dimension=i + 1)
            yield 'constant unsigned {this} = {prev} * {size};'.format(this=this, prev=prev, size=size)
            prev = this
        # add total element size as well, for good measure
        size = shape_template.format(identifier=identifier, dimension=0)
        yield 'constant unsigned {identifier}_size = {prev} * {size};'.format(identifier=identifier, prev=prev, size=size)

    return '\n'.join(
        stride_expr
        for identifier, (dtype, shape) in arg_info.items() if shape is not None
        for stride_expr in strides(identifier, shape))
def replace_typing(source):
    """
    Replace numpy type names in the source with their C equivalents.

    This could be more efficient and intelligent; we do not do any semantic
    analysis here, just a simple find and replace. Usage is optional anyway;
    we are fully backwards compatible, free to use ctypes in our code.
    """
    known_types = oneOf(' '.join(dtype_to_ctype.keys()))
    known_types.setParseAction(lambda s, l, t: dtype_to_ctype[t[0]])
    return known_types.transformString(source)
def replace_shape_syntax(source, arg_info):
    """
    Replace arrayidentifier.shape[ndim] syntax with C named variables.

    Silently fails to replace some wrong syntax, like misspelled shape;
    dont worry, the cuda compiler is sure to complain about it :)
    Would it be sufficient and correct to catch all instances of
    'arrayidentifier.'+whatever, that fail to match the whole syntax?
    """
    # matches any identifier, not just declared arrays; undeclared ones are
    # rejected inside the parse action below
    arrayidentifier = Word(alphanums + '_').setResultsName('identifier')
    positive_integer = Word(nums)
    shape_expr = arrayidentifier + Suppress(Literal('.shape')) + nestedExpr('[', ']', positive_integer).setResultsName('dimension')

    def replace(s, l, t):  # string, location, parseresults
        """if match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        dimensions = t.dimension[0]
        if len(dimensions) != 1:
            raise Exception('only simple shape indexing allowed')
        dimension = dimensions[0]
        try:
            dtype, shape = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        try:
            # indexing only performed to validate the dimension; value unused
            size = shape[int(dimension)]
        except Exception:
            raise ParseFatalException('{identifier}.shape[{dimension}] is invalid'.format(identifier=identifier, dimension=dimension))
        return '{identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)

    shape_expr.setParseAction(replace)
    return shape_expr.transformString(source)
def replace_array_syntax(source, arg_info):
    """
    Replace weave.blitz style array indexing, 'arr(i,j,...)', with an inner
    product over strides. We could optionally insert bounds checking code
    here as well, as a debugging aid.
    """
    arrayidentifier = oneOf(' '.join(arg_info.keys())).setResultsName('identifier')
    plain_identifier = Word(alphanums + '_')
    positive_integer = Word(nums)
    index = Or([plain_identifier, positive_integer])
    index_expr = arrayidentifier + nestedExpr('(', ')', delimitedList(index)).setResultsName('indices')

    def replace(s, l, t):  # string, location, parseresults
        """if the match is correct, rewrite the indexing as a strided offset"""
        identifier = t.identifier
        indices = t.indices[0]
        try:
            dtype, shape = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        if len(indices) != len(shape):
            raise Exception("indexing '{identifier}' requires {ndims} arguments".format(identifier=identifier, ndims=len(shape)))
        terms = ['{identifier}_stride_{i}*{idx}'.format(identifier=identifier, i=i, idx=idx)
                 for i, idx in enumerate(indices)]
        return '{identifier}[{offset}]'.format(identifier=identifier, offset='+'.join(terms))

    index_expr.setParseAction(replace)
    return index_expr.transformString(source)
def kernel_parsing(arguments, body, shape, axes):
    """
    Transform syntactic sugar to valid CUDA-C code, and print the result.

    Parameters
    ----------
    arguments : str
        argument declaration string, as understood by arg_parse
    body : str
        kernel body, in the sugared (numpythonic) syntax
    shape : tuple of int
        shape of the (virtual) ndarray to arrange the kernel call over
    axes : sequence of {0, 1}
        per-axis flag; 1 means the axis is parallelized over the thread
        launcher, 0 means it is looped over serially inside the kernel
    """
    print('ORIGINAL CODE:')
    print('')
    print(arguments)
    print(body)
    print('')

    # last (stride==1) axis should be parallel, for coalescing. maybe this
    # should be a warning instead, but i cant think of any reason to allow it
    assert axes[-1] == 1

    # simple nd-array indexing. probably full of bugs
    for axis in np.flatnonzero(np.array(axes) == 0):  # emit loops for serial axes
        loop = """
        unsigned i{n};
        for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
        """.format(n=axis)
        body = loop + '{\n' + body + '\n}'

    # do thread/block arithmetic for parallel axes, mapping x,y,(z) onto the nd-enumeration
    axis_letter = {0: 'x', 1: 'y', 2: 'z'}
    for i, axis in enumerate(np.flatnonzero(np.array(axes) == 1)):
        try:
            letter = axis_letter[i]
        except KeyError:
            # narrow except: only the axis-letter lookup can run out of letters;
            # the previous bare 'except:' also swallowed unrelated errors
            raise Exception('requested more parallel axes than are supported by your hardware!')
        parallel_axes = """
        unsigned i{n};
        i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
        if (i{n} >= kernel_shape_{n}) return;
        """.format(n=axis, a=letter)
        body = parallel_axes + body
    # need some analogous code here to determine threadblock size

    arg_info = arg_parse(arguments)
    # parsing-based replacements
    body = replace_typing(body)
    body = replace_shape_syntax(body, arg_info)
    body = replace_array_syntax(body, arg_info)

    # macro-based substitutions
    generic_template = """
    __global__ ${funcname}(${arguments})
    {
    ${body}
    }
    """
    from string import Template
    template = Template(generic_template)  # would be nice to have an indentation-aware template...
    c_arguments, variable_shape_arguments, constant_shape_arguments = build_argument_strings(arg_info)
    print('TRANSFORMED CODE:')
    print(template.substitute(
        funcname='templated_function',
        arguments=c_arguments + ', ' + variable_shape_arguments,
        body='\n\n'.join([
            compile_kernel_shape(shape),
            constant_shape_arguments,
            compute_strides(arg_info),
            body,
        ])
    ))
class Kernel(object):
    """
    Wrapper around a compiled kernel callable.

    Fixes two defects in the original: it was declared with 'def' instead of
    'class' (so it never was a class at all), and __init__ stored the
    undefined name 'kernel' instead of its 'func' parameter.
    """
    def __init__(self, arguments, func):
        # arguments: identifier -> (dtype, shape) info, as built by arg_parse
        self.arguments = arguments
        # the compiled kernel callable to delegate to
        self.kernel = func

    def __call__(self, *args, **kwargs):
        """
        Launch the kernel.
        We need some logic here to extract runtime variable arguments
        and pass them on in correct order to the compiled kernel.
        """
        self.kernel(*args, **kwargs)
class CheckedKernel(Kernel):
    """
    Kernel subclass that performs runtime type/shape checks on keyword
    arguments before launching.

    Fixes in this revision: declared with 'class' instead of 'def';
    'is' comparisons on strings/tuples replaced by '=='; the super call
    now actually invokes __call__ (the original 'super(...)(...)' tried to
    call the super object itself, a TypeError).
    NOTE(review): 'assert' is stripped under -O; consider raising instead.
    """
    def __call__(self, *args, **kwargs):
        # perform runtime checks on every keyword argument we know about
        for k, v in kwargs.items():
            dtype, shape = self.arguments[k]
            # type check on string representation of dtype; is that safe?
            assert v.dtype.name == dtype
            if shape:
                assert len(shape) == len(v.shape)  # correct number of dimensions
                for size, vsize in zip(shape, v.shape):
                    if size:
                        assert size == vsize  # check all specified dimensions
            else:
                assert v.shape == ()  # scalar numpy type
        super(CheckedKernel, self).__call__(*args, **kwargs)
# example / demo: guarded so importing this module no longer triggers the
# compilation side effect
if __name__ == '__main__':
    funky_product = kernel_parsing(
        # the (virtual) ndarray to arrange the kernel call over; as the example
        # demonstrates, we can not generally deduce this from the arguments.
        # if unspecified, it should be determined at runtime
        shape=(100, 100),
        # specify parallel and serial axes manually (required for now); the
        # last (stride=1) axis is made parallel here, the other serial
        axes=[0, 1],
        # note the use of numpy dtypes, and dimension and type constraints in the argument list
        arguments="""float64[100,100] result, uint32[:,100,100] foo, uint32[8,100,100] bar, float32 scalar""",
        # note the numpythonic shape accessing, and weave.blitz style array
        # indexing, and the blissful absence of boilerplate in general :)
        # (loop counters are now initialized and the final assignment is
        # terminated with ';', so the emitted CUDA-C is actually valid)
        body="""
        float64 r = 0;
        for (uint32 i = 0; i < foo.shape[0]; i++)
            for (uint32 j = 0; j < bar.shape[0]; j++)
                r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
        result(i0, i1) = r;
        """
    )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement