- """
- some conceptual code to illustrate planned kernel syntax stuff
- this is just a small stub of code, for ease of mental access for others who want to look at it or play with it
- it is currently not compatible with my other messy code-base, but that will come
- the intent of this code is to extend elementwise-type pycuda functionality.
- specifically, i aim for a kernel factory that is ndarray-aware. this is taken to mean several things:
- runtime type and shape checking of input arrays
- macros to obtain shapes and strides of arrays within the kernel; both the compile time and runtime constants
- macros to obtain the current index being processed; nd generalization of the 'i' in elementwise
- or even better: arbitrary dereferencing of arrays ala weave.blitz:
- arrayidentifier(x,y,z) -> transformed into an appropriate inner product over strides
- this is all fairly easy to do for a fixed number of dimension (what my main code base is built on);
- but the obvious next step is to emit code that works for any dimension
- in order to do so, we should have some clear logic to decide which axes will
- be looped over (serial), and which axes will be handled by the thread launcher (parallel)
- we should always at least launch 1d kernels to have a minimum of parralelism;
- furthermore, the stride=1 axis should always be parralel for coalesing
- but exactly how many axes we should do parralel is highly problem (and somewhat hardware) dependent
- f.i., 2d deconvolution is typically best done by reading coalesed rows into shared mem,
- and then looping over the other axis
- a 3d kernel on older hardware must be force to loop over the biggest stride, and so on
- anyway, having decided upon a partition of parralel and serial axes:
- parralel axes should emit indexing code of the form:
- unsigned i0 = blockDim.x*blockIdx.x + threadIdx.x;
- and serial axes should (recursively) emit an outer loop of the form:
- unsigned in; for in = 0; in<kernel.shape[n]; in++) { $body}
- the most basis implementation that always works (though not optimally);
- simply always launch 1d threadblocks, and loop over the other axes
- we should probably allow for a manual override of axis paralelism, considering the problem-dependence
- another low priority thing, but which would be cool to have:
- can we make a general shared mem mechanism? upon marking an input arg with a keyword, emit code
- to defer reads of that array to a block of shared mem, with correct padding as decided by the supplied stencil
- shared mem only really comes into play for stencil operations. a simple nd-aware stencil like an outer product
- does not stand to gain much from shared mem; it is still 'elementwise', and needs NxM writes anyway, so there is not much
- point trying to optimize the technically unnecessary NxM reads to N+M
- what about class hierarchy?
- nd_kernel is essentially the base
- elementwise is simply an 1d nd_kernel; subclass with some constraints
- stencil_kernel is also a subclass of nd_kernel. it calls upon nd_kernel base to do most work
- all it does itself is pass in a pre-composed body, with the optionally unrolled inner stencil loop
- """
import numpy as np
from pyparsing import *

from pycuda.driver import init, Device
init()
device = Device(1)    #note: second gpu on this machine; adapt to your setup
context = device.make_context()

#importing pycuda.tools fills the compyte dtype registry as a side effect
import pycuda.tools
import pycuda.compyte.dtypes
##pycuda.compyte.dtypes._fill_dtype_registry(True)

#mapping from numpy dtype names to c type names; keep only the string keys of the registry
dtype_to_ctype = {k: v for k, v in pycuda.compyte.dtypes.DTYPE_TO_NAME.iteritems() if isinstance(k, str)}
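#for instance, with a typical registry, this gives mappings along the lines of:
#    dtype_to_ctype['float32'] -> 'float'
#    dtype_to_ctype['float64'] -> 'double'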
class Kernel(object):
    """wrapper for a compiled pycuda kernel; binds arguments"""
    def __init__(self, arguments, kernel):
        self.arguments = arguments
        self.kernel = kernel

    def __call__(self, *args, **kwargs):
        """
        launch kernel
        extract the runtime shape arguments from the passed arrays,
        and pass them on in the correct order to the compiled kernel
        note that no checking is performed here; see CheckedKernel for that
        """
        args = list(args) + [
            np.uint32(size)
            for value, arg in zip(args, self.arguments.itervalues()) if arg.is_array
            for size, fixed in zip(value.shape, arg.shape) if fixed is None]
        self.kernel(*args, **kwargs)
class CheckedKernel(Kernel):
    def __call__(self, *args, **kwargs):
        size_args = []
        normal_args = []
        arg_info = self.arguments
        #perform runtime checks
        for i, (value, (identifier, arg)) in enumerate(zip(args, arg_info.iteritems())):
            if arg.is_array:
                print value.dtype.name, arg.dtype
                assert value.dtype.name == arg.dtype    #type check on the string representation of the dtype; is that safe?
                assert len(arg.shape) == len(value.shape)    #assert the correct number of dimensions
                for asize, vsize in zip(arg.shape, value.shape):
                    if asize:
                        assert asize == vsize    #check all dimensions specified at compile time
                    else:
                        size_args.append(np.uint32(vsize))    #this is a runtime argument
                normal_args.append(value.gpudata)
            else:
                try:
                    cast = getattr(np, arg.dtype)
                    normal_args.append(cast(value))
                except Exception:
                    raise Exception("argument {i} could not be cast to {dtype}".format(i=i, dtype=arg.dtype))
        #grid/block are hardcoded to fit the demo below; they should be computed from the kernel shape and axes
        grid = (10, 1, 1)
        block = (10, 1, 1)
        kwargs.update(dict(grid=grid, block=block))
        args = normal_args + size_args
        self.kernel(*args, **kwargs)
##        super(CheckedKernel, self).__call__(*args, **kwargs)
class ArgumentInfo(object):
    def __init__(self, identifier, shape, dtype):
        self.identifier = identifier
        self.shape = shape
        self.dtype = dtype
    @property
    def ctype(self): return dtype_to_ctype[self.dtype]
    @property
    def is_scalar(self): return self.shape is None
    @property
    def is_array(self): return not self.is_scalar
    @property
    def ndim(self): return len(self.shape)
def arg_parse(arguments):
    """
    take a raw argument string of the specified syntax, and transform it
    into an OrderedDict of ArgumentInfo objects
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    dtype = oneOf(np_types).setResultsName('dtype')
    identifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')    #first character must not be a numeral
    positive_integer = Word(nums)
    dimension = Or([positive_integer, Literal(':')])
    shape = nestedExpr('[', ']', delimitedList(dimension)).setResultsName('shape')
    term = Group(dtype + Optional(shape) + identifier)
    grammar = delimitedList(term)

    #convert the parsing result to ArgumentInfo objects, for reasonably efficient runtime checks, and clean downstream code
    def shape_postparse(shape):
        if shape == '': return None    #scalar argument
        shape = tuple(None if size == ':' else int(size) for size in shape[0])
        if len(shape) == 0: raise Exception('arrays must have at least one dimension')
        return shape

    from collections import OrderedDict
    arg_info = OrderedDict()
    for argument in grammar.parseString(arguments):
        arg_info[argument.identifier] = ArgumentInfo(argument.identifier, shape_postparse(argument.shape), argument.dtype)
    return arg_info
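#for illustration, a hypothetical signature:
#    arg_parse('float32[:,10] img, uint32 n')
#would yield an OrderedDict along the lines of:
#    'img' -> ArgumentInfo(identifier='img', shape=(None, 10), dtype='float32')
#    'n'   -> ArgumentInfo(identifier='n',   shape=None,       dtype='uint32')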
def build_argument_strings(arg_info):
    """
    build c code to represent all the compile time and run time info
    that we seek to pass into the kernel
    """
    #add the restrict keyword to pointers by default; my preferred syntax would be to add an ALIAS keyword to suppress this
    c_arguments = ', '.join(
        '{type}{ptr} const {restrict}{identifier}'.format(
            type=arg.ctype, ptr='*'*arg.is_array, restrict='__restrict__ '*arg.is_array, identifier=arg.identifier)
        for arg in arg_info.itervalues()
        )
    #dimensions not known at compile time become extra kernel arguments
    variable_shape_arguments = ', '.join(
        'const unsigned {identifier}_shape_{dimension}'.format(identifier=arg.identifier, dimension=dimension)
        for arg in arg_info.itervalues() if arg.is_array
        for dimension, size in enumerate(arg.shape) if size is None
        )
    #dimensions known at compile time are hardcoded into the kernel
    constant_shape_arguments = '\n'.join(
        'const unsigned {identifier}_shape_{dimension} = {size};'.format(identifier=arg.identifier, dimension=dimension, size=size)
        for arg in arg_info.itervalues() if arg.is_array
        for dimension, size in enumerate(arg.shape) if not size is None
        )
    return c_arguments, variable_shape_arguments, constant_shape_arguments
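#for the hypothetical 'float32[:,10] img' argument above, this would produce roughly:
#    c_arguments:              '..., float* const __restrict__ img, ...'
#    variable_shape_arguments: 'const unsigned img_shape_0'
#    constant_shape_arguments: 'const unsigned img_shape_1 = 10;'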
def compile_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    this function hardcodes the shape into the kernel
    """
    return '\n'.join(
        'const unsigned kernel_shape_{dimension} = {size};'.format(dimension=dimension, size=size)
        for dimension, size in enumerate(shape)
        )
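#for instance, compile_kernel_shape((100, 100)) returns:
#    const unsigned kernel_shape_0 = 100;
#    const unsigned kernel_shape_1 = 100;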
def runtime_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    emits extra arguments to be supplied upon calling of the kernel
    """
    raise NotImplementedError()
def compute_strides(arg_info):
    """
    generate code to compute the strides of each array
    i am assuming in all this that the CUDA compiler has a clue about optimizing constant expressions;
    otherwise, we might have to use C macros for this kind of thing
    i do hope unused constants get eliminated, and so on
    """
    def strides(identifier, shape):
        """
        inner generator, yielding a sequence of stride definitions for a single array
        i haven't actually settled on a convention for what is the first and last axis;
        not having such a convention, it isn't applied either. in other words, this code
        is for illustrative purposes only
        """
        stride_template = '{identifier}_stride_{dimension}'
        shape_template = '{identifier}_shape_{dimension}'
        #the last axis gets stride one (c-contiguous convention)
        prev = stride_template.format(identifier=identifier, dimension=len(shape)-1)
        yield 'const unsigned {identifier} = {stride};'.format(identifier=prev, stride=1)
        #each preceding stride is the product of the following stride and shape
        for i, size in reversed(list(enumerate(shape[:-1]))):
            this = stride_template.format(identifier=identifier, dimension=i)
            size = shape_template.format(identifier=identifier, dimension=i+1)
            yield 'const unsigned {this} = {prev} * {size};'.format(this=this, prev=prev, size=size)
            prev = this
        #add the total element count as well, for good measure
        size = shape_template.format(identifier=identifier, dimension=0)
        yield 'const unsigned {identifier}_size = {prev} * {size};'.format(identifier=identifier, prev=prev, size=size)
    return '\n'.join(
        stride_expr
        for identifier, arg in arg_info.iteritems() if arg.is_array
        for stride_expr in strides(identifier, arg.shape))
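#for a 2d array 'img', the inner generator yields:
#    const unsigned img_stride_1 = 1;
#    const unsigned img_stride_0 = img_stride_1 * img_shape_1;
#    const unsigned img_size = img_stride_0 * img_shape_0;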
def replace_typing(source):
    """
    replace numpy types with c types. this could be more efficient and intelligent;
    we do not do any semantic analysis here, just a simple find and replace
    but usage is optional anyway; we are fully backwards compatible, and free to use c types in our code
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    type_grammar = oneOf(np_types)
    type_grammar.setParseAction(lambda s, l, t: dtype_to_ctype[t[0]])
    return type_grammar.transformString(source)
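#e.g. replace_typing('float64 r = 0;') returns 'double r = 0;'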
def replace_shape_syntax(source, arg_info):
    """
    replace arrayidentifier.shape[dim] syntax with named C variables
    this silently fails to replace some incorrect syntax, like a misspelled 'shape';
    dont worry, the cuda compiler is sure to complain about it :)
    would it be sufficient and correct to catch all instances of 'arrayidentifier.'+whatever
    that fail to match the whole syntax?
    """
    arrayidentifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')
    positive_integer = Word(nums)
    shape_expr = arrayidentifier + Suppress(Literal('.shape')) + nestedExpr('[', ']', positive_integer).setResultsName('dimension')
    def replace(s, l, t):    #string, location, parseresults
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        dimensions = t.dimension[0]
        if not len(dimensions) == 1: raise Exception('only simple shape indexing is allowed')
        dimension = dimensions[0]
        try:
            arg = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        try:
            size = arg.shape[int(dimension)]
        except Exception:
            raise ParseFatalException('{identifier}.shape[{dimension}] is invalid'.format(identifier=identifier, dimension=dimension))
        return '{identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)
    shape_expr.setParseAction(replace)
    return shape_expr.transformString(source)
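#e.g. with 'foo' declared as an array argument, replace_shape_syntax('foo.shape[0]', arg_info) returns 'foo_shape_0'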
def replace_array_syntax(source, arg_info):
    """
    replace weave.blitz style array indexing with an inner product over strides
    we could optionally insert bounds checking code here as well, as a debugging aid
    should we allow for partial indexing? not sure
    """
    arrayidentifier = oneOf(' '.join(arg_info.keys())).setResultsName('identifier')
##    arrayidentifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')
    identifier = Word(alphas+'_', alphanums+'_')
    positive_integer = Word(nums)
    index = Or([identifier, positive_integer])
    index_expr = arrayidentifier + nestedExpr('(', ')', delimitedList(index)).setResultsName('indices')
    def replace(s, l, t):    #string, location, parseresults
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        indices = t.indices[0]
        try:
            arg = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        if not len(indices) == arg.ndim:
            raise Exception("indexing '{identifier}' requires {ndim} arguments".format(identifier=identifier, ndim=arg.ndim))
        offset = '+'.join(
            '{identifier}_stride_{i}*{idx}'.format(identifier=identifier, i=i, idx=idx)
            for i, idx in enumerate(indices))
        return '{identifier}[{offset}]'.format(identifier=identifier, offset=offset)
    index_expr.setParseAction(replace)
    return index_expr.transformString(source)
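#e.g. with 'foo' a known 3d array, replace_array_syntax('foo(i,i0,i1)', arg_info) returns
#    'foo[foo_stride_0*i+foo_stride_1*i0+foo_stride_2*i1]'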
def replace_for_syntax(source, arg_info):
    """
    replace: 'for (id in start:stop:step)'
    with: 'for (int id=start; id<stop; id+=step)'
    rather trivial syntactic sugar indeed; just flexing my pyparsing knowledge a bit
    we could implement an unrolling mechanism here too, in case all params are known at compile time and the loop is small?
    """
    identifier = Word(alphas+'_', alphanums+'_')
    integer = Word(nums)
    index = Combine(Optional(Literal('-')) + Or([identifier, integer]))    #index, with an optional unary minus prepended
    colon = Suppress(Literal(':'))
    range = index.setResultsName('start') + colon + index.setResultsName('stop') + Optional(Combine(colon + index), '1').setResultsName('step')
    loop_expr = Literal('for') + '(' + identifier.setResultsName('index') + Literal('in') + range + ')'
    def replace(s, l, t):
        return 'for (int {index}={start}; {index} < {stop}; {index}+={step})'.format(**dict(t))
    loop_expr.setParseAction(replace)
    return loop_expr.transformString(source)
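#e.g. replace_for_syntax('for (j in 0:8:2)', arg_info) returns 'for (int j=0; j < 8; j+=2)'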
def kernel_parsing(arguments, body, axes=None, shape=None):
    """
    transform syntactic sugar into valid CUDA-C code
    """
    arg_info = arg_parse(arguments)

    if shape is None:
        shape = arg_info[arg_info.keys()[0]].shape    #if shape is none, the kernel takes the shape of the first argument
    shape = tuple(shape)

    #create various string options for the axes param? 'serial', 'parallel', 'max_parallel', 'first_serial'
    if axes is None:
        axes = np.zeros(len(shape), np.bool)
        axes[-3:] = 1    #do the last three axes in parallel (needs to be set to two for old cards)
        axes[0] = 0    #the first axis is serial, by default. having at least one looping axis can be beneficial, for shared mem or other reusable terms
        axes[-1] = 1    #unless it is also the last axis, which is always parallel; otherwise we have no coalescing
    axes = np.array(axes, np.bool)
    assert axes[-1]    #the last (stride==1) axis should be parallel. maybe this should be a warning instead, but i can't think of any reason why you'd want it otherwise

    print 'ORIGINAL CODE:'
    print
    print arguments
    print body
    print

    #simple nd-array indexing. probably full of bugs?
    for axis in np.flatnonzero(axes == 0):    #emit loops for the serial axes
        loop = """
unsigned i{n};
for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
""".format(n=axis)
        body = loop + '{\n' + body + '\n}'

    supported_axes = 'xyz'
    if np.count_nonzero(axes == 1) > len(supported_axes):
        raise Exception('requested more parallel axes than are supported by your hardware!')
    for cuda_axis, nd_axis in zip(supported_axes, np.flatnonzero(axes == 1)):    #do thread/block arithmetic for the parallel axes, and map from x,y,(z) to the nd-enumeration
        parallel_axes = """
const unsigned i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
if (i{n} >= kernel_shape_{n}) return;
""".format(n=nd_axis, a=cuda_axis)
        body = parallel_axes + body

    #determine thread blocking
    #warps of THREADS_PER_ROW; the other parallel axes are balanced out
    #just like supported_axes, these constants should be pulled from the surrounding context and its associated device
    parallel_axes = np.count_nonzero(axes == 1)
    THREADS_PER_BLOCK = 512
    THREADS_PER_ROW = 32    #we would like to coalesce 32 threads at the same time
    #WORK IN PROGRESS

    #parsing-based replacements
    body = replace_typing(body)
    body = replace_shape_syntax(body, arg_info)
    body = replace_array_syntax(body, arg_info)
    body = replace_for_syntax(body, arg_info)

    #macro-based substitutions
    generic_template = """
__global__ void ${funcname}(${arguments})
{
${body}
}
"""
    from string import Template
    template = Template(generic_template)    #would be nice to have an indentation-aware template...
    c_arguments, variable_shape_arguments, constant_shape_arguments = build_argument_strings(arg_info)
    #avoid a dangling comma in case there are no variable shape arguments
    all_arguments = ', '.join(s for s in [c_arguments, variable_shape_arguments] if s)

    print 'TRANSFORMED CODE:'
    transform = template.substitute(
        funcname = 'nd_kernel',
        arguments = all_arguments,
        body = '\n\n'.join([
            compile_kernel_shape(shape),
            constant_shape_arguments,
            compute_strides(arg_info),
            body
            ])
        )
    print transform

    def compile(source):
        from pycuda.compiler import SourceModule
        module = SourceModule(source)
        func = module.get_function('nd_kernel')
        return CheckedKernel(arg_info, func)
    return compile(transform)
funky_product = kernel_parsing(
    #the (virtual) ndarray to arrange the kernel call over; as the example demonstrates, we cannot generally deduce this from the arguments.
    #if not specified, the first argument is taken to set the kernel shape
##    shape = (100,100),
    #specify parallel and serial axes manually, if desired; the last (stride==1) axis is made parallel here, the other serial
    #if not specified, the default axis assignment gives the same result for this case
##    axes = [0, 1],
##    axes = [1, 1],
    #note the use of numpy dtypes, and the dimension and type constraints in the argument list
    arguments = """float64[100,100] result, uint32[:,100,100] foo, uint32[8,100,100] bar, float32 scalar""",
    #note the numpythonic shape accessing, and the weave.blitz style array indexing. plus simple looping helpers,
    #and the blissful absence of boilerplate in general :)
    body = """
    float64 r = 0;
    for (i in 0:foo.shape[0])
        for (j in 0:bar.shape[0]:2)
            r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
    result(i0, i1) = r;
    """
    )

from pycuda import gpuarray
out = gpuarray.zeros((100,100), np.float64)
foo = gpuarray.zeros((3,100,100), np.uint32)
foo.fill(1)
bar = gpuarray.zeros((8,100,100), np.uint32)
bar.fill(1)

#attempt to launch this thing, and see if it actually works
funky_product(out, foo, bar, 1.1)
#seems like it!
print out