PYCUDA ndarray kernel musings
EelcoHoogendoorn
Aug 27th, 2012

"""
Some conceptual code to illustrate planned kernel syntax stuff.
This is just a small stub of code, for ease of mental access for others who want to look at it or play with it.
It is currently not compatible with my other messy code-base, but that will come.


The intent of this code is to extend elementwise-type pycuda functionality.
Specifically, I aim for a kernel factory that is ndarray-aware. This is taken to mean several things:
    runtime type and shape checking of input arrays
    macros to obtain shapes and strides of arrays within the kernel; both the compile-time and runtime constants
    macros to obtain the current index being processed; an nd generalization of the 'i' in elementwise
    or even better: arbitrary dereferencing of arrays a la weave.blitz:
        arrayidentifier(x,y,z) -> transformed into an appropriate inner product over strides


This is all fairly easy to do for a fixed number of dimensions (what my main code base is built on);
but the obvious next step is to emit code that works for any dimension.

In order to do so, we should have some clear logic to decide which axes will
be looped over (serial), and which axes will be handled by the thread launcher (parallel).

We should always launch at least a 1d kernel, to have a minimum of parallelism;
furthermore, the stride==1 axis should always be parallel, for coalescing.
But exactly how many axes we should do in parallel is highly problem- (and somewhat hardware-) dependent.
For instance, 2d deconvolution is typically best done by reading coalesced rows into shared mem,
and then looping over the other axis;
a 3d kernel on older hardware must be forced to loop over the biggest stride, and so on.

Anyway, having decided upon a partition of parallel and serial axes:
    parallel axes should emit indexing code of the form:
        unsigned i0 = blockDim.x*blockIdx.x + threadIdx.x;
    and serial axes should (recursively) emit an outer loop of the form:
        unsigned in; for (in = 0; in < kernel_shape_n; in++) { $body }

The most basic implementation that always works (though not optimally)
simply always launches 1d threadblocks, and loops over the other axes.

We should probably allow for a manual override of axis parallelism, considering the problem-dependence.


Another low-priority thing, but one which would be cool to have:
    can we make a general shared mem mechanism? Upon marking an input arg with a keyword, emit code
    to defer reads of that array to a block of shared mem, with correct padding as decided by the supplied stencil.
    Shared mem only really comes into play for stencil operations. A simple nd-aware operation like an outer product
    does not stand to gain much from shared mem; it is still 'elementwise', and needs NxM writes anyway, so there is not much
    point trying to optimize the technically unnecessary NxM reads down to N+M.
"""
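
# A minimal stand-alone sketch of the emission scheme described above (a
# hypothetical helper, not part of the code below): given a boolean partition
# of axes into parallel and serial, it wraps a kernel body in thread-index
# assignments and outer loops.
def emit_indexing(axes, body):
    """axes[i] is True for parallel axes (mapped to CUDA thread dims x/y/z),
    False for serial axes (wrapped in an outer loop)."""
    parallel = [i for i, p in enumerate(axes) if p]
    serial = [i for i, p in enumerate(axes) if not p]
    # serial axes: recursively wrap the body in loops
    for n in reversed(serial):
        body = ('for (unsigned i%d = 0; i%d < kernel_shape_%d; i%d++) {\n' % (n, n, n, n)
                + body + '\n}')
    # parallel axes: one thread-grid dimension each
    lines = ['unsigned i%d = blockDim.%s*blockIdx.%s + threadIdx.%s;' % (n, a, a, a)
             for a, n in zip('xyz', parallel)]
    return '\n'.join(lines + [body])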



import numpy as np

from pyparsing import *

import pycuda.compyte.dtypes
pycuda.compyte.dtypes._fill_dtype_registry(True)
dtype_to_ctype = {k:v for k,v in pycuda.compyte.dtypes.DTYPE_TO_NAME.iteritems() if isinstance(k, str)}

def arg_parse(arguments):
    """
    take a raw argument string of the specified syntax, and transform it
    to an OrderedDict of identifier: (dtype, shape), for further processing
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    dtype = oneOf(np_types).setResultsName('dtype')

    identifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')   #first character may not be a numeral

    positive_integer = Word(nums)
    dimension = Or([positive_integer, Literal(':')])
    shape = nestedExpr('[', ']', delimitedList(dimension)).setResultsName('shape')

    term = Group(dtype + Optional(shape) + identifier)
    grammar = delimitedList(term)

    #convert parsing result to typeinfo object, for reasonably efficient runtime checks, and clean downstream code
    def shape_parse(shape):     #apply parse actions to shape args
        if shape == '': return None     #scalar argument
        shape = tuple(None if size == ':' else int(size) for size in shape[0])
        if len(shape)==0: raise Exception('arrays must have at least one dimension')
        return shape

    from collections import OrderedDict
    arg_info = OrderedDict()
    for argument in grammar.parseString(arguments):
        arg_info[argument.identifier] = argument.dtype, shape_parse(argument.shape)

    return arg_info

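To make the argument grammar concrete without pulling in pyparsing or pycuda, here is a regex-based sketch (a stand-alone illustration, not the parser above) of the same mapping: `dtype[dims] name` becomes `{name: (dtype, shape)}`, where a `:` dimension becomes None (size known only at runtime), and an absent `[dims]` means a scalar.

```python
import re

def arg_parse_sketch(arguments):
    # one term: dtype, optional bracketed dimension list, identifier
    term = re.compile(r'\s*(\w+)\s*(?:\[([\d:,\s]+)\])?\s*(\w+)\s*')
    arg_info = {}
    for dtype, dims, name in term.findall(arguments):
        if dims:
            shape = tuple(None if d.strip() == ':' else int(d) for d in dims.split(','))
        else:
            shape = None    #scalar argument
        arg_info[name] = (dtype, shape)
    return arg_info

# an abridged version of the example argument string from further below:
info = arg_parse_sketch("float64[100,100] result, uint32[:,100,100] foo, float32 scalar")
# info['foo'] == ('uint32', (None, 100, 100)); info['scalar'] == ('float32', None)
```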

def build_argument_strings(arg_info):
    """
    build C code to represent all the compile-time and run-time info
    that we seek to pass into the kernel
    """
    #add restrict keyword to ptrs by default; my preferred syntax would be to add an ALIAS keyword to suppress this
    c_arguments = ', '.join(
        '{restrict}{type}{ptr} {identifier}'.format(
            type=dtype_to_ctype[dtype],
            ptr='' if shape is None else '*',
            restrict='' if shape is None else '__restrict__ ',
            identifier=identifier)
            for identifier,(dtype, shape) in arg_info.iteritems()
    )

    variable_shape_arguments = ', '.join(
        'constant unsigned {identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)
            for identifier,(dtype, shape) in arg_info.iteritems() if shape is not None
                for dimension, size in enumerate(shape) if size is None
    )

    constant_shape_arguments = '\n'.join(
        'constant unsigned {identifier}_shape_{dimension} = {size};'.format(identifier=identifier, dimension=dimension, size=size)
            for identifier,(dtype, shape) in arg_info.iteritems() if shape is not None
                for dimension, size in enumerate(shape) if size is not None
    )

    return c_arguments, variable_shape_arguments, constant_shape_arguments

def compile_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    this function hardcodes the shape into the kernel
    """
    return '\n'.join(
        'constant unsigned kernel_shape_{dimension} = {size};'.format(dimension=dimension, size=size)
            for dimension,size in enumerate(shape)
        )

def runtime_kernel_shape(shape):
    """
    emit code that determines the shape of the virtual ndarray over which we evaluate the kernel
    emits extra arguments to be supplied upon calling of the kernel
    """
    raise NotImplementedError()


def compute_strides(arg_info):
    """
    generate code to compute the strides of each array.
    I am assuming in all this that the CUDA compiler has a clue about optimizing constant expressions;
    otherwise, we might have to use C macros for this kind of thing.
    I do hope unused constants get eliminated, and so on.
    """
    def strides(identifier, shape):
        """
        inner generator, yielding a sequence of stride definitions for each array.
        I haven't actually settled on a convention for what is the first and last axis;
        not having such a convention, it isn't applied either. In other words, this code
        is for illustrative purposes only.
        """
        stride_template = '{identifier}_stride_{dimension}'
        shape_template = '{identifier}_shape_{dimension}'
        prev = stride_template.format(identifier=identifier, dimension=len(shape)-1)
        yield 'constant unsigned {identifier} = {stride};'.format(identifier=prev, stride=1)
        for i, size in reversed(list(enumerate(shape[:-1]))):
            this = stride_template.format(identifier=identifier, dimension=i)
            size = shape_template.format(identifier=identifier, dimension=i+1)
            yield 'constant unsigned {this} = {prev} * {size};'.format(this=this, prev=prev, size=size)
            prev = this
        #add total element count as well, for good measure
        size = shape_template.format(identifier=identifier, dimension=0)
        yield 'constant unsigned {identifier}_size = {prev} * {size};'.format(identifier=identifier, prev=prev, size=size)

    return '\n'.join(
        stride_expr
            for identifier,(dtype, shape) in arg_info.iteritems() if shape is not None
                for stride_expr in strides(identifier, shape))

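The stride recurrence used above, in plain numeric form (a sketch, assuming C-contiguous layout): the last axis has stride 1, and each earlier axis has stride equal to the next axis's stride times the next axis's size.

```python
def c_contiguous_strides(shape):
    strides = [1]                       #last axis: stride 1, for coalescing
    for size in reversed(shape[1:]):
        strides.append(strides[-1] * size)  #each axis: next stride * next size
    return tuple(reversed(strides))

# e.g. for the uint32[8,100,100] argument further below:
# c_contiguous_strides((8, 100, 100)) -> (10000, 100, 1)
```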

def replace_typing(source):
    """
    replace numpy types with c-types. this could be more efficient and intelligent:
    we do not do any semantic analysis here, just simple find and replace.
    but usage is optional anyway; we are fully backwards compatible, and free to use ctypes in our code
    """
    np_types = ' '.join(dtype_to_ctype.keys())
    type_grammar = oneOf(np_types)
    type_grammar.setParseAction(lambda s,l,t: dtype_to_ctype[t[0]])
    return type_grammar.transformString(source)

def replace_shape_syntax(source, arg_info):
    """
    replace arrayidentifier.shape[ndim] syntax with C named variables.
    silently fails to replace some wrong syntax, like a misspelled 'shape';
    dont worry, the cuda compiler is sure to complain about it :)
    would it be sufficient and correct to catch all instances of 'arrayidentifier.'+whatever
    that fail to match the whole syntax?
    """
    arrayidentifier = Word(alphas+'_', alphanums+'_').setResultsName('identifier')
    positive_integer = Word(nums)
    shape_expr = arrayidentifier + Suppress(Literal('.shape')) + nestedExpr('[', ']', positive_integer).setResultsName('dimension')

    def replace(s, l, t):    #string, location, parse results
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        dimensions = t.dimension[0]
        if not len(dimensions)==1: raise Exception('only simple shape indexing allowed')
        dimension = dimensions[0]
        try:
            dtype, shape = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))
        try:
            size = shape[int(dimension)]
        except Exception:
            raise ParseFatalException('{identifier}.shape[{dimension}] is invalid'.format(identifier=identifier, dimension=dimension))

        return '{identifier}_shape_{dimension}'.format(identifier=identifier, dimension=dimension)
    shape_expr.setParseAction(replace)

    return shape_expr.transformString(source)

def replace_array_syntax(source, arg_info):
    """
    replace weave.blitz style array indexing with an inner product over strides.
    we could optionally insert bounds checking code here as well, as a debugging aid
    """
    arrayidentifier = oneOf(' '.join(arg_info.keys())).setResultsName('identifier')
    identifier = Word(alphas+'_', alphanums+'_')
    positive_integer = Word(nums)
    index = Or([identifier, positive_integer])
    index_expr = arrayidentifier + nestedExpr('(', ')', delimitedList(index)).setResultsName('indices')

    def replace(s, l, t):    #string, location, parse results
        """if the match is correct, replace numpy syntax with c-compatible syntax"""
        identifier = t.identifier
        indices = t.indices[0]

        try:
            dtype, shape = arg_info[identifier]
        except KeyError:
            raise ParseFatalException("array '{identifier}' is not defined".format(identifier=identifier))

        if not len(indices)==len(shape):
            raise Exception("indexing '{identifier}' requires {ndims} arguments".format(identifier=identifier, ndims=len(shape)))

        offset = '+'.join(
            '{identifier}_stride_{i}*{idx}'.format(identifier=identifier, i=i, idx=idx)
                for i,idx in enumerate(indices))
        return '{identifier}[{offset}]'.format(identifier=identifier, offset=offset)
    index_expr.setParseAction(replace)

    return index_expr.transformString(source)

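What the transform above produces, sketched with a plain regex substitution (a stand-alone illustration only; the real version validates identifiers and index counts against arg_info): `foo(i, j)` becomes `foo[foo_stride_0*i+foo_stride_1*j]`.

```python
import re

def replace_array_syntax_sketch(source, array_names):
    # match a known array name followed by a flat, paren-free index list
    pattern = re.compile(r'\b(%s)\(([^()]*)\)' % '|'.join(array_names))
    def replace(match):
        name, args = match.group(1), match.group(2)
        indices = [a.strip() for a in args.split(',')]
        # inner product of strides and indices, as named C constants
        offset = '+'.join('%s_stride_%d*%s' % (name, i, idx) for i, idx in enumerate(indices))
        return '%s[%s]' % (name, offset)
    return pattern.sub(replace, source)

# replace_array_syntax_sketch('r += foo(i,i0,i1);', ['foo'])
# -> 'r += foo[foo_stride_0*i+foo_stride_1*i0+foo_stride_2*i1];'
```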


def kernel_parsing(arguments, body, shape, axes):
    """
    transform syntactic sugar to valid CUDA-C code
    """

    print 'ORIGINAL CODE:'
    print
    print arguments
    print body
    print


    assert(axes[-1]==True)        #last (stride==1) axis should be parallel. maybe this should be a warning instead, but i cant think of any reason to allow otherwise

    #simple nd-array indexing. probably full of bugs
    for axis in np.flatnonzero(np.array(axes)==0):  #emit loops for serial axes
        loop = """
        unsigned i{n};
        for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
        """.format(n=axis)
        body = loop + '{\n' + body + '\n}'

    d = {0:'x', 1:'y', 2:'z'}
    for i,axis in enumerate(np.flatnonzero(np.array(axes)==1)):    #do thread/block arithmetic for parallel axes, and map from x,y,(z) to nd-enumeration
        try:
            parallel_axes = """
        unsigned i{n};
        i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
        if (i{n} >= kernel_shape_{n}) return;
        """.format(n=axis, a=d[i])
            body = parallel_axes + body
        except KeyError:
            raise Exception('requested more parallel axes than are supported by your hardware!')
    #need some analogous code here to determine threadblock size


    arg_info = arg_parse(arguments)


    #parsing-based replacements
    body = replace_typing(body)
    body = replace_shape_syntax(body, arg_info)
    body = replace_array_syntax(body, arg_info)


    #macro-based substitutions
    generic_template = """
    __global__ void ${funcname}(${arguments})
    {
        ${body}
    }
    """
    from string import Template
    template = Template(generic_template)   #would be nice to have an indentation-aware template...

    c_arguments, variable_shape_arguments, constant_shape_arguments = build_argument_strings(arg_info)

    print 'TRANSFORMED CODE:'
    print template.substitute(
        funcname = 'templated_function',
        arguments = c_arguments+', '+variable_shape_arguments,
        body = '\n\n'.join([
                compile_kernel_shape(shape),
                constant_shape_arguments,
                compute_strides(arg_info),
                body
            ])
        )


class Kernel(object):
    """wrapper around a compiled kernel plus its parsed argument info"""
    def __init__(self, arg_info, kernel):
        self.arg_info = arg_info
        self.kernel = kernel

    def __call__(self, *args, **kwargs):
        """
        launch kernel.
        we need some logic here to extract runtime variable arguments
        and pass them on in correct order to the compiled kernel
        """
        self.kernel(*args, **kwargs)

class CheckedKernel(Kernel):

    def __call__(self, *args, **kwargs):
        #perform runtime checks
        for k,v in kwargs.iteritems():
            dtype, shape = self.arg_info[k]
            assert(v.dtype.name == dtype)           #type check on string representation of dtype; is that safe?
            if shape:
                assert(len(shape)==len(v.shape))    #assert correct number of dimensions
                for size, vsize in zip(shape, v.shape):
                    if size: assert(size == vsize)  #check all specified dimensions
            else:
                assert(v.shape == ())       #scalar numpy type
        super(CheckedKernel, self).__call__(*args, **kwargs)

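The runtime checks above, sketched stand-alone: a (dtype, shape) spec, with None marking an unconstrained dimension, is matched against an array's dtype name and shape. A tiny stub stands in for an ndarray so the sketch needs no numpy.

```python
from collections import namedtuple

FakeArray = namedtuple('FakeArray', ['dtype_name', 'shape'])    #stand-in for an ndarray

def check(spec, array):
    dtype, shape = spec
    if array.dtype_name != dtype:           #type check on string representation of dtype
        return False
    if shape is None:                       #scalar argument
        return array.shape == ()
    if len(shape) != len(array.shape):      #dimension count must match
        return False
    return all(size is None or size == vsize    #None matches any size
               for size, vsize in zip(shape, array.shape))

# check(('uint32', (None, 100, 100)), FakeArray('uint32', (8, 100, 100))) -> True
# check(('uint32', (None, 100, 100)), FakeArray('uint32', (8, 100, 99)))  -> False
```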



funky_product = kernel_parsing(
    #the (virtual) ndarray to arrange the kernel call over; as the example demonstrates, we can not generally deduce this from the arguments. if unspecified, it should be determined at runtime
    shape = (100,100),
    #specify parallel and serial axes manually (required for now); the last (stride==1) axis is made parallel here, the other serial
    axes = [0, 1],
    #note the use of numpy dtypes, and the dimension and type constraints in the argument list
    arguments = """float64[100,100] result, uint32[:,100,100] foo, uint32[8,100,100] bar, float32 scalar""",
    #note the numpythonic shape accessing, and the weave.blitz style array indexing. and the blissful absence of boilerplate in general :)
    body = """
    float64 r = 0;
    for (uint32 i = 0; i < foo.shape[0]; i++)
        for (uint32 j = 0; j < bar.shape[0]; j++)
            r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
    result(i0, i1) = r;
    """
    )