- """
- kernel test
- """
- from pycuda.driver import init, Device
- init()
- device = Device(0)
- context = device.make_context()
- from pycuda import gpuarray
- import numpy as np
class Kernel(object):
    """wrapper for a pycuda kernel; checks and binds arguments"""
    def __init__(self, decl, kernel, parallel_axes):
        self.decl = decl                        #kernel declaration object; encodes the function signature
        self.kernel = kernel                    #pycuda kernel handle
        self.parallel_axes = parallel_axes      #bool array marking the parallel axes; needed for thread blocking

class CheckedKernel(Kernel):
    """
    quite a complex beast, it appears
    anyway, we can worry about optimizing later
    perhaps a specialized subclass for when all constants are known at compile time,
    which is the case most often encountered in practice
    """
    def __call__(self, *args, **kwargs):
        arg_list = []
        dummies = {}
        def handle_shape(decl, value):
            """check a shape value against a shape declaration; dummies encountered in the declaration are bound"""
            assert(len(value) == len(decl))         #assert the correct number of dimensions
            for dsize, vsize in zip(decl, value):
                if isinstance(dsize, np.uint32):
                    assert(dsize == vsize)          #compile-time specified shape; make sure it matches
                elif dsize == ':':
                    arg_list.append(np.uint32(vsize))   #this is a runtime argument; append it to the argument list
                else:   #dummy arg; add it to the dummy dict, and check consistency
                    size = dummies.get(dsize, vsize)
                    dummies[dsize] = vsize
                    assert(size == vsize)
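        #for example (hypothetical declaration): matching the decl (np.uint32(8), ':', 'n')
        #against the value (8, 5, 7) passes the fixed-size check on 8, appends uint32(5)
        #to arg_list as a runtime shape argument, and binds the dummy 'n' to 7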
        #perform runtime checks on the input args
        for i, (value, decl) in enumerate(zip(args, self.decl.inputs)):
            if decl.is_array:
                try:
                    assert(value.dtype.name == decl.dtype)  #type check on the string representation of the dtype; is that safe?
                    arg_list.append(value.gpudata)
                    handle_shape(decl.shape, value.shape)
                except Exception:
                    raise Exception("argument {i} does not satisfy the gpuarray interface".format(i=i))
            else:   #scalar type
                try:
                    cast = getattr(np, decl.dtype)
                    arg_list.append(cast(value))
                except Exception:
                    raise Exception("argument {i} could not be cast to {dtype}".format(i=i, dtype=decl.dtype))
        #see if we can get dummies from the optional kernel shape argument
        shape = kwargs.pop('shape', None)
        if shape: handle_shape(self.decl.shape, shape)
        #make sure all declared dummies are indeed bound
        assert(all(d in dummies for d in self.decl.dummies))
        #transform a shape declaration to its bound form, by expanding its dummies
        def complete_shape(shape):
            return tuple(int(dummies.get(s, s)) for s in shape)
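        #for instance, with dummies == {'n': 5}, the declaration ('n', np.uint32(3)) completes to (5, 3)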
        #process the output arguments; they are all internally allocated for now,
        #but we should also allow preallocated output args to be passed in to the kernel
        return_list = []    #list of things to return from the kernel
        for decl in self.decl.outputs:
            value = gpuarray.GPUArray(complete_shape(decl.shape), decl.dtype)
            if decl.default: value.fill(decl.default)
            arg_list.append(value.gpudata)
            return_list.append(value)
        #append the dummies to the argument list
        for d in self.decl.dummies: arg_list.append(np.uint32(dummies[d]))
        #set the thread grid; all shape dimensions are known at this point
        shape = complete_shape(self.decl.shape)
        axes = np.flatnonzero(self.parallel_axes)
        THREADS_PER_BLOCK = 512     #this should be pulled from context.device, really
        leftover = THREADS_PER_BLOCK
        block = [1]*3
        #rather simple thread block assignment: assign as many threads as possible to the slowest stride, and overflow into the next
        #it should work well under most circumstances, but it needs a manual override
        for i, a in enumerate(axes):
            block[i] = min(leftover, shape[a])
            leftover = leftover // block[i]
            if leftover == 0: break
        block = tuple(block)
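        #worked example: shape (100, 99) with both axes parallel gives
        #block[0] = min(512, 100) = 100, leftover = 512 // 100 = 5,
        #block[1] = min(5, 99) = 5, so block == (100, 5, 1)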
        def iDivUp(a, b):
            #round a / b up to the nearest higher integer value
            a = np.int32(a)
            b = np.int32(b)
            r = (a // b + 1) if (a % b != 0) else (a // b)
            return int(r)
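        #e.g. iDivUp(99, 5) == 20 and iDivUp(100, 100) == 1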
        grid = tuple(iDivUp(shape[a], b) for b, a in zip(block, axes))
        kwargs.update(dict(grid=grid, block=block))
        self.kernel(*arg_list, **kwargs)
        return tuple(return_list)

def nd_kernel(source, axes=None):
    """
    transform syntactic sugar into valid CUDA-C code
    """
    print 'ORIGINAL CODE:'
    print
    print source
    print
    from numpycuda_parsing import parsing
    decl, body = parsing(source)
    #create various string options for the axes param? 'all_serial', 'all_parallel', 'max_parallel', 'first_serial'?
    if axes is None:
        axes = np.zeros(len(decl.shape), np.bool)
        axes[-3:] = 1   #do the last three axes in parallel (needs to be set to two for old cards)
        axes[0] = 0     #the first axis is serial, by default; having at least one looping axis can be beneficial, for shared mem or other reusable terms
        axes[-1] = 1    #unless it is also the last axis, which is always parallel; otherwise we have no coalescing
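        #net effect of these defaults: 1d -> [parallel], 2d -> [serial, parallel], 3d -> [serial, parallel, parallel]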
    axes = np.array(axes, np.bool)
    assert(axes[-1] == True)    #the last (stride==1) axis should be parallel. maybe this should be a warning instead, but i can't think of any reason why you'd want otherwise
    #nd-array indexing over the serial and parallel axes
    for axis in np.flatnonzero(axes == 0):      #emit loops for the serial axes
        loop = """
        //loop over serial axis
        unsigned i{n};
        for (i{n}=0; i{n} < kernel_shape_{n}; i{n}++ )
        """.format(n=axis)
        body = loop + '{\n' + body + '\n}'
    supported_axes = 'xyz'
    if np.count_nonzero(axes == 1) > len(supported_axes):
        raise Exception('requested more parallel axes than are supported by your hardware!')
    for cuda_axis, nd_axis in zip(supported_axes, np.flatnonzero(axes == 1)):   #do the thread/block arithmetic for the parallel axes, and map from x,y,(z) to the nd-enumeration
        parallel_axes = """
        //launch threads for parallel axis
        unsigned const i{n} = blockDim.{a}*blockIdx.{a} + threadIdx.{a};
        if (i{n} >= kernel_shape_{n}) return;
        """.format(n=nd_axis, a=cuda_axis)
        body = parallel_axes + body
    #macro-based substitutions, to assemble the total kernel
    generic_template = """
    __global__ void ${funcname}(${arguments})
    {
        ${init}
        //kernel body
        ${body}
    }
    """
    from string import Template
    template = Template(generic_template)   #would be nice to have an indentation-aware template...
    funcname = 'nd_kernel'
    transform = template.substitute(
        funcname  = funcname,
        arguments = decl.argument_string,
        init      = decl.init_string,
        body      = body,
    )
    print 'TRANSFORMED CODE:'
    print
    print transform
    print
    def compile(source):
        from pycuda.compiler import SourceModule
        module = SourceModule(source)
        func = module.get_function(funcname)
        return CheckedKernel(decl, func, axes)
    return compile(transform)
#2d meshgrid function
mesh_grid_2 = nd_kernel(
    """
    (int32[2,n,m] result) << [n,m] << ():
        result(0, i0,i1) = i0;
        result(1, i0,i1) = i1;
    """,
)
print mesh_grid_2(shape=(4,4))  #we have no input args; the kernel shape needs to be explicitly specified
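#for illustration: with the default axes for a 2d kernel ([serial, parallel]), nd_kernel
#wraps the body roughly as below. the signature and the kernel_shape_* / result(...)
#indexing macros come from numpycuda_parsing, so their exact form here is an assumption:
#
#   __global__ void nd_kernel(int *result, unsigned n, unsigned m)
#   {
#       //launch threads for parallel axis
#       unsigned const i1 = blockDim.x*blockIdx.x + threadIdx.x;
#       if (i1 >= kernel_shape_1) return;
#       //loop over serial axis
#       unsigned i0;
#       for (i0=0; i0 < kernel_shape_0; i0++ )
#       {
#           //kernel body
#           result(0, i0,i1) = i0;
#           result(1, i0,i1) = i1;
#       }
#   }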
outer_product = nd_kernel(
    """
    (float32[n,m] result = 0) << [n,m] << (float32[n] x0, float32[m] x1):
        result(i0, i1) = x0(i0) * x1(i1);
    """,
)
x0 = gpuarray.empty((10,), np.float32); x0.fill(1)
x1 = gpuarray.empty((20,), np.float32); x1.fill(1)
print outer_product(x0, x1)     #the kernel shape is deduced from the input arguments
funky_product = nd_kernel(
    """
    (float32[n,m] result, uint32 count=0) << [n,m] << (uint32[:,n,m] foo, uint32[8,n,m] bar, float32 scalar):
        float32 r = 0;
        for (i in 0:foo.shape[0])
            for (j in 0:bar.shape[0]:2)
                r += foo(i,i0,i1) * bar(j,i0,i1) * scalar;
        result(i0, i1) = r;
        atomicAdd(count, 1);
    """,
    axes = [1,1]    #override the default axes parallelism; gives the same result
)
foo = gpuarray.zeros((3,100,99), np.uint32); foo.fill(1)
bar = gpuarray.zeros((8,100,99), np.uint32); bar.fill(1)
print funky_product(foo, bar, 1.1)

quit()
def mesh_grid_nd(ndim, dtype=np.int32):
    """nd meshgrid factory function. parametrizes the function on dimensionality and type; you kinda want to know these at compile time"""
    alphas = 'nmkpqrs'  #pool of dummy axis names; assumed here, since alphas was left undefined
    dummies = ','.join(alphas[:ndim])
    idx = ','.join('i%i' % n for n in xrange(ndim))
    body = '\n'.join("""result({i}, {idx}) = i{i};""".format(i=i, idx=idx) for i in xrange(ndim))
    mesh_grid_kernel = nd_kernel(
        """({dtype}[{ndim},{dummies}] result) << [{dummies}] << ():\n{body}""".format(
            dtype=np.dtype(dtype).name, ndim=ndim, dummies=dummies, body=body)
    )
    #wrapper func, to handle allocation and kernel launch
    def mesh_grid(shape):
        """returns a gpu mesh grid of the given shape"""
        result, = mesh_grid_kernel(shape=shape)
        return result
    return mesh_grid
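#hypothetical usage of the factory above (unreached; the script quit()s earlier):
#   mesh_grid_3 = mesh_grid_nd(3)
#   print mesh_grid_3((4,4,4))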
def broadcasting_op(op, ndim, dtype):
    """
    general broadcasting requires smart strides
    as a simple hack, any size == 1 dimension is broadcastable:
    all we need to do is set the corresponding strides to zero, which forces that index to stay at zero
    """
    alphas = 'nmkpqrs'  #same assumed dummy-name pool as above
    dummies = ','.join(alphas[:ndim])
    idx = ','.join('i%i' % i for i in xrange(ndim))
    colons = ','.join([':'] * ndim)
    source = """
    ({dtype}[{dummies}] output) << [{dummies}] << ({dtype}[{colons}] l, {dtype}[{colons}] r):
        output({idx}) = l({idx}) {operation} r({idx});
    """.format(dtype=np.dtype(dtype).name, dummies=dummies, colons=colons, idx=idx, operation=op)
    broad_kernel = nd_kernel(source)
    def func(L, R):
        broad_shape = tuple(max(l, r) for l, r in zip(L.shape, R.shape))
        #bind the kernel shape argument
        return broad_kernel(L, R, shape=broad_shape)
    return func
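#hypothetical usage (also unreached by this script):
#   add = broadcasting_op('+', 1, np.float32)
#   s, = add(x0, x0)    #elementwise sum; equal shapes broadcast trivially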