# Untitled: Dask + CuPy + Numba benchmark of a 3x3 mean-filter stencil on GPUs

import asyncio
import time
import numpy as np
import cupy
import numba

import dask.array as da
from dask_cuda import DGX, LocalCUDACluster
from dask.distributed import Client, wait


@numba.cuda.jit
def _smooth_gpu(x, out):
    """CUDA kernel: 3x3 mean filter over the interior cells of a 2-D array.

    Each thread handles the single ``(row, col)`` position it gets from
    ``numba.cuda.grid(2)``.  Threads that land on the outermost rows or
    columns, or outside the array altogether, write nothing, so ``out``
    keeps whatever it already held at those positions.
    """
    row, col = numba.cuda.grid(2)
    n_rows, n_cols = x.shape

    # Only interior cells have a complete 3x3 neighbourhood.
    if row < 1 or row >= n_rows - 1 or col < 1 or col >= n_cols - 1:
        return

    up, down = row - 1, row + 1
    left, right = col - 1, col + 1

    # Neighbours are summed row by row, left to right, then averaged.
    out[row, col] = (
        x[up, left] + x[up, col] + x[up, right]
        + x[row, left] + x[row, col] + x[row, right]
        + x[down, left] + x[down, col] + x[down, right]
    ) / 9


def smooth_gpu(x, out):
    """Launch the ``_smooth_gpu`` kernel on ``x``, writing results into ``out``.

    The launch uses 16x16-thread blocks and as many blocks along each of the
    first two axes of ``x`` as are needed to cover its full extent.
    """
    from math import ceil

    block_dim = (16, 16)
    # Round up so a partial tile at the array edge still gets a block.
    grid_dim = (
        ceil(x.shape[0] / block_dim[0]),
        ceil(x.shape[1] / block_dim[1]),
    )

    _smooth_gpu[grid_dim, block_dim](x, out)


def dispatch_smooth_gpu(x):
    """Smooth ``x`` into a newly allocated CuPy array and return that array.

    The result has the same shape and dtype as ``x`` and starts out filled
    with zeros, so any element ``smooth_gpu`` does not write stays zero.
    """
    smoothed = cupy.zeros(shape=x.shape, dtype=x.dtype)
    smooth_gpu(x, smoothed)
    return smoothed


async def f():
    """Time the GPU smoothing stencil on a Dask array using a DGX cluster.

    Opens an asynchronous ``DGX`` cluster and ``Client``, persists an
    80000 x 80000 random array (10000 x 10000 chunks, generated through
    CuPy's ``RandomState``) and waits for it, then times
    ``x.map_overlap(dispatch_smooth_gpu, depth=1)`` and prints the elapsed
    seconds.
    """
    # Alternative cluster, kept for reference:
    #   async with LocalCUDACluster(asynchronous=True) as cluster:
    async with DGX(asynchronous=True, silence_logs=True) as cluster:
        async with Client(cluster, asynchronous=True) as client:

            # Create a simple random array
            rs = da.random.RandomState(RandomState=cupy.random.RandomState)
            x = rs.random((80000, 80000), chunks=(10000, 10000)).persist()
            # Finish building the input first so its creation is not timed.
            await wait(x)

            # perf_counter() is monotonic, so the measured interval cannot be
            # skewed by system clock adjustments the way time.time() can.
            start = time.perf_counter()
            y = x.map_overlap(dispatch_smooth_gpu, depth=1)
            await y.persist()
            print("Time:", time.perf_counter() - start)


if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs f() to completion and
    # closes the loop afterwards.  Calling asyncio.get_event_loop() when no
    # loop is running is deprecated in current Python releases.
    asyncio.run(f())