#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
void gpuAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}
__device__ int2 divide_work(int n_jobs, int n_workers, int worker_idx) {
  // Each worker does a contiguous slice of either n_jobs / n_workers or
  // CEIL_DIV(n_jobs, n_workers) jobs. The return value is an int2
  // representing a half-open interval [x, y) of jobs for this worker
  // (perform jobs i for x <= i < y).
  int cd = CEIL_DIV(n_jobs, n_workers);
  int d = n_jobs / n_workers;

  int doing_cd = n_jobs % n_workers;

  int2 retval;
  if (worker_idx < doing_cd) {
    retval.x = worker_idx * cd;
    retval.y = retval.x + cd;
  } else {
    retval.x = doing_cd * cd + (worker_idx - doing_cd) * d;
    retval.y = retval.x + d;
  }
  return retval;
}
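
// Worked example (added for illustration, not in the original paste):
// dividing n_jobs = 10 among n_workers = 3 gives cd = 4, d = 3,
// doing_cd = 1, so worker 0 gets [0, 4), worker 1 gets [4, 7), and
// worker 2 gets [7, 10). The slices are contiguous and cover all 10
// jobs exactly once.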
__device__ int2 compute_warp_start_stop(int block_idx, int warp_idx,
                                        int n_blocks, int n_steps) {
  int2 block_ss = divide_work(n_steps, n_blocks, block_idx);
  int block_start = block_ss.x;
  int block_stop = block_ss.y;
  int block_jobs = block_stop - block_start;

  int2 warp_ss = divide_work(block_jobs, 32, warp_idx);
  int warp_start = block_start + warp_ss.x;
  int warp_stop = block_start + warp_ss.y;

  int2 retval;
  retval.x = warp_start;
  retval.y = warp_stop;
  return retval;
}
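
/*
 * Background note (added commentary, not in the original paste): each step
 * of the recurrence h[t] = decay[t] * h[t-1] + impulse[t] is an affine map
 * h -> a*h + b. Affine maps compose into affine maps:
 *
 *   (a2, b2) o (a1, b1) = (a2 * a1, a2 * b1 + b2)
 *
 * and this composition is associative, which is what lets the kernels below
 * reduce/scan chunks of timesteps independently and stitch the results back
 * together. In the code, cum_decay accumulates the "a" component and h
 * accumulates the "b" component of the composed map.
 */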
__global__ void reduction_kernel(float *decays, float *impulses,
                                 float *initial_state,
                                 float *_decay_storage, float *_h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;
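
  // Storage layout note (added commentary): each block owns a 33 x n_dims
  // slab of the storage arrays. Rows 0..31 hold the per-warp reductions and
  // row 32 holds the whole-block reduction, so index
  // (i + w * n_dims + blockIdx.x * 33 * n_dims) addresses feature i of
  // warp w in this block.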
  float *decay_storage = &_decay_storage[blockIdx.x * 33 * n_dims];
  float *h_storage = &_h_storage[blockIdx.x * 33 * n_dims];

  int2 start_stop = compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Reduce within warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from warp_start to warp_stop (including initial state) at index
   * (feature_idx, warp, block).
   */
  for (int i = lane; i < n_dims; i += 32) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0 && initial_state != NULL) {
      h = initial_state[i];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      cum_decay *= decays[i + t * n_dims];
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
    }

    // TODO: store into shared memory, work in shared memory sized blocks

    // store into global memory
    decay_storage[i + warp * n_dims] = cum_decay;
    h_storage[i + warp * n_dims] = h;
  }

  __syncthreads();

  /*
   * Reduce over warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from block_start to block_stop (including initial state) at index
   * (feature_idx, 32, block).
   */
  // TODO: parallel reduction (or scan). Need to worry about changing the
  // warp reduction values (as they are used again later)
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    for (int t = 0; t < 32; t++) {
      cum_decay *= decay_storage[i + t * n_dims];
      h = decay_storage[i + t * n_dims] * h + h_storage[i + t * n_dims];
    }
    decay_storage[i + 32 * n_dims] = cum_decay;
    h_storage[i + 32 * n_dims] = h;
  }
}
__global__ void block_scan_kernel(float *decay_storage, float *h_storage,
                                  int n_dims, int n_blocks) {
  /*
   * Scan over blocks.
   * After this loop exits, the storage arrays should contain the cumulative
   * sum from block_idx 0 to i (inclusive) at index (feature_idx, 32, i).
   * This means (feature_idx, 32, 2) contains the reduction of blocks 0, 1, and 2.
   */
  // TODO: parallel scan (tricky because the number of blocks isn't
  // necessarily smaller than the number of warps that can fit in a
  // single block)
  for (int i = threadIdx.x + blockIdx.x * blockDim.x;
       i < n_dims;
       i += blockDim.x * gridDim.x) {
    for (int t = 1; t < n_blocks; t++) {
      int cur_idx = i + 32 * n_dims + t * 33 * n_dims;
      int prev_idx = i + 32 * n_dims + (t - 1) * 33 * n_dims;

      // TODO: remove unnecessary reads from global memory (prev_idx accesses)
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] + h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }
}
__global__ void warp_scan_kernel(float *decays, float *impulses,
                                 float *initial_state, float *out,
                                 float *decay_storage, float *h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  // Note: Due to the index ordering of the storage arrays, the following
  // indices are equivalent:
  //
  //   i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims
  //   i + 32 * n_dims + (blockIdx.x - 1) * 33 * n_dims
  //
  // when t is 0. This means something that looks like negative indexing
  // (t - 1) can be used to safely access the stored value for the previous
  // warp (even if the previous warp belonged to the previous block).

  /*
   * Scan over warps.
   * After this loop executes, the storage arrays should contain the
   * cumulative sum from the beginning of the sequence (including the
   * initial condition) up to and including the indexed warp and block.
   */
  // TODO: parallel scan
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    for (int t = 0; t < 32; t++) {
      if (t == 0 && blockIdx.x == 0) {
        // The reduction over warp 0 (including the initial condition) is
        // already the correct scan value, so there's no work to do.
        continue;
      }

      int cur_idx = i + t * n_dims + blockIdx.x * 33 * n_dims;
      int prev_idx = i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims;
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] + h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }

  __syncthreads();

  int2 start_stop = compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Scan within warps.
   * This loop writes to the output array. Each warp reads in its initial
   * state (either from initial_state or from the storage arrays) and then
   * writes output for indices warp_start up to warp_stop.
   */
  for (int i = lane; i < n_dims; i += 32) {
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0) {
      if (initial_state != NULL) {
        h = initial_state[i];
      }
    } else {
      h = h_storage[i + (warp - 1) * n_dims + blockIdx.x * 33 * n_dims];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
      out[i + t * n_dims] = h;
    }
  }
}
- extern "C" {
- /*
- * This is the main method for the prefix sum kernels.
- * decays, impulses, out:
- * each a n_dims x n_steps column major matrix located on GPU
- * initial_state:
- * array of size n_dims located on GPU
- */
- void compute_linear_recurrence(float *decays, float *impulses, float *initial_state,
- float *out, int n_dims, int n_steps) {
- // TODO: query
- int n_SMs = 15;
- int n_blocks_per_sm = 2;
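
  // A possible fix for the TODO above (a sketch, commented out so this
  // paste's behavior is unchanged; assumes device 0 is the target):
  //
  //   cudaDeviceProp prop;
  //   gpuErrChk(cudaGetDeviceProperties(&prop, 0));
  //   n_SMs = prop.multiProcessorCount;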
  // we want at least 32 elements per block, but no reason to run
  // with more than the maximum number of concurrent blocks
  int n_blocks = min(CEIL_DIV(n_steps, 32), n_SMs * n_blocks_per_sm);

  // TODO: make the user pass in working memory? This would allow
  // integration with CNMeM (used by Theano)
  size_t reduction_mem_sz = 2 * n_blocks * 33 * n_dims * sizeof(float);
  float *d_reduction_mem;
  gpuErrChk(cudaMalloc(&d_reduction_mem, reduction_mem_sz));
  float *d_decay_storage = &d_reduction_mem[0 * n_blocks * 33 * n_dims];
  float *d_h_storage = &d_reduction_mem[1 * n_blocks * 33 * n_dims];

  // TODO: run kernels on a non-default stream?
  reduction_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state,
                                       d_decay_storage, d_h_storage,
                                       n_dims, n_steps);
  block_scan_kernel<<<n_blocks, 1024>>>(d_decay_storage, d_h_storage,
                                        n_dims, n_blocks);
  warp_scan_kernel<<<n_blocks, 1024>>>(decays, impulses,
                                       initial_state, out,
                                       d_decay_storage, d_h_storage,
                                       n_dims, n_steps);

  gpuErrChk(cudaFree(d_reduction_mem));
}
}
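
/*
 * Hypothetical usage sketch (added for illustration, not part of the
 * original paste): runs the recurrence h[t] = 0.9 * h[t-1] + 1 for every
 * feature and prints one output value on the host. The sizes and constants
 * here are arbitrary.
 */
int main() {
  int n_dims = 4, n_steps = 100;
  size_t sz = (size_t)n_dims * n_steps * sizeof(float);

  // host buffers: column-major n_dims x n_steps, so element (i, t) lives
  // at index i + t * n_dims
  float *h_decays = (float *)malloc(sz);
  float *h_impulses = (float *)malloc(sz);
  float *h_out = (float *)malloc(sz);
  for (int t = 0; t < n_steps; t++) {
    for (int i = 0; i < n_dims; i++) {
      h_decays[i + t * n_dims] = 0.9f;
      h_impulses[i + t * n_dims] = 1.0f;
    }
  }

  float *d_decays, *d_impulses, *d_out;
  gpuErrChk(cudaMalloc(&d_decays, sz));
  gpuErrChk(cudaMalloc(&d_impulses, sz));
  gpuErrChk(cudaMalloc(&d_out, sz));
  gpuErrChk(cudaMemcpy(d_decays, h_decays, sz, cudaMemcpyHostToDevice));
  gpuErrChk(cudaMemcpy(d_impulses, h_impulses, sz, cudaMemcpyHostToDevice));

  // NULL initial state means h starts at zero
  compute_linear_recurrence(d_decays, d_impulses, NULL, d_out, n_dims, n_steps);
  gpuErrChk(cudaMemcpy(h_out, d_out, sz, cudaMemcpyDeviceToHost));

  // with constant decay a and impulse b, h[t] converges toward b / (1 - a) = 10
  printf("h[0] at final step: %f\n", h_out[0 + (n_steps - 1) * n_dims]);

  gpuErrChk(cudaFree(d_decays));
  gpuErrChk(cudaFree(d_impulses));
  gpuErrChk(cudaFree(d_out));
  free(h_decays); free(h_impulses); free(h_out);
  return 0;
}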