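/*
 * CUDA kernels for evaluating the linear recurrence
 *
 *   h[:, t] = decays[:, t] * h[:, t-1] + impulses[:, t]   (elementwise)
 *
 * in parallel over the time dimension. (Summary comment added for
 * readability; inferred from the code below.) The computation runs in three
 * phases: reduction_kernel reduces each warp's slice of time steps and then
 * each block's warps, block_scan_kernel scans the per-block reductions
 * across blocks, and warp_scan_kernel scans across warps and finally within
 * each warp, writing the output.
 */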
#include <assert.h>
#include <stdio.h>

#include <cuda_runtime.h>

#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
void gpuAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  }
}

__device__ int2 divide_work(int n_jobs, int n_workers, int worker_idx) {
  // Each worker does a contiguous slice of either n_jobs / n_workers or
  // ceil_div(n_jobs, n_workers) jobs. The return value is an int2 holding a
  // half-open interval of jobs for the worker to perform (perform jobs i
  // with retval.x <= i < retval.y).

  int cd = CEIL_DIV(n_jobs, n_workers);
  int d = n_jobs / n_workers;

  // The first n_jobs % n_workers workers take the larger slice.
  int doing_cd = n_jobs % n_workers;

  int2 retval;
  if (worker_idx < doing_cd) {
    retval.x = worker_idx * cd;
    retval.y = retval.x + cd;
  } else {
    retval.x = doing_cd * cd + (worker_idx - doing_cd) * d;
    retval.y = retval.x + d;
  }

  return retval;
}
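
// Worked example (added, with arbitrary numbers): divide_work(10, 4, w)
// yields the slices [0, 3), [3, 6), [6, 8), [8, 10) for workers w = 0..3.
// The first 10 % 4 = 2 workers take ceil(10/4) = 3 jobs each; the remaining
// two take floor(10/4) = 2 jobs each.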

__device__ int2 compute_warp_start_stop(int block_idx, int warp_idx,
                                        int n_blocks, int n_steps) {
  // First divide the time steps among the blocks, then divide each block's
  // slice among its 32 warps.
  int2 block_ss = divide_work(n_steps, n_blocks, block_idx);
  int block_start = block_ss.x;
  int block_stop = block_ss.y;
  int block_jobs = block_stop - block_start;

  int2 warp_ss = divide_work(block_jobs, 32, warp_idx);
  int warp_start = block_start + warp_ss.x;
  int warp_stop = block_start + warp_ss.y;

  int2 retval;
  retval.x = warp_start;
  retval.y = warp_stop;
  return retval;
}
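
/*
 * Storage layout note (added commentary, inferred from the indexing below):
 * decay_storage and h_storage hold 33 * n_dims floats per block. Slots
 * 0..31 hold each warp's reduction over its slice of time steps, and slot
 * 32 holds the reduction over the whole block, all indexed as
 * storage[feature + slot * n_dims + block * 33 * n_dims].
 */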
__global__ void reduction_kernel(float *decays, float *impulses,
                                 float *initial_state,
                                 float *_decay_storage, float *_h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  float *decay_storage = &_decay_storage[blockIdx.x * 33 * n_dims];
  float *h_storage = &_h_storage[blockIdx.x * 33 * n_dims];

  int2 start_stop = compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Reduce within warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from warp_start to warp_stop (including initial state) at index
   * (feature_idx, warp, block).
   */
  for (int i = lane; i < n_dims; i += 32) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0 && initial_state != NULL) {
      h = initial_state[i];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      cum_decay *= decays[i + t * n_dims];
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
    }

    // TODO: store into shared memory, work in shared memory sized blocks
    // store into global memory
    decay_storage[i + warp * n_dims] = cum_decay;
    h_storage[i + warp * n_dims] = h;
  }

  __syncthreads();

  /*
   * Reduce over warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from block_start to block_stop (including initial state) at index
   * (feature_idx, 32, block).
   */
  // TODO: parallel reduction (or scan). Need to worry about changing the warp
  //       reduction values (as I use them again later)
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    for (int t = 0; t < 32; t++) {
      cum_decay *= decay_storage[i + t * n_dims];
      h = decay_storage[i + t * n_dims] * h + h_storage[i + t * n_dims];
    }
    decay_storage[i + 32 * n_dims] = cum_decay;
    h_storage[i + 32 * n_dims] = h;
  }
}

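/*
 * Added commentary: each time step applies the affine map h -> a*h + b
 * (a = decay, b = impulse). Two such maps compose as
 *
 *   (a2, b2) o (a1, b1) = (a2 * a1, a2 * b1 + b2),
 *
 * which is exactly the update both scan kernels below perform on the
 * (decay_storage, h_storage) pairs. Associativity of this composition is
 * what makes the parallel scan valid.
 */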
__global__ void block_scan_kernel(float *decay_storage, float *h_storage,
                                  int n_dims, int n_blocks) {
  /*
   * Scan over blocks.
   * After this loop exits, the storage arrays should contain the cumulative
   * sum from block_idx 0 to i (inclusive) at index (feature_idx, 32, i).
   * This means (feature_idx, 32, 2) contains the reduction of blocks 0, 1, and 2.
   */
  // TODO: parallel scan (tricky because number of blocks isn't necessarily
  //       smaller than number of warps that can fit in a single block)
  for (int i = threadIdx.x + blockIdx.x * blockDim.x;
       i < n_dims;
       i += blockDim.x * gridDim.x) {

    for (int t = 1; t < n_blocks; t++) {
      int cur_idx = i + 32 * n_dims + t * 33 * n_dims;
      int prev_idx = i + 32 * n_dims + (t - 1) * 33 * n_dims;

      // TODO: remove unnecessary reads from global memory (prev_idx accesses)
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] + h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }
}

__global__ void warp_scan_kernel(float *decays, float *impulses,
                                 float *initial_state, float *out,
                                 float *decay_storage, float *h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  // Note: Due to the index ordering of the storage arrays, the following
  // indices are equivalent:
  //
  //   i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims
  //   i + 32 * n_dims + (blockIdx.x - 1) * 33 * n_dims
  //
  // when t is 0. This means something that looks like negative indexing
  // (t - 1) can be used to safely access the stored value for the previous
  // warp (even if the previous warp belonged to the previous block).

  /*
   * Scan over warps.
   * After this loop executes, the storage arrays should contain the cumulative
   * sum from the beginning of the sequence (including the initial condition)
   * up to and including the indexed warp and block.
   */
  // TODO: parallel scan
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    for (int t = 0; t < 32; t++) {
      if (t == 0 && blockIdx.x == 0) {
        // The reduction over warp 0 (including the initial condition) is
        // already the correct value for the scan, so there's no work to do.
        continue;
      }

      int cur_idx = i + t * n_dims + blockIdx.x * 33 * n_dims;
      int prev_idx = i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims;
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] + h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }

  __syncthreads();

  int2 start_stop = compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Scan within warps.
   * This loop writes to the output array. Each warp reads in its initial state
   * (either from "initial_state" or the storage arrays) and then writes to
   * output for indices warp_start up to warp_stop.
   */
  for (int i = lane; i < n_dims; i += 32) {
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0) {
      if (initial_state != NULL) {
        h = initial_state[i];
      }
    } else {
      h = h_storage[i + (warp - 1) * n_dims + blockIdx.x * 33 * n_dims];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
      out[i + t * n_dims] = h;
    }
  }
}


extern "C" {
/*
 * This is the main method for the prefix sum kernels.
 * decays, impulses, out:
 *   each an n_dims x n_steps column-major matrix located on the GPU
 * initial_state:
 *   array of size n_dims located on the GPU
 */
void compute_linear_recurrence(float *decays, float *impulses, float *initial_state,
                               float *out, int n_dims, int n_steps) {

  // TODO: query the device instead of hardcoding these occupancy numbers
  int n_SMs = 15;
  int n_blocks_per_sm = 2;
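
  // One possible way to address the TODO above (added sketch, not in the
  // original paste): query the multiprocessor count of the current device
  // at runtime rather than hardcoding 15.
  //
  //   int device;
  //   gpuErrChk(cudaGetDevice(&device));
  //   gpuErrChk(cudaDeviceGetAttribute(&n_SMs, cudaDevAttrMultiProcessorCount, device));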

  // We want at least 32 time steps per block, but there's no reason to run
  // with more than the maximum number of concurrently resident blocks.
  int n_blocks = min(CEIL_DIV(n_steps, 32), n_SMs * n_blocks_per_sm);

  // TODO: make user pass in working memory? This allows integration
  //       with CNMeM (used by Theano)
  size_t reduction_mem_sz = 2 * n_blocks * 33 * n_dims * sizeof(float);
  float *d_reduction_mem;
  gpuErrChk(cudaMalloc(&d_reduction_mem, reduction_mem_sz));
  float *d_decay_storage = &d_reduction_mem[0 * n_blocks * 33 * n_dims];
  float *d_h_storage = &d_reduction_mem[1 * n_blocks * 33 * n_dims];

  // TODO: run kernels on non-default stream?
  reduction_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state,
                                       d_decay_storage, d_h_storage,
                                       n_dims, n_steps);

  block_scan_kernel<<<n_blocks, 1024>>>(d_decay_storage, d_h_storage,
                                        n_dims, n_blocks);

  warp_scan_kernel<<<n_blocks, 1024>>>(decays, impulses,
                                       initial_state, out,
                                       d_decay_storage, d_h_storage,
                                       n_dims, n_steps);

  gpuErrChk(cudaFree(d_reduction_mem));
}
}
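
/*
 * Minimal usage sketch (added for illustration; not part of the original
 * paste). The dimensions, the 0.9 decay, and the LINEAR_RECURRENCE_DEMO
 * guard macro are all arbitrary choices. Build with something like:
 *
 *   nvcc -DLINEAR_RECURRENCE_DEMO -o demo linear_recurrence.cu
 */
#ifdef LINEAR_RECURRENCE_DEMO
#include <stdlib.h>

int main() {
  const int n_dims = 64;
  const int n_steps = 1000;
  const size_t sz = (size_t)n_dims * n_steps * sizeof(float);

  // Host data: a constant decay of 0.9 and a constant impulse of 1.0,
  // so every dimension follows h[t] = 0.9 * h[t-1] + 1.0.
  float *h_decays = (float *)malloc(sz);
  float *h_impulses = (float *)malloc(sz);
  float *h_out = (float *)malloc(sz);
  for (int i = 0; i < n_dims * n_steps; i++) {
    h_decays[i] = 0.9f;
    h_impulses[i] = 1.0f;
  }

  float *d_decays, *d_impulses, *d_out;
  gpuErrChk(cudaMalloc(&d_decays, sz));
  gpuErrChk(cudaMalloc(&d_impulses, sz));
  gpuErrChk(cudaMalloc(&d_out, sz));
  gpuErrChk(cudaMemcpy(d_decays, h_decays, sz, cudaMemcpyHostToDevice));
  gpuErrChk(cudaMemcpy(d_impulses, h_impulses, sz, cudaMemcpyHostToDevice));

  // A NULL initial_state means the recurrence starts from h = 0.
  compute_linear_recurrence(d_decays, d_impulses, NULL, d_out, n_dims, n_steps);
  gpuErrChk(cudaMemcpy(h_out, d_out, sz, cudaMemcpyDeviceToHost));

  // The fixed point of h = 0.9 * h + 1 is 10, so late entries approach 10.
  printf("h[dim 0, t = 0]    = %f\n", h_out[0]);
  printf("h[dim 0, t = last] = %f\n", h_out[(n_steps - 1) * n_dims]);

  gpuErrChk(cudaFree(d_decays));
  gpuErrChk(cudaFree(d_impulses));
  gpuErrChk(cudaFree(d_out));
  free(h_decays);
  free(h_impulses);
  free(h_out);
  return 0;
}
#endif  // LINEAR_RECURRENCE_DEMO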