Advertisement
phystota

unroll_2

Nov 5th, 2024
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 10.07 KB | None | 0 0
  #include <cmath>
  #include <cstdlib>
  #include <iostream>
  #include "gpu-new-forward.h"
  4.  
  5. #define TILE_WIDTH 16
  6. #define BLOCK_SIZE 256
  7.  
  /*
   * Input unrolling (im2col) kernel.
   *
   * Writes the unrolled input as ONE row-major matrix with
   *     rows    = Channel * K * K
   *     columns = Batch * Height_out * Width_out
   * where column index = batch * (Height_out * Width_out) + h_out * Width_out + w_out.
   * This is the layout the host code hands to the matrix-multiply kernel
   * (numBRows = Channel*K*K, numBColumns = Batch*Height_out*Width_out), so the
   * product comes out as Map_out x Batch x Height_out x Width_out.
   *
   * Parameters:
   *   input   - input feature maps, indexed as (batch, channel, height, width)
   *   output  - unrolled matrix described above
   *   Batch   - batch size (number of images)
   *   Channel - number of input feature maps
   *   Height  - input height
   *   Width   - input width
   *   K       - kernel height and width (K x K)
   *
   * Launch layout:
   *   1-D thread blocks; blockIdx.y selects the image, and the flattened x
   *   index selects one (channel, output pixel) pair. Threads past
   *   Channel * Height_out * Width_out, or with blockIdx.y >= Batch, do nothing.
   */
  __global__ void matrix_unrolling_kernel(const float *input, float *output,
                                          const int Batch, const int Channel,
                                          const int Height, const int Width,
                                          const int K) {
      const int Height_out = Height - K + 1;
      const int Width_out = Width - K + 1;

      // Nothing to unroll when the mask does not fit inside the image.
      if (Height_out <= 0 || Width_out <= 0) {
          return;
      }

      // Number of output pixels per image == columns contributed by one image.
      const int W_unroll = Height_out * Width_out;
      // Length of one row of the unrolled matrix (all images side by side).
      const size_t row_stride = (size_t) Batch * W_unroll;

      const unsigned int batchN = blockIdx.y;
      const unsigned int t = blockIdx.x * blockDim.x + threadIdx.x;

      // Index math is done in size_t so large batches cannot overflow 32 bits.
      #define in_4d(i3, i2, i1, i0) input[(size_t)(i3) * Channel * Height * Width + (size_t)(i2) * Height * Width + (size_t)(i1) * Width + (i0)] // input(batch, channel, height, width)
      #define out_2d(i1, i0) output[(size_t)(i1) * row_stride + (i0)] // output(unrolled row, unrolled column)

      if (batchN < (unsigned int) Batch && t < (unsigned int) (Channel * W_unroll)) {
          // Channel of the input feature map being collected by this thread.
          const int c = t / W_unroll;
          // Linearized index of the output pixel this thread gathers inputs for.
          const int w_unroll = t % W_unroll;
          // Vertical and horizontal coordinates of that output pixel.
          const int h_out = w_unroll / Width_out;
          const int w_out = w_unroll % Width_out;
          // First unrolled row belonging to channel c.
          const int h_base = c * K * K;
          // Column of the unrolled matrix this thread fills (one strip of K*K rows).
          const size_t col = (size_t) batchN * W_unroll + w_unroll;

          for (int p = 0; p < K; p++) {
              for (int q = 0; q < K; q++) {
                  // Row of the unrolled matrix for mask offset (p, q).
                  const int h_unroll = h_base + p * K + q;
                  out_2d(h_unroll, col) = in_4d(batchN, c, h_out + p, w_out + q);
              }
          }
      }

      #undef out_2d
      #undef in_4d
  }
  65.  
  // Tiled matrix multiplication kernel. Computes C = AB for row-major matrices.
  //
  //   A is numARows x numAColumns, B is numBRows x numBColumns,
  //   C is numCRows x numCColumns.
  //
  // Launch with TILE_WIDTH x TILE_WIDTH thread blocks; blockIdx.x walks the
  // columns of C and blockIdx.y walks its rows. Each block stages one tile of A
  // and one tile of B in shared memory per iteration; tile entries that fall
  // outside A or B are zero-filled so partial tiles contribute nothing.
  __global__ void matrixMultiplyShared(const float *A, const float *B, float *C,
                                       int numARows, int numAColumns,
                                       int numBRows, int numBColumns,
                                       int numCRows, int numCColumns)
  {
      __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
      __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

      int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;

      // Element of C owned by this thread.
      int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
      float val = 0.0f;

      for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
          if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
              tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
          } else {
              tileA[ty][tx] = 0.0f;
          }
          if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
              tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
          } else {
              tileB[ty][tx] = 0.0f;
          }
          // Every thread reaches this barrier: both tiles must be fully written
          // before any thread reads them.
          __syncthreads();

          if (row < numCRows && col < numCColumns) {
              for (int i = 0; i < TILE_WIDTH; i++) {
                  val += tileA[ty][i] * tileB[i][tx];
              }
          }
          // Do not overwrite the tiles until every thread has finished reading them.
          __syncthreads();
      }

      if (row < numCRows && col < numCColumns) {
          // Index in size_t, like the loads above, so a large C cannot overflow int.
          C[(size_t) row * numCColumns + col] = val;
      }
  }
  106.  
  // Permutes the matmul result.
  // The output feature map after matmul is of shape Map_out x Batch x image_size
  // (image_size = Height_out * Width_out), and this kernel rewrites it as
  // Batch x Map_out x image_size.
  //
  // Launch with 1-D blocks of BLOCK_SIZE threads; blockIdx.y selects the image
  // and the flattened x index selects one pixel. Each thread copies that pixel
  // for every output feature map.
  __global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                        int Batch, int image_size) {
      const int b = blockIdx.y;
      const int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
      if (b >= Batch || x >= image_size) {
          return;
      }

      // Offsets are computed in size_t so Map_out * Batch * image_size may exceed INT_MAX.
      const size_t plane = (size_t) image_size;
      const float *src = input + (size_t) b * plane + x;             // input(0, b, x)
      float *dst = output + (size_t) b * Map_out * plane + x;        // output(b, 0, x)

      for (int m = 0; m < Map_out; m++) {
          // output(b, m, x) = input(m, b, x)
          dst[(size_t) m * plane] = src[(size_t) m * Batch * plane];
      }
  }
  122.  
  // Allocates the device buffers for the convolution and uploads the input and mask.
  //
  //   host_output       - not read here (the output is produced on the device)
  //   host_input        - Batch x Channel x Height x Width input on the host
  //   host_mask         - Map_out x Channel x K x K mask on the host
  //   device_output_ptr - receives a device buffer of Batch x Map_out x Height_out x Width_out floats
  //   device_input_ptr  - receives a device copy of host_input
  //   device_mask_ptr   - receives a device copy of host_mask
  //
  // Any CUDA failure is reported on stderr and terminates the process, because
  // the later stages cannot run without these buffers.
  __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  {
      #define PROLOG_CUDA_CHECK(call)                                              \
          do {                                                                     \
              const cudaError_t status_ = (call);                                  \
              if (status_ != cudaSuccess) {                                        \
                  std::cerr << "CUDA error: " << cudaGetErrorString(status_)       \
                            << " (" << #call << ")" << std::endl;                  \
                  std::exit(EXIT_FAILURE);                                         \
              }                                                                    \
          } while (0)

      const int Height_out = Height - K + 1;
      const int Width_out = Width - K + 1;

      // Byte counts are computed in size_t: the element counts alone can exceed INT_MAX.
      const size_t input_size = (size_t) Batch * Channel * Height * Width * sizeof(float);
      const size_t mask_size = (size_t) Map_out * Channel * K * K * sizeof(float);
      const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);

      PROLOG_CUDA_CHECK(cudaMalloc((void **) device_input_ptr, input_size));
      PROLOG_CUDA_CHECK(cudaMalloc((void **) device_mask_ptr, mask_size));
      PROLOG_CUDA_CHECK(cudaMalloc((void **) device_output_ptr, output_size));

      PROLOG_CUDA_CHECK(cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice));
      PROLOG_CUDA_CHECK(cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice));

      #undef PROLOG_CUDA_CHECK
  }
  156.  
  157.  
  // Runs the forward convolution on the device as unroll -> matrix multiply -> permute.
  //
  //   device_output - Batch x Map_out x Height_out x Width_out result buffer
  //   device_input  - Batch x Channel x Height x Width input
  //   device_mask   - Map_out x Channel x K x K mask, used as a
  //                   Map_out x (Channel*K*K) row-major matrix
  //
  // The matrix multiply consumes the unrolled buffer as a row-major
  // (Channel*K*K) x (Batch*Height_out*Width_out) matrix and produces a
  // Map_out x (Batch*Height_out*Width_out) matrix, which the permute kernel
  // rearranges into device_output. Two temporary device buffers are allocated
  // and released on every call. Any CUDA failure is reported on stderr and
  // terminates the process.
  __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  {
      #define FORWARD_CUDA_CHECK(call)                                             \
          do {                                                                     \
              const cudaError_t status_ = (call);                                  \
              if (status_ != cudaSuccess) {                                        \
                  std::cerr << "CUDA error: " << cudaGetErrorString(status_)       \
                            << " (" << #call << ")" << std::endl;                  \
                  std::exit(EXIT_FAILURE);                                         \
              }                                                                    \
          } while (0)

      const int Height_out = Height - K + 1;
      const int Width_out = Width - K + 1;
      const int out_image_size = Height_out * Width_out;
      const int Height_unrolled = Channel * K * K;
      const int Width_unrolled = Batch * out_image_size;

      // Temporary storage: the unrolled input and the raw matmul result.
      // Sizes are computed in size_t so the products cannot overflow int.
      float *unrolled_matrix;  // device buffer holding the unrolled input matrix
      float *matmul_output;    // device buffer holding the matrix-multiplication result
      const size_t unrolled_bytes = (size_t) Batch * Channel * K * K * Height_out * Width_out * sizeof(float);
      const size_t matmul_bytes = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);
      FORWARD_CUDA_CHECK(cudaMalloc((void **) &unrolled_matrix, unrolled_bytes));
      FORWARD_CUDA_CHECK(cudaMalloc((void **) &matmul_output, matmul_bytes));

      // Unrolling: grid.y already enumerates the images, so grid.x only has to
      // cover the Channel * Height_out * Width_out work items of ONE image.
      dim3 unroll_grid((Channel * out_image_size + BLOCK_SIZE - 1) / BLOCK_SIZE, Batch, 1);
      matrix_unrolling_kernel<<<unroll_grid, BLOCK_SIZE>>>(device_input, unrolled_matrix, Batch, Channel, Height, Width, K);
      FORWARD_CUDA_CHECK(cudaGetLastError());

      // Matrix multiply: (Map_out x Height_unrolled) * (Height_unrolled x Width_unrolled).
      dim3 dimGrid((Width_unrolled - 1) / TILE_WIDTH + 1, (Map_out - 1) / TILE_WIDTH + 1, 1);
      dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
      matrixMultiplyShared<<<dimGrid, dimBlock>>>(device_mask, unrolled_matrix, matmul_output,
                                                  Map_out, Height_unrolled,
                                                  Height_unrolled, Width_unrolled,
                                                  Map_out, Width_unrolled);
      FORWARD_CUDA_CHECK(cudaGetLastError());

      // Permute the result of matrix multiplication into Batch x Map_out x image.
      dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
      matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(matmul_output, device_output, Map_out, Batch, out_image_size);
      FORWARD_CUDA_CHECK(cudaGetLastError());

      // Surface any fault raised while the kernels were executing before the
      // temporaries are released.
      FORWARD_CUDA_CHECK(cudaDeviceSynchronize());

      FORWARD_CUDA_CHECK(cudaFree(matmul_output));
      FORWARD_CUDA_CHECK(cudaFree(unrolled_matrix));

      #undef FORWARD_CUDA_CHECK
  }
  189.  
  190.  
  // Copies the convolution result back to the host and releases the device buffers.
  //
  //   host_output   - receives Batch x Map_out x Height_out x Width_out floats
  //   device_output - device result buffer; freed here
  //   device_input  - device input buffer; freed here
  //   device_mask   - device mask buffer; freed here
  //
  // Any CUDA failure is reported on stderr and terminates the process.
  __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  {
      #define EPILOG_CUDA_CHECK(call)                                              \
          do {                                                                     \
              const cudaError_t status_ = (call);                                  \
              if (status_ != cudaSuccess) {                                        \
                  std::cerr << "CUDA error: " << cudaGetErrorString(status_)       \
                            << " (" << #call << ")" << std::endl;                  \
                  std::exit(EXIT_FAILURE);                                         \
              }                                                                    \
          } while (0)

      const int Height_out = Height - K + 1;
      const int Width_out = Width - K + 1;
      // Byte count in size_t: the element count alone can exceed INT_MAX.
      const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);

      // Copy the output back to the host.
      EPILOG_CUDA_CHECK(cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost));

      // Free device memory.
      EPILOG_CUDA_CHECK(cudaFree(device_output));
      EPILOG_CUDA_CHECK(cudaFree(device_input));
      EPILOG_CUDA_CHECK(cudaFree(device_mask));

      #undef EPILOG_CUDA_CHECK
  }
  207.  
  208.  
  // Prints a summary of every CUDA device visible to the runtime to stdout.
  // If the device count cannot be queried, the error is printed and nothing
  // else is reported; a device whose properties cannot be read is skipped.
  __host__ void GPUInterface::get_device_properties()
  {
      // Start from zero so a failed query can never leave the loop bound indeterminate.
      int deviceCount = 0;
      cudaError_t status = cudaGetDeviceCount(&deviceCount);
      if (status != cudaSuccess)
      {
          std::cout<<"CUDA error: "<<cudaGetErrorString(status)<<std::endl;
          return;
      }

      for(int dev = 0; dev < deviceCount; dev++)
      {
          cudaDeviceProp deviceProp;
          status = cudaGetDeviceProperties(&deviceProp, dev);
          if (status != cudaSuccess)
          {
              // deviceProp was not filled in; report and move on to the next device.
              std::cout<<"CUDA error: "<<cudaGetErrorString(status)<<std::endl;
              continue;
          }

          std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
          std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
          std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
          std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
          std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
          std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
          std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
          std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
          std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement