phystota

FP16_working

Dec 2nd, 2024
C++ | 10.69 KB
#include <cmath>
#include <iostream>
#include "gpu-new-forward.h"
#include <cuda_fp16.h> // Added for FP16 support

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

// Added constant memory for FP16 mask storage
__constant__ half KERNEL_DEVICE_CST[3136];
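// Note: the prolog copies the FP16 mask into this constant buffer, but the
// GEMM kernel below still reads the mask from its global-memory argument,
// so KERNEL_DEVICE_CST is currently written and never read.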

// New kernel to convert FP32 to FP16
__global__ void convertFloatToHalf(half *output, const float *input, const int numElements) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < numElements) {
        output[idx] = __float2half(input[idx]);
    }
}

// Modified matrix unrolling kernel to work with FP16
__global__ void matrix_unrolling_kernel_fp16(const half *input, half *output,
                                           const int Batch, const int Channel,
                                           const int Height, const int Width,
                                           const int K) {

    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;
    const size_t H_unroll = Channel * K * K;
    const size_t W_total_unroll = Batch * W_unroll;
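    // Layout of the unrolled matrix: H_unroll = Channel * K * K rows and
    // Batch * W_unroll columns; each column holds the input patch that
    // contributes to one output pixel of one image in the batch.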

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
        return;
    }

    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    // Copy the FP16 input values for this patch into the unrolled layout
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}

// Modified matrix multiplication kernel for FP16
__global__ void matrixMultiplyShared_fp16(const half *A, const half *B, float *C,
                                         int numARows, int numAColumns,
                                         int numBRows, int numBColumns,
                                         int numCRows, int numCColumns) {
    __shared__ half tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ half tileB[TILE_WIDTH][TILE_WIDTH];

    int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;
    int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
    half acc = __float2half(0.0f);
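    // The accumulator stays in half precision for the entire dot product;
    // this keeps all arithmetic in FP16 but gives up some accuracy compared
    // with accumulating in float.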

    for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
        if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = A[(size_t)row * numAColumns + tileId * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = __float2half(0.0f);
        }

        if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
            tileB[ty][tx] = B[((size_t)tileId * TILE_WIDTH + ty) * numBColumns + col];
        } else {
            tileB[ty][tx] = __float2half(0.0f);
        }

        __syncthreads();

        if (row < numCRows && col < numCColumns) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                // Using FP16 multiplication and addition
                acc = __hadd(acc, __hmul(tileA[ty][i], tileB[i][tx]));
            }
        }
        __syncthreads();
    }
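    // A fused alternative (a sketch, not part of the original code) would be
    // acc = __hfma(tileA[ty][i], tileB[i][tx], acc), which performs the
    // multiply-add with a single rounding step.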

    if (row < numCRows && col < numCColumns) {
        // Convert back to FP32 for output
        C[(size_t)row * numCColumns + col] = __half2float(acc);
    }
}

// Keep the original matrix_permute_kernel as is since it works with FP32 output
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                    int Batch, int image_size) {
    int b = blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[b * Map_out * image_size + m * image_size + x] =
                    input[m * Batch * image_size + b * image_size + x];
        }
    }
}

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    // Calculate output dimensions and sizes
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const int input_size = Batch * Channel * Height * Width;
    const int mask_size = Map_out * Channel * K * K;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Allocate memory for FP16 versions
    half *device_input_fp16, *device_mask_fp16;
    cudaMalloc(&device_input_fp16, input_size * sizeof(half));
    cudaMalloc(&device_mask_fp16, mask_size * sizeof(half));
    cudaMalloc(device_output_ptr, output_size * sizeof(float));

    // Temporary FP32 storage for conversion
    float *device_input_fp32, *device_mask_fp32;
    cudaMalloc(&device_input_fp32, input_size * sizeof(float));
    cudaMalloc(&device_mask_fp32, mask_size * sizeof(float));

    // Copy FP32 data to device
    cudaMemcpy(device_input_fp32, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_mask_fp32, host_mask, mask_size * sizeof(float), cudaMemcpyHostToDevice);

    // Convert to FP16
    const int blockSize = 256;
    dim3 gridDim_input((input_size + blockSize - 1) / blockSize);
    dim3 gridDim_mask((mask_size + blockSize - 1) / blockSize);

    convertFloatToHalf<<<gridDim_input, blockSize>>>(device_input_fp16, device_input_fp32, input_size);
    convertFloatToHalf<<<gridDim_mask, blockSize>>>(device_mask_fp16, device_mask_fp32, mask_size);

    // Copy the FP16 mask from device memory into constant memory
    // (the source is a device pointer, so the copy kind is device-to-device)
    cudaMemcpyToSymbol(KERNEL_DEVICE_CST, device_mask_fp16, mask_size * sizeof(half), 0, cudaMemcpyDeviceToDevice);

    // Store FP16 pointers in the provided FP32 pointers
    *device_input_ptr = reinterpret_cast<float*>(device_input_fp16);
    *device_mask_ptr = reinterpret_cast<float*>(device_mask_fp16);
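    // The FP32 interface is reused to carry the FP16 device pointers:
    // conv_forward_gpu casts them back to half*, and the epilog frees them
    // through the same casts.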

    // Clean up temporary storage
    cudaFree(device_input_fp32);
    cudaFree(device_mask_fp32);
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    // Cast back to FP16 pointers
    const half *input_fp16 = reinterpret_cast<const half*>(device_input);
    const half *mask_fp16 = reinterpret_cast<const half*>(device_mask);

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    // Allocate unrolled matrix in FP16
    half *unrolled_matrix;
    cudaMalloc(&unrolled_matrix, (size_t)Height_unrolled * Width_unrolled * sizeof(half));
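    // Storing the unrolled matrix as half rather than float cuts this
    // buffer's footprint in half (2 bytes per element instead of 4).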

    // Set dimensions for matrix unrolling
    dim3 blockDim(4, 256, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        (Batch + blockDim.z - 1) / blockDim.z
    );

    // Perform matrix unrolling in FP16
    matrix_unrolling_kernel_fp16<<<gridDim, blockDim>>>(
        input_fp16, unrolled_matrix,
        Batch, Channel, Height, Width, K
    );

    // Matrix multiplication dimensions
    dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // Temporary storage for matmul result
    float *matmul_output;
    cudaMalloc(&matmul_output, (size_t)Map_out * Width_unrolled * sizeof(float));

    // Perform matrix multiplication with FP16 inputs
    matrixMultiplyShared_fp16<<<dimGrid, dimBlock>>>(
        mask_fp16, unrolled_matrix, matmul_output,
        Map_out, Height_unrolled, Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );
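    // matmul_output is laid out as Map_out rows by (Batch * Height_out * Width_out)
    // columns; the permute kernel below reorders it into the expected
    // [Batch][Map_out][Height_out * Width_out] output layout.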

    // Permute the result
    const int out_image_size = Height_out * Width_out;
    dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
    matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(
        matmul_output, device_output, Map_out, Batch, out_image_size
    );

    // Clean up
    cudaFree(unrolled_matrix);
    cudaFree(matmul_output);
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    // Calculate output size in bytes
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy output back to host
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    // Free device memory (cast to half* for FP16 allocations)
    cudaFree(device_output);
    cudaFree(reinterpret_cast<half*>(device_input));
    cudaFree(reinterpret_cast<half*>(device_mask));
}

// Host function: Get Device Properties
__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}