Advertisement
edft56

Untitled

Feb 24th, 2021 (edited)
813
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.15 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <iostream>
  4. #include <assert.h>
  5.  
  6. #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
  7. inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
  8. {
  9.    if (code != cudaSuccess)
  10.    {
  11.       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  12.       if (abort) exit(code);
  13.    }
  14. }
  15.  
  16.  
  17. template<const int CHANNELS, const int HW, const int OUT_2, const int TILES_SIZE, const int BATCH_SIZE>
  18. __global__ void BtdB(const float* __restrict__ input, float* __restrict__ output){
  19.     int in_read_idx = blockIdx.z*CHANNELS*HW*HW + blockIdx.y*CHANNELS*HW*4 + blockIdx.x*CHANNELS*4*blockDim.y +                                 threadIdx.y*CHANNELS*4 + threadIdx.x;
  20.     int out_idx = blockIdx.z*CHANNELS*OUT_2*OUT_2 + blockIdx.y*CHANNELS*OUT_2 + blockIdx.x*blockDim.y*CHANNELS +                                threadIdx.y*CHANNELS + threadIdx.x;
  21.  
  22.  
  23.     float tile_reg[36];
  24.  
  25.     #pragma unroll
  26.     for(int i=0; i<6; i++){
  27.         int base = in_read_idx + i*CHANNELS;
  28.        
  29.         tile_reg[i*6] = 4*input[base] - 5*input[base + 2*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
  30.        
  31.         tile_reg[i*6 + 1] = -4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] + input[base + 3*CHANNELS*HW] +                              input[base + 4*CHANNELS*HW];
  32.        
  33.         tile_reg[i*6 + 2] = 4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] - input[base + 3*CHANNELS*HW] +                               input[base + 4*CHANNELS*HW];
  34.        
  35.         tile_reg[i*6 + 3] = -2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] + 2*input[base + 3*CHANNELS*HW] +                              input[base + 4*CHANNELS*HW];
  36.        
  37.         tile_reg[i*6 + 4] = 2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] - 2*input[base + 3*CHANNELS*HW] +                               input[base + 4*CHANNELS*HW];
  38.        
  39.         tile_reg[i*6 + 5] = 4*input[base + 1*CHANNELS*HW] - 5*input[base + 3*CHANNELS*HW] + input[base + 5*CHANNELS*HW];
  40.     }
  41.  
  42.     #pragma unroll
  43.     for(int i=0; i<6; i++){
  44.         int base = i*TILES_SIZE*CHANNELS*BATCH_SIZE*6;
  45.        
  46.         output[out_idx + base] = 4*tile_reg[i] - 5*tile_reg[i + 12] + tile_reg[i + 24];
  47.        
  48.         output[out_idx + base + TILES_SIZE*CHANNELS*BATCH_SIZE] = -4*tile_reg[i + 6] - 4*tile_reg[i + 12] +
  49.                                                                     tile_reg[i + 18] + tile_reg[i + 24];
  50.                                                                    
  51.         output[out_idx + base + 2*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 4*tile_reg[i + 12] -
  52.                                                                     tile_reg[i + 18] + tile_reg[i + 24];
  53.                                                                    
  54.         output[out_idx + base + 3*TILES_SIZE*CHANNELS*BATCH_SIZE] = -2*tile_reg[i + 6] - tile_reg[i + 12] +
  55.                                                                     2*tile_reg[i + 18] + tile_reg[i + 24];
  56.                                                                    
  57.         output[out_idx + base + 4*TILES_SIZE*CHANNELS*BATCH_SIZE] = 2*tile_reg[i + 6] - tile_reg[i + 12] -
  58.                                                                     2*tile_reg[i + 18] + tile_reg[i + 24];
  59.                                                                    
  60.         output[out_idx + base + 5*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 5*tile_reg[i + 18] +
  61.                                                                     tile_reg[i + 30];
  62.     }
  63. }
  64.  
  65. void test2(){
  66.     const int IN_SIZE = 226*226*64*32;
  67.     const int OUT_SIZE = 36*56*56*64*32;
  68.  
  69.     float *output;
  70.     {cudaError_t status = cudaMallocHost((void**) &output, OUT_SIZE*sizeof(float));
  71.     if (status != cudaSuccess) printf("Error allocating pinned host memory\n");}
  72.  
  73.  
  74.     float* memory_pool;
  75.  
  76.     size_t free_mem;
  77.     size_t total;
  78.     gpuErrchk(cudaMemGetInfo(&free_mem,&total));
  79.     std::cout<<free_mem<<" "<<total<<"\n";
  80.  
  81.     size_t mem_to_alloc = 0.95f*free_mem;
  82.  
  83.     gpuErrchk( cudaMalloc((void **) &memory_pool, mem_to_alloc) );
  84.     gpuErrchk( cudaDeviceSynchronize() );
  85.  
  86.     gpuErrchk(cudaMemGetInfo(&free_mem,&total));
  87.     std::cout<<free_mem<<" "<<total<<"\n";
  88.  
  89.  
  90.     float* ptr1;
  91.     float* ptr2;
  92.     float* ptr3;
  93.     float* ptr4;
  94.  
  95.  
  96.     ptr1 = memory_pool;
  97.     ptr2 = ptr1 + IN_SIZE;
  98.  
  99.     ptr3 = (memory_pool + mem_to_alloc/4) - (IN_SIZE + OUT_SIZE) - 120;
  100.  
  101.     ptr4 = ptr3 + IN_SIZE;
  102.  
  103.     dim3 threads(64,1);
  104.     dim3 blocks(56,56,32);
  105.  
  106.     BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr1,ptr2); //fast
  107.    
  108.     gpuErrchk( cudaMemcpy(output, ptr2, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );
  109.  
  110.     BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr3,ptr4); //slow
  111.  
  112.     gpuErrchk( cudaMemcpy(output, ptr4, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );
  113.  
  114.  
  115.     cudaPointerAttributes ptratr;
  116.  
  117.     gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr3));
  118.     std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
  119.  
  120.     gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4));
  121.     std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
  122.  
  123.     gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4 + OUT_SIZE));
  124.     std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
  125.  
  126. }
  127.  
  128. int main(){
  129.     test2();
  130.     return 0;
  131. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement