Untitled

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


template<const int CHANNELS, const int HW, const int OUT_2, const int TILES_SIZE, const int BATCH_SIZE>
__global__ void BtdB(const float* __restrict__ input, float* __restrict__ output){
    int in_read_idx = blockIdx.z*CHANNELS*HW*HW + blockIdx.y*CHANNELS*HW*4 + blockIdx.x*CHANNELS*4*blockDim.y +                                 threadIdx.y*CHANNELS*4 + threadIdx.x;
    int out_idx = blockIdx.z*CHANNELS*OUT_2*OUT_2 + blockIdx.y*CHANNELS*OUT_2 + blockIdx.x*blockDim.y*CHANNELS +                                threadIdx.y*CHANNELS + threadIdx.x;


    float tile_reg[36];

    #pragma unroll
    for(int i=0; i<6; i++){
        int base = in_read_idx + i*CHANNELS;

        tile_reg[i*6] = 4*input[base] - 5*input[base + 2*CHANNELS*HW] + input[base + 4*CHANNELS*HW];

        tile_reg[i*6 + 1] = -4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] + input[base + 3*CHANNELS*HW] +                              input[base + 4*CHANNELS*HW];

        tile_reg[i*6 + 2] = 4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] - input[base + 3*CHANNELS*HW] +                               input[base + 4*CHANNELS*HW];

        tile_reg[i*6 + 3] = -2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] + 2*input[base + 3*CHANNELS*HW] +                              input[base + 4*CHANNELS*HW];

        tile_reg[i*6 + 4] = 2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] - 2*input[base + 3*CHANNELS*HW] +                               input[base + 4*CHANNELS*HW];

        tile_reg[i*6 + 5] = 4*input[base + 1*CHANNELS*HW] - 5*input[base + 3*CHANNELS*HW] + input[base + 5*CHANNELS*HW];
    }

    #pragma unroll
    for(int i=0; i<6; i++){
        int base = i*TILES_SIZE*CHANNELS*BATCH_SIZE*6;

        output[out_idx + base] = 4*tile_reg[i] - 5*tile_reg[i + 12] + tile_reg[i + 24];

        output[out_idx + base + TILES_SIZE*CHANNELS*BATCH_SIZE] = -4*tile_reg[i + 6] - 4*tile_reg[i + 12] +
                                                                    tile_reg[i + 18] + tile_reg[i + 24];

        output[out_idx + base + 2*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 4*tile_reg[i + 12] -
                                                                    tile_reg[i + 18] + tile_reg[i + 24];

        output[out_idx + base + 3*TILES_SIZE*CHANNELS*BATCH_SIZE] = -2*tile_reg[i + 6] - tile_reg[i + 12] +
                                                                    2*tile_reg[i + 18] + tile_reg[i + 24];

        output[out_idx + base + 4*TILES_SIZE*CHANNELS*BATCH_SIZE] = 2*tile_reg[i + 6] - tile_reg[i + 12] -
                                                                    2*tile_reg[i + 18] + tile_reg[i + 24];

        output[out_idx + base + 5*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 5*tile_reg[i + 18] +
                                                                    tile_reg[i + 30];
    }
}

void test2(){
    const int IN_SIZE = 226*226*64*32;
    const int OUT_SIZE = 36*56*56*64*32;

    float *output;
    {cudaError_t status = cudaMallocHost((void**) &output, OUT_SIZE*sizeof(float));
    if (status != cudaSuccess) printf("Error allocating pinned host memory\n");}


    float* memory_pool;

    size_t free_mem;
    size_t total;
    gpuErrchk(cudaMemGetInfo(&free_mem,&total));
    std::cout<<free_mem<<" "<<total<<"\n";

    size_t mem_to_alloc = 0.95f*free_mem;

    gpuErrchk( cudaMalloc((void **) &memory_pool, mem_to_alloc) );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk(cudaMemGetInfo(&free_mem,&total));
    std::cout<<free_mem<<" "<<total<<"\n";


    float* ptr1;
    float* ptr2;
    float* ptr3;
    float* ptr4;


    ptr1 = memory_pool;
    ptr2 = ptr1 + IN_SIZE;

    ptr3 = (memory_pool + mem_to_alloc/4) - (IN_SIZE + OUT_SIZE) - 120;

    ptr4 = ptr3 + IN_SIZE;

    dim3 threads(64,1);
    dim3 blocks(56,56,32);

    BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr1,ptr2); //fast

    gpuErrchk( cudaMemcpy(output, ptr2, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );

    BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr3,ptr4); //slow

    gpuErrchk( cudaMemcpy(output, ptr4, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );


    cudaPointerAttributes ptratr;

    gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr3));
    std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";

    gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4));
    std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";

    gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4 + OUT_SIZE));
    std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";

}

int main(){
    test2();
    return 0;
}