Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <iostream>
- #include <assert.h>
- #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
- inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
- {
- if (code != cudaSuccess)
- {
- fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
- if (abort) exit(code);
- }
- }
- template<const int CHANNELS, const int HW, const int OUT_2, const int TILES_SIZE, const int BATCH_SIZE>
- __global__ void BtdB(const float* __restrict__ input, float* __restrict__ output){
- int in_read_idx = blockIdx.z*CHANNELS*HW*HW + blockIdx.y*CHANNELS*HW*4 + blockIdx.x*CHANNELS*4*blockDim.y + threadIdx.y*CHANNELS*4 + threadIdx.x;
- int out_idx = blockIdx.z*CHANNELS*OUT_2*OUT_2 + blockIdx.y*CHANNELS*OUT_2 + blockIdx.x*blockDim.y*CHANNELS + threadIdx.y*CHANNELS + threadIdx.x;
- float tile_reg[36];
- #pragma unroll
- for(int i=0; i<6; i++){
- int base = in_read_idx + i*CHANNELS;
- tile_reg[i*6] = 4*input[base] - 5*input[base + 2*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
- tile_reg[i*6 + 1] = -4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] + input[base + 3*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
- tile_reg[i*6 + 2] = 4*input[base + 1*CHANNELS*HW] - 4*input[base + 2*CHANNELS*HW] - input[base + 3*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
- tile_reg[i*6 + 3] = -2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] + 2*input[base + 3*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
- tile_reg[i*6 + 4] = 2*input[base + 1*CHANNELS*HW] - input[base + 2*CHANNELS*HW] - 2*input[base + 3*CHANNELS*HW] + input[base + 4*CHANNELS*HW];
- tile_reg[i*6 + 5] = 4*input[base + 1*CHANNELS*HW] - 5*input[base + 3*CHANNELS*HW] + input[base + 5*CHANNELS*HW];
- }
- #pragma unroll
- for(int i=0; i<6; i++){
- int base = i*TILES_SIZE*CHANNELS*BATCH_SIZE*6;
- output[out_idx + base] = 4*tile_reg[i] - 5*tile_reg[i + 12] + tile_reg[i + 24];
- output[out_idx + base + TILES_SIZE*CHANNELS*BATCH_SIZE] = -4*tile_reg[i + 6] - 4*tile_reg[i + 12] +
- tile_reg[i + 18] + tile_reg[i + 24];
- output[out_idx + base + 2*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 4*tile_reg[i + 12] -
- tile_reg[i + 18] + tile_reg[i + 24];
- output[out_idx + base + 3*TILES_SIZE*CHANNELS*BATCH_SIZE] = -2*tile_reg[i + 6] - tile_reg[i + 12] +
- 2*tile_reg[i + 18] + tile_reg[i + 24];
- output[out_idx + base + 4*TILES_SIZE*CHANNELS*BATCH_SIZE] = 2*tile_reg[i + 6] - tile_reg[i + 12] -
- 2*tile_reg[i + 18] + tile_reg[i + 24];
- output[out_idx + base + 5*TILES_SIZE*CHANNELS*BATCH_SIZE] = 4*tile_reg[i + 6] - 5*tile_reg[i + 18] +
- tile_reg[i + 30];
- }
- }
- void test2(){
- const int IN_SIZE = 226*226*64*32;
- const int OUT_SIZE = 36*56*56*64*32;
- float *output;
- {cudaError_t status = cudaMallocHost((void**) &output, OUT_SIZE*sizeof(float));
- if (status != cudaSuccess) printf("Error allocating pinned host memory\n");}
- float* memory_pool;
- size_t free_mem;
- size_t total;
- gpuErrchk(cudaMemGetInfo(&free_mem,&total));
- std::cout<<free_mem<<" "<<total<<"\n";
- size_t mem_to_alloc = 0.95f*free_mem;
- gpuErrchk( cudaMalloc((void **) &memory_pool, mem_to_alloc) );
- gpuErrchk( cudaDeviceSynchronize() );
- gpuErrchk(cudaMemGetInfo(&free_mem,&total));
- std::cout<<free_mem<<" "<<total<<"\n";
- float* ptr1;
- float* ptr2;
- float* ptr3;
- float* ptr4;
- ptr1 = memory_pool;
- ptr2 = ptr1 + IN_SIZE;
- ptr3 = (memory_pool + mem_to_alloc/4) - (IN_SIZE + OUT_SIZE) - 120;
- ptr4 = ptr3 + IN_SIZE;
- dim3 threads(64,1);
- dim3 blocks(56,56,32);
- BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr1,ptr2); //fast
- gpuErrchk( cudaMemcpy(output, ptr2, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );
- BtdB<64,226,56,3136,32><<<blocks, threads>>>(ptr3,ptr4); //slow
- gpuErrchk( cudaMemcpy(output, ptr4, OUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );
- cudaPointerAttributes ptratr;
- gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr3));
- std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
- gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4));
- std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
- gpuErrchk( cudaPointerGetAttributes(&ptratr,ptr4 + OUT_SIZE));
- std::cout<<ptratr.type<<" "<< ptratr.device <<" "<< ptratr.devicePointer <<" "<< ptratr.hostPointer << "\n";
- }
- int main(){
- test2();
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement