#include #include #include "gpu-new-forward.h" #define TILE_WIDTH 16 #define BLOCK_SIZE 512 __global__ void matrix_unrolling_kernel(const float *input, float *output, const int Batch, const int Channel, const int Height, const int Width, const int K) { #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0] #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0] // Calculate output dimensions const size_t Height_out = Height - K + 1; const size_t Width_out = Width - K + 1; const size_t W_unroll = Height_out * Width_out; const size_t H_unroll = Channel * K * K; const size_t W_total_unroll = Batch * W_unroll; // Calculate thread indices const size_t c = blockIdx.x * blockDim.x + threadIdx.x; // Channel/map index const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y; // Combined height-width position const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;// Batch index // Extract height and width positions const size_t h_out = hw_pos / Width_out; // Height position const size_t w_out = hw_pos % Width_out; // Width position // Boundary check if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) { return; } // Calculate position in unrolled matrix const size_t w_unroll = h_out * Width_out + w_out; const size_t w_total_unroll = batch_idx * W_unroll + w_unroll; const size_t w_base = c * K * K; // Perform unrolling for (int p = 0; p < K; p++) { for (int q = 0; q < K; q++) { int h_unroll = w_base + p * K + q; out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q); } } #undef in_4d #undef out_3d } // Tiled matrix multiplication kernel. Computes C = AB // You don't need to modify this kernel. __global__ void matrixMultiplyShared(const float *A, const float *B, float *C, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns) { __shared__ float tileA[TILE_WIDTH][TILE_WIDTH]; __shared__ float tileB[TILE_WIDTH][TILE_WIDTH]; int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x; int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx; float val = 0; for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) { if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) { tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx]; } else { tileA[ty][tx] = 0; } if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) { tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col]; } else { tileB[ty][tx] = 0; } __syncthreads(); if (row < numCRows && col < numCColumns) { for (int i = 0; i < TILE_WIDTH; i++) { val += tileA[ty][i] * tileB[i][tx]; } } __syncthreads(); } if (row < numCRows && col < numCColumns) { C[row * numCColumns + col] = val; } } // Permutes the matmul result. // The output feature map after matmul is of shape Map_out x Batch x Height_out x Width_out, // and we need to permute it into Batch x Map_out x Height_out x Width_out. // You don't need to modify this kernel. __global__ void matrix_permute_kernel(const float *input, float *output, int Map_out, int Batch, int image_size) { int b = blockIdx.y; int x = blockIdx.x * BLOCK_SIZE + threadIdx.x; if (x < image_size) { for (int m = 0; m < Map_out; m++) { output[b * Map_out * image_size + m * image_size + x] = input[m * Batch * image_size + b * image_size + x]; } } } __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) { // TODO: Allocate memory and copy over the relevant data structures to the GPU // We pass double pointers for you to initialize the relevant device pointers, // which are passed to the other two functions. // Useful snippet for error checking // cudaError_t error = cudaGetLastError(); // if(error != cudaSuccess) // { // std::cout<<"CUDA error: "<>>(device_input, unrolled_matrix, Batch, Channel, Height, Width, K); // TODO: Set the kernel dimensions and call the matmul kernel dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1); dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1); matrixMultiplyShared<<>>(device_mask, unrolled_matrix, matmul_output, Map_out, Height_unrolled, Height_unrolled, Width_unrolled, Map_out, Width_unrolled); // Permute the result of matrix multiplication const int out_image_size = Height_out * Width_out; dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1); matrix_permute_kernel<<>>(matmul_output, device_output, Map_out, Batch, out_image_size); cudaFree(matmul_output); cudaFree(unrolled_matrix); } __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) { // Calculate output size const int Height_out = Height - K + 1; const int Width_out = Width - K + 1; const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float); // TODO: Copy the output back to host cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost); // TODO: Free device memory cudaFree(device_output); cudaFree(device_input); cudaFree(device_mask); } __host__ void GPUInterface::get_device_properties() { int deviceCount; cudaGetDeviceCount(&deviceCount); for(int dev = 0; dev < deviceCount; dev++) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); std::cout<<"Device "<