peepee

#include <wb.h>

#define wbCheck(stmt)                                                     \
  do {                                                                    \
    cudaError_t err = stmt;                                               \
    if (err != cudaSuccess) {                                             \
      wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err));              \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
      return -1;                                                          \
    }                                                                     \
  } while (0)

//@@ Define any useful program-wide constants here
#define MASK_RADIUS 1
#define MASK_WIDTH 3
#define TILE_WIDTH 4

//@@ Define constant memory for device kernel here
__constant__ float deviceKernel[MASK_WIDTH][MASK_WIDTH][MASK_WIDTH];

__global__ void conv3d(float *input, float *output, const int z_size,
                       const int y_size, const int x_size) {
  //@@ Insert kernel code here
  // shared mem tile with space for halos


  __shared__ float tile[TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS];

  int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z;
  int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z;

  // shifting from output coordinates to input coordinates
  int dep_o = bz * TILE_WIDTH + tz; // depth? idk the z dimension name lol
  int row_o = by * TILE_WIDTH + ty;
  int col_o = bx * TILE_WIDTH + tx;

  int dep_i = dep_o - MASK_RADIUS;
  int row_i = row_o - MASK_RADIUS;
  int col_i = col_o - MASK_RADIUS;

  // taking care of boundaries
  if ((dep_i >= 0) && (dep_i < z_size) && (row_i >= 0) && (row_i < y_size) && (col_i >= 0) && (col_i < x_size)) {
    tile[tz][ty][tx] = input[dep_i * (y_size * x_size) + row_i * (x_size) + col_i];
  }
  else {
    tile[tz][ty][tx] = 0.0f;
  }

  __syncthreads(); // now we wait for the tile...

  float result = 0.0f;
  if ((tz < TILE_WIDTH) && (ty < TILE_WIDTH) && (tx < TILE_WIDTH)) {
    for (int i = 0; i < MASK_WIDTH; i++) {
      for (int j = 0; j < MASK_WIDTH; j++) {
        for (int k = 0; k < MASK_WIDTH; k++) {
          result += deviceKernel[i][j][k] * tile[tz+i][ty+j][tx+k];
        }
      }
    }

    if ((dep_o < z_size) && (row_o < y_size) && (col_o < x_size)) {
      output[dep_o * (x_size * y_size) + row_o * (x_size) + col_o] = result;
    }
  }
}

int main(int argc, char *argv[]) {
  wbArg_t args;
  int z_size;
  int y_size;
  int x_size;
  int inputLength, kernelLength;
  float *hostInput;
  float *hostKernel;
  float *hostOutput;
  float *deviceInput;
  float *deviceOutput;

  args = wbArg_read(argc, argv);

  // Import data
  hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength);
  hostKernel =
      (float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength);
  hostOutput = (float *)malloc(inputLength * sizeof(float));

  // First three elements are the input dimensions
  z_size = hostInput[0];
  y_size = hostInput[1];
  x_size = hostInput[2];
  wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size);
  assert(z_size * y_size * x_size == inputLength - 3);
  assert(kernelLength == 27);

  wbTime_start(GPU, "Doing GPU Computation (memory + compute)");

  wbTime_start(GPU, "Doing GPU memory allocation");
  //@@ Allocate GPU memory here
  cudaMalloc((void**) &deviceInput, (inputLength-3) * sizeof(float));
  cudaMalloc((void**) &deviceOutput, (inputLength-3) * sizeof(float));
  // Recall that inputLength is 3 elements longer than the input data
  // because the first  three elements were the dimensions
  wbTime_stop(GPU, "Doing GPU memory allocation");

  wbTime_start(Copy, "Copying data to the GPU");
  //@@ Copy input and kernel to GPU here
  cudaMemcpy(deviceInput, &hostInput[3], (inputLength-3) * sizeof(float), cudaMemcpyHostToDevice);
  // "symbol can either be a var in global or constant memory space, or it can be a character string"
  // we need to use cpyToSymbol bc deviceKernel is const. This is good because
  cudaMemcpyToSymbol(deviceKernel, hostKernel, kernelLength * sizeof(float));
  // Recall that the first three elements of hostInput are dimensions and
  // do
  // not need to be copied to the gpu
  wbTime_stop(Copy, "Copying data to the GPU");

  wbTime_start(Compute, "Doing the computation on the GPU");
  //@@ Initialize grid and block dimensions here
  dim3 DimGrid(ceil(x_size/float(TILE_WIDTH)), ceil(y_size/float(TILE_WIDTH)), ceil(z_size/float(TILE_WIDTH)));
  dim3 DimBlock(TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2);
  //@@ Launch the GPU kernel here
  conv3d<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Doing the computation on the GPU");

  wbTime_start(Copy, "Copying data from the GPU");
  //@@ Copy the device memory back to the host here
  cudaMemcpy(&hostOutput[3], deviceOutput, (inputLength-3) * sizeof(float), cudaMemcpyDeviceToHost);
  // Recall that the first three elements of the output are the dimensions
  // and should not be set here (they are set below)
  wbTime_stop(Copy, "Copying data from the GPU");

  wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");

  // Set the output dimensions for correctness checking
  hostOutput[0] = z_size;
  hostOutput[1] = y_size;
  hostOutput[2] = x_size;
  wbSolution(args, hostOutput, inputLength);

  // Free device memory
  cudaFree(deviceInput);
  cudaFree(deviceOutput);

  // Free host memory
  free(hostInput);
  free(hostOutput);
  return 0;
}