Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <wb.h>
- #define wbCheck(stmt) \
- do { \
- cudaError_t err = stmt; \
- if (err != cudaSuccess) { \
- wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err)); \
- wbLog(ERROR, "Failed to run stmt ", #stmt); \
- return -1; \
- } \
- } while (0)
- //@@ Define any useful program-wide constants here
- #define MASK_RADIUS 1
- #define MASK_WIDTH 3
- #define TILE_WIDTH 4
- //@@ Define constant memory for device kernel here
- __constant__ float deviceKernel[MASK_WIDTH][MASK_WIDTH][MASK_WIDTH];
- __global__ void conv3d(float *input, float *output, const int z_size,
- const int y_size, const int x_size) {
- //@@ Insert kernel code here
- // shared mem tile with space for halos
- __shared__ float tile[TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS];
- int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z;
- int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z;
- // shifting from output coordinates to input coordinates
- int dep_o = bz * TILE_WIDTH + tz; // depth? idk the z dimension name lol
- int row_o = by * TILE_WIDTH + ty;
- int col_o = bx * TILE_WIDTH + tx;
- int dep_i = dep_o - MASK_RADIUS;
- int row_i = row_o - MASK_RADIUS;
- int col_i = col_o - MASK_RADIUS;
- // taking care of boundaries
- if ((dep_i >= 0) && (dep_i < z_size) && (row_i >= 0) && (row_i < y_size) && (col_i >= 0) && (col_i < x_size)) {
- tile[tz][ty][tx] = input[dep_i * (y_size * x_size) + row_i * (x_size) + col_i];
- }
- else {
- tile[tz][ty][tx] = 0.0f;
- }
- __syncthreads(); // now we wait for the tile...
- float result = 0.0f;
- if ((tz < TILE_WIDTH) && (ty < TILE_WIDTH) && (tx < TILE_WIDTH)) {
- for (int i = 0; i < MASK_WIDTH; i++) {
- for (int j = 0; j < MASK_WIDTH; j++) {
- for (int k = 0; k < MASK_WIDTH; k++) {
- result += deviceKernel[i][j][k] * tile[tz+i][ty+j][tx+k];
- }
- }
- }
- if ((dep_o < z_size) && (row_o < y_size) && (col_o < x_size)) {
- output[dep_o * (x_size * y_size) + row_o * (x_size) + col_o] = result;
- }
- }
- }
- int main(int argc, char *argv[]) {
- wbArg_t args;
- int z_size;
- int y_size;
- int x_size;
- int inputLength, kernelLength;
- float *hostInput;
- float *hostKernel;
- float *hostOutput;
- float *deviceInput;
- float *deviceOutput;
- args = wbArg_read(argc, argv);
- // Import data
- hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength);
- hostKernel =
- (float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength);
- hostOutput = (float *)malloc(inputLength * sizeof(float));
- // First three elements are the input dimensions
- z_size = hostInput[0];
- y_size = hostInput[1];
- x_size = hostInput[2];
- wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size);
- assert(z_size * y_size * x_size == inputLength - 3);
- assert(kernelLength == 27);
- wbTime_start(GPU, "Doing GPU Computation (memory + compute)");
- wbTime_start(GPU, "Doing GPU memory allocation");
- //@@ Allocate GPU memory here
- cudaMalloc((void**) &deviceInput, (inputLength-3) * sizeof(float));
- cudaMalloc((void**) &deviceOutput, (inputLength-3) * sizeof(float));
- // Recall that inputLength is 3 elements longer than the input data
- // because the first three elements were the dimensions
- wbTime_stop(GPU, "Doing GPU memory allocation");
- wbTime_start(Copy, "Copying data to the GPU");
- //@@ Copy input and kernel to GPU here
- cudaMemcpy(deviceInput, &hostInput[3], (inputLength-3) * sizeof(float), cudaMemcpyHostToDevice);
- // "symbol can either be a var in global or constant memory space, or it can be a character string"
- // we need to use cpyToSymbol bc deviceKernel is const. This is good because
- cudaMemcpyToSymbol(deviceKernel, hostKernel, kernelLength * sizeof(float));
- // Recall that the first three elements of hostInput are dimensions and
- // do
- // not need to be copied to the gpu
- wbTime_stop(Copy, "Copying data to the GPU");
- wbTime_start(Compute, "Doing the computation on the GPU");
- //@@ Initialize grid and block dimensions here
- dim3 DimGrid(ceil(x_size/float(TILE_WIDTH)), ceil(y_size/float(TILE_WIDTH)), ceil(z_size/float(TILE_WIDTH)));
- dim3 DimBlock(TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2);
- //@@ Launch the GPU kernel here
- conv3d<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
- cudaDeviceSynchronize();
- wbTime_stop(Compute, "Doing the computation on the GPU");
- wbTime_start(Copy, "Copying data from the GPU");
- //@@ Copy the device memory back to the host here
- cudaMemcpy(&hostOutput[3], deviceOutput, (inputLength-3) * sizeof(float), cudaMemcpyDeviceToHost);
- // Recall that the first three elements of the output are the dimensions
- // and should not be set here (they are set below)
- wbTime_stop(Copy, "Copying data from the GPU");
- wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");
- // Set the output dimensions for correctness checking
- hostOutput[0] = z_size;
- hostOutput[1] = y_size;
- hostOutput[2] = x_size;
- wbSolution(args, hostOutput, inputLength);
- // Free device memory
- cudaFree(deviceInput);
- cudaFree(deviceOutput);
- // Free host memory
- free(hostInput);
- free(hostOutput);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement