Advertisement
Guest User

peepee

a guest
Feb 26th, 2020
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.66 KB | None | 0 0
  1. #include <wb.h>
  2.  
  3. #define wbCheck(stmt) \
  4. do { \
  5. cudaError_t err = stmt; \
  6. if (err != cudaSuccess) { \
  7. wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err)); \
  8. wbLog(ERROR, "Failed to run stmt ", #stmt); \
  9. return -1; \
  10. } \
  11. } while (0)
  12.  
  13. //@@ Define any useful program-wide constants here
  14. #define MASK_RADIUS 1
  15. #define MASK_WIDTH 3
  16. #define TILE_WIDTH 4
  17.  
  18. //@@ Define constant memory for device kernel here
  19. __constant__ float deviceKernel[MASK_WIDTH][MASK_WIDTH][MASK_WIDTH];
  20.  
  21. __global__ void conv3d(float *input, float *output, const int z_size,
  22. const int y_size, const int x_size) {
  23. //@@ Insert kernel code here
  24. // shared mem tile with space for halos
  25.  
  26.  
  27. __shared__ float tile[TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS][TILE_WIDTH + 2*MASK_RADIUS];
  28.  
  29. int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z;
  30. int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z;
  31.  
  32. // shifting from output coordinates to input coordinates
  33. int dep_o = bz * TILE_WIDTH + tz; // depth? idk the z dimension name lol
  34. int row_o = by * TILE_WIDTH + ty;
  35. int col_o = bx * TILE_WIDTH + tx;
  36.  
  37. int dep_i = dep_o - MASK_RADIUS;
  38. int row_i = row_o - MASK_RADIUS;
  39. int col_i = col_o - MASK_RADIUS;
  40.  
  41. // taking care of boundaries
  42. if ((dep_i >= 0) && (dep_i < z_size) && (row_i >= 0) && (row_i < y_size) && (col_i >= 0) && (col_i < x_size)) {
  43. tile[tz][ty][tx] = input[dep_i * (y_size * x_size) + row_i * (x_size) + col_i];
  44. }
  45. else {
  46. tile[tz][ty][tx] = 0.0f;
  47. }
  48.  
  49. __syncthreads(); // now we wait for the tile...
  50.  
  51. float result = 0.0f;
  52. if ((tz < TILE_WIDTH) && (ty < TILE_WIDTH) && (tx < TILE_WIDTH)) {
  53. for (int i = 0; i < MASK_WIDTH; i++) {
  54. for (int j = 0; j < MASK_WIDTH; j++) {
  55. for (int k = 0; k < MASK_WIDTH; k++) {
  56. result += deviceKernel[i][j][k] * tile[tz+i][ty+j][tx+k];
  57. }
  58. }
  59. }
  60.  
  61. if ((dep_o < z_size) && (row_o < y_size) && (col_o < x_size)) {
  62. output[dep_o * (x_size * y_size) + row_o * (x_size) + col_o] = result;
  63. }
  64. }
  65. }
  66.  
  67. int main(int argc, char *argv[]) {
  68. wbArg_t args;
  69. int z_size;
  70. int y_size;
  71. int x_size;
  72. int inputLength, kernelLength;
  73. float *hostInput;
  74. float *hostKernel;
  75. float *hostOutput;
  76. float *deviceInput;
  77. float *deviceOutput;
  78.  
  79. args = wbArg_read(argc, argv);
  80.  
  81. // Import data
  82. hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength);
  83. hostKernel =
  84. (float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength);
  85. hostOutput = (float *)malloc(inputLength * sizeof(float));
  86.  
  87. // First three elements are the input dimensions
  88. z_size = hostInput[0];
  89. y_size = hostInput[1];
  90. x_size = hostInput[2];
  91. wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size);
  92. assert(z_size * y_size * x_size == inputLength - 3);
  93. assert(kernelLength == 27);
  94.  
  95. wbTime_start(GPU, "Doing GPU Computation (memory + compute)");
  96.  
  97. wbTime_start(GPU, "Doing GPU memory allocation");
  98. //@@ Allocate GPU memory here
  99. cudaMalloc((void**) &deviceInput, (inputLength-3) * sizeof(float));
  100. cudaMalloc((void**) &deviceOutput, (inputLength-3) * sizeof(float));
  101. // Recall that inputLength is 3 elements longer than the input data
  102. // because the first three elements were the dimensions
  103. wbTime_stop(GPU, "Doing GPU memory allocation");
  104.  
  105. wbTime_start(Copy, "Copying data to the GPU");
  106. //@@ Copy input and kernel to GPU here
  107. cudaMemcpy(deviceInput, &hostInput[3], (inputLength-3) * sizeof(float), cudaMemcpyHostToDevice);
  108. // "symbol can either be a var in global or constant memory space, or it can be a character string"
  109. // we need to use cpyToSymbol bc deviceKernel is const. This is good because
  110. cudaMemcpyToSymbol(deviceKernel, hostKernel, kernelLength * sizeof(float));
  111. // Recall that the first three elements of hostInput are dimensions and
  112. // do
  113. // not need to be copied to the gpu
  114. wbTime_stop(Copy, "Copying data to the GPU");
  115.  
  116. wbTime_start(Compute, "Doing the computation on the GPU");
  117. //@@ Initialize grid and block dimensions here
  118. dim3 DimGrid(ceil(x_size/float(TILE_WIDTH)), ceil(y_size/float(TILE_WIDTH)), ceil(z_size/float(TILE_WIDTH)));
  119. dim3 DimBlock(TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2, TILE_WIDTH+MASK_RADIUS*2);
  120. //@@ Launch the GPU kernel here
  121. conv3d<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, z_size, y_size, x_size);
  122. cudaDeviceSynchronize();
  123. wbTime_stop(Compute, "Doing the computation on the GPU");
  124.  
  125. wbTime_start(Copy, "Copying data from the GPU");
  126. //@@ Copy the device memory back to the host here
  127. cudaMemcpy(&hostOutput[3], deviceOutput, (inputLength-3) * sizeof(float), cudaMemcpyDeviceToHost);
  128. // Recall that the first three elements of the output are the dimensions
  129. // and should not be set here (they are set below)
  130. wbTime_stop(Copy, "Copying data from the GPU");
  131.  
  132. wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");
  133.  
  134. // Set the output dimensions for correctness checking
  135. hostOutput[0] = z_size;
  136. hostOutput[1] = y_size;
  137. hostOutput[2] = x_size;
  138. wbSolution(args, hostOutput, inputLength);
  139.  
  140. // Free device memory
  141. cudaFree(deviceInput);
  142. cudaFree(deviceOutput);
  143.  
  144. // Free host memory
  145. free(hostInput);
  146. free(hostOutput);
  147. return 0;
  148. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement