Advertisement
Guest User

Untitled

a guest
Oct 19th, 2019
319
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.39 KB | None | 0 0
  1.  
  2. #ifndef MXNET_OPERATOR_NEW_FORWARD_CUH_
  3. #define MXNET_OPERATOR_NEW_FORWARD_CUH_
  4. #define TILE_WIDTH 16
  5.  
  6. #include <mxnet/base.h>
  7.  
  8. namespace mxnet
  9. {
  10. namespace op
  11. {
  12.  
  /*
   * Direct forward convolution: each thread computes one output element
   * y[n][m][h][w] by summing x[n][c][h+p][w+q] * k[m][c][p][q] over all
   * input channels c and all K x K filter taps (p, q).
   *
   * Indexing used by this kernel (see the y4d / x4d / k4d macros below):
   *   y : (batch, M, H_out, W_out)  with H_out = H - K + 1, W_out = W - K + 1
   *   x : (batch, C, H, W)
   *   k : (M, C, K, K)
   *
   * Launch layout this kernel expects:
   *   blockIdx.x  -> batch index n
   *   blockIdx.y  -> output feature map m
   *   blockIdx.z  -> linearized tile index (tile_row * W_grid + tile_col)
   *   threadIdx.y / threadIdx.x -> row / column inside a TILE_WIDTH x TILE_WIDTH tile
   * No shared memory is used.
   */
  __global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const int H, const int W, const int K)
  {
    const int H_out = H - K + 1;
    const int W_out = W - K + 1;

    // Nothing to compute when the filter is larger than the input; bailing out
    // here also keeps W_grid below from being zero when it is used as a divisor.
    if (H_out <= 0 || W_out <= 0) {
      return;
    }

    // Number of tiles per output row. Integer ceiling division gives the same
    // result as ceil(1.0 * W_out / TILE_WIDTH) for positive W_out without
    // pulling double-precision math into device code.
    const int W_grid = (W_out + TILE_WIDTH - 1) / TILE_WIDTH;

  // Flattened 4-D accessors. Every argument is parenthesized so that
  // expressions such as `w + q` expand safely.
  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + (i0)]
  #define x4d(i3, i2, i1, i0) x[(i3) * (C * H * W) + (i2) * (H * W) + (i1) * (W) + (i0)]
  #define k4d(i3, i2, i1, i0) k[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + (i0)]

    const int n = blockIdx.x;  // batch element
    const int m = blockIdx.y;  // output feature map
    const int h = (blockIdx.z / W_grid) * TILE_WIDTH + threadIdx.y;  // output row
    const int w = (blockIdx.z % W_grid) * TILE_WIDTH + threadIdx.x;  // output column

    // Edge tiles overhang the output, so threads outside it must not touch
    // memory. The n / m checks make the kernel safe for an oversized grid too.
    if (n < B && m < M && h < H_out && w < W_out) {
      float value = 0.0f;
      for (int c = 0; c < C; c++) {
        for (int p = 0; p < K; p++) {
          for (int q = 0; q < K; q++) {
            value += x4d(n, c, h + p, w + q) * k4d(m, c, p, q);
          }
        }
      }
      y4d(n, m, h, w) = value;
    }

  #undef y4d
  #undef x4d
  #undef k4d
  }
  65.  
  /*
   * Float specialization of the GPU forward operator. According to the note
   * that accompanied the original skeleton, this is the entry point called by
   * new-inl.h, and only the float version is expected to be used.
   *
   * Reads the problem sizes from the tensor shapes, launches forward_kernel
   * with one TILE_WIDTH x TILE_WIDTH thread block per output tile, and blocks
   * until the kernel has finished.
   *
   *   y : output, indexed (batch, M, H_out, W_out)
   *   x : input,  indexed (batch, C, H, W)
   *   w : filters; the filter size K is taken from its last dimension
   *       (the kernel indexes it as (M, C, K, K) - assumes square filters,
   *       TODO confirm against the caller)
   */
  template <>
  void forward<gpu, float>(mshadow::Tensor<gpu, 4, float> &y, const mshadow::Tensor<gpu, 4, float> &x, const mshadow::Tensor<gpu, 4, float> &w)
  {
    // Extract the tensor dimensions.
    const int B = x.shape_[0];
    const int M = y.shape_[1];
    const int C = x.shape_[1];
    const int H = x.shape_[2];
    const int W = x.shape_[3];
    const int K = w.shape_[3];
    const int H_out = H - K + 1;
    const int W_out = W - K + 1;

    // An empty batch / output would yield a zero-sized grid, which is an
    // invalid launch configuration. There is nothing to compute, so skip it.
    if (B <= 0 || M <= 0 || H_out <= 0 || W_out <= 0) {
      return;
    }

    // Tiles needed to cover the output plane (integer ceiling division).
    const int W_grid = (W_out + TILE_WIDTH - 1) / TILE_WIDTH;
    const int H_grid = (H_out + TILE_WIDTH - 1) / TILE_WIDTH;
    const int Z = H_grid * W_grid;

    // grid.x = batch, grid.y = output feature map, grid.z = linearized tile.
    const dim3 block(TILE_WIDTH, TILE_WIDTH, 1);
    const dim3 grid(B, M, Z);

    forward_kernel<<<grid, block, 0>>>(y.dptr_, x.dptr_, w.dptr_, B, M, C, H, W, K);

    // A kernel launch does not return a status; launch-configuration errors
    // (e.g. a grid dimension over the device limit) only show up here.
    MSHADOW_CUDA_CALL(cudaGetLastError());

    // Wait for completion so that asynchronous execution errors are reported
    // by this call rather than by some unrelated later one.
    MSHADOW_CUDA_CALL(cudaDeviceSynchronize());
  }
  104.  
  /*
   * Generic fallback of the forward operator for any element type that has no
   * dedicated specialization. It is a deliberate stub: the CHECK_EQ(0, 1)
   * below can never hold, so calling this overload always trips the check
   * with the message shown. Per the original note, this path is not used in
   * the ECE408 project.
   *
   * y, x, w are accepted only to match the operator signature; none of them
   * is read or written.
   */
  template <typename gpu, typename DType>
  void forward(mshadow::Tensor<gpu, 4, DType> &y,
               const mshadow::Tensor<gpu, 4, DType> &x,
               const mshadow::Tensor<gpu, 4, DType> &w)
  {
    CHECK_EQ(0, 1) << "Remove this line and replace it with your implementation.";
  }
  114. }
  115. }
  116.  
  117. #endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement