Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2018
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.21 KB | None | 0 0
  1.  
  2. #include <cstdint>
  3. #include <cassert>
  4. #include <cmath>
  5.  
  6. #include "../gradient_t.h"
  7. #include "../layer_t.h"
  8. #include "../optimization_method.h"
  9. #include "../conv_layer_t.h"
  10. #include "cuda_conv_layer_t.hpp"
  11.  
  12. #include <iostream>
  13.  
  14.  
  15. /////////////////////////// ACTIVATE ////////////////
  16.  
  /**
   * Convolution forward-pass kernel (work in progress).
   *
   * In its current state the kernel only marks every valid output cell with
   * 1.0f; the multiply-accumulate that would use the loaded pixel and filter
   * values is still disabled (see the commented-out atomicAdd at the end).
   *
   * Launch layout, as the indexing below uses it:
   *   - gridDim.x / gridDim.y   : one block per input pixel (x, y).
   *   - blockDim.x / blockDim.y : one thread per filter tap.
   *   - threadIdx.z             : selects the output slice and the filter.
   *   - blockIdx.z is not used for any index, and `stride` is not used at all.
   *
   * @param input       device buffer, read at y * gridDim.x + x.
   * @param output      device buffer holding
   *                    (gridDim.x - (filter_size-1)) * (gridDim.y - (filter_size-1))
   *                    floats per threadIdx.z slice.
   * @param filters     device buffer, read at
   *                    threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x.
   * @param filter_size edge length of the (square) filter.
   * @param stride      currently ignored.
   */
  __global__ void activateCUDAkern(float *input,
                                   float *output,
                                   float *filters, int const filter_size,
                                   int const stride)
  {
      int const radius = filter_size / 2;

      // Blocks centred on a border pixel have no complete neighbourhood and do
      // nothing. The y test must use gridDim.y: comparing against gridDim.x
      // breaks non-square inputs (rows skipped, or out-of-bounds accesses).
      // The condition depends only on blockIdx, so a whole block leaves
      // together and the __syncthreads() below is never reached by a partial block.
      if (blockIdx.x < radius || blockIdx.x >= gridDim.x - radius ||
          blockIdx.y < radius || blockIdx.y >= gridDim.y - radius)
      {
          return;
      }

      int output_position_x = blockIdx.x - radius;
      int output_position_y = blockIdx.y - radius;
      int output_position = threadIdx.z * ((gridDim.x - (filter_size - 1)) * (gridDim.y - (filter_size - 1))) +
                            output_position_y * (gridDim.x - (filter_size - 1)) +
                            output_position_x;

      // Reset the accumulator cell before any thread of this block adds to it.
      output[output_position] = 0;

      __syncthreads();

      // Input pixel covered by this thread's filter tap.
      int input_position_x = blockIdx.x - radius + threadIdx.x;
      int input_position_y = blockIdx.y - radius + threadIdx.y;
      int input_position = input_position_y * gridDim.x +
                           input_position_x;

      int filter_position = threadIdx.z * (blockDim.x * blockDim.y) +
                            threadIdx.y * blockDim.x +
                            threadIdx.x;

      float pixel_value = input[input_position];
      float filter_value = filters[filter_position];

      // The loaded values are only consumed by the disabled accumulation below;
      // reference them so the compiler does not warn about unused variables.
      (void)pixel_value;
      (void)filter_value;

      // Debug placeholder: flag the cell as visited instead of accumulating.
      output[output_position] = 1;
      //atomicAdd(&output[output_position], (pixel_value * filter_value));
  }
  51.  
  /**
   * Host wrapper that runs activateCUDAkern for a convolution layer.
   *
   * Flattens the filter bank, uploads input and filters, launches one block per
   * input pixel with filter_size x filter_size threads, copies the device
   * result into out.data, prints it to std::cout and returns a copy of `out`.
   *
   * If a CUDA call fails, the failure is reported on std::cerr, all device
   * memory is released and `out` is returned unmodified. Nothing is launched
   * when the input, the output or the filter bank is empty.
   *
   * @param in          input tensor; in.size.{x,y,z} define the launch grid.
   * @param out         output tensor; out.data is overwritten on success.
   * @param filters     filter bank; each filter is read through get(i, j, z).
   * @param filter_size edge length of the (square) filters.
   * @param stride      forwarded to the kernel unchanged.
   * @return copy of `out`.
   */
  tensor_t<float> activateCUDA(tensor_t<float> &in, tensor_t<float> &out,
                               std::vector<tensor_t<float>> &filters, int filter_size,
                               int stride) {

      // Reports a failed CUDA call and turns its status into a bool.
      auto cuda_ok = [](cudaError_t status, const char *what) -> bool {
          if (status == cudaSuccess)
              return true;
          std::cerr << "activateCUDA: " << what << " failed: "
                    << cudaGetErrorString(status) << std::endl;
          return false;
      };

      // Flatten the filter bank in (filter, i, j, z) order.
      std::vector<float> filters_elements;
      for (size_t a = 0; a < filters.size(); a++)
          for (int i = 0; i < filter_size; i++)
              for (int j = 0; j < filter_size; j++)
                  for (int z = 0; z < in.size.z; z++) {
                      filters_elements.push_back(filters[a].get(i, j, z));
                  }

      const size_t in_count = static_cast<size_t>(in.size.x) * in.size.y * in.size.z;
      const size_t out_count = static_cast<size_t>(out.size.x) * out.size.y * out.size.z;
      // Upload exactly what was flattened. The previous copy size dropped the
      // in.size.z factor and left part of the device filter buffer uninitialised.
      const size_t filter_count = filters_elements.size();

      // A zero-sized grid or block is an invalid launch configuration.
      if (in_count == 0 || out_count == 0 || filter_count == 0)
          return out;

      float *d_input = nullptr;
      float *d_output = nullptr;
      float *d_filters = nullptr;
      std::vector<float> h_output(out_count, 0.0f);

      bool ok =
          cuda_ok(cudaMalloc((void **)&d_input, in_count * sizeof(float)), "cudaMalloc(d_input)") &&
          cuda_ok(cudaMalloc((void **)&d_output, out_count * sizeof(float)), "cudaMalloc(d_output)") &&
          cuda_ok(cudaMalloc((void **)&d_filters, filter_count * sizeof(float)), "cudaMalloc(d_filters)") &&
          // All-zero bytes are 0.0f, so cells the kernel never writes read back as 0.
          cuda_ok(cudaMemset(d_output, 0, out_count * sizeof(float)), "cudaMemset(d_output)") &&
          cuda_ok(cudaMemcpy(d_input, (float *)in.data.data(), in_count * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy(d_input)") &&
          cuda_ok(cudaMemcpy(d_filters, filters_elements.data(), filter_count * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy(d_filters)");

      if (ok) {
          dim3 grid_size = dim3(in.size.x, in.size.y, in.size.z);
          dim3 block_size = dim3(filter_size, filter_size, 1);//out.size.z);

          activateCUDAkern<<<grid_size, block_size>>>(d_input, d_output, d_filters, filter_size, stride);

          // The blocking copy also surfaces errors raised while the kernel ran.
          ok = cuda_ok(cudaGetLastError(), "activateCUDAkern launch") &&
               cuda_ok(cudaMemcpy(h_output.data(), d_output, out_count * sizeof(float),
                                  cudaMemcpyDeviceToHost), "cudaMemcpy(d_output)");
      }

      // cudaFree accepts null pointers, so this is safe after a partial failure.
      cudaFree(d_input);
      cudaFree(d_output);
      cudaFree(d_filters);

      if (!ok)
          return out;

      out.data.assign(h_output.data(), h_output.data() + out_count);

      // Debug dump of the result, one output slice per paragraph.
      for (int j=0; j<out.size.z; j++) {
          for (int i=0; i<out.size.x * out.size.y; i++) {
              std::cout << out.data[(j*out.size.x * out.size.y) + i] << " ";
          }
          std::cout << std::endl << std::endl;
      }

      std::cout << "--------------------------" << std::endl;

      return out;
  }
  101.  
  102.  
  103.  
  104.  
  /**
   * Fully-connected forward kernel: one block per output value, one thread per
   * input element.
   *
   * Each block accumulates sum_m input[m] * weights[blockIdx.x * T + m], where
   * T = blockDim.x * blockDim.y * blockDim.z and m is the thread's flat index
   * inside the block, and stores the raw sum in output[blockIdx.x]. No
   * activation function is applied here.
   *
   * Launch layout: gridDim.x = number of outputs, T = number of inputs.
   *
   * @param input   device buffer with T floats.
   * @param output  device buffer with gridDim.x floats.
   * @param weights device buffer with gridDim.x * T floats, grouped per output.
   */
  __global__ void FCactivateCUDA(float *input, float *output, float *weights)
  {
      __shared__ float inputv;

      int m = threadIdx.z * (blockDim.x * blockDim.y) +
              threadIdx.y * blockDim.x +
              threadIdx.x;

      // A single thread resets the accumulator and everybody waits for it.
      // Letting every thread write 0 without a barrier could wipe out
      // contributions that other threads had already added.
      if (m == 0)
      {
          inputv = 0.0f;
      }
      __syncthreads();

      atomicAdd(&inputv, input[m] * weights[blockIdx.x * (blockDim.x * blockDim.y * blockDim.z) + m]);

      // Wait until all contributions are in, then publish the sum; without this
      // store the kernel would have no observable result.
      __syncthreads();
      if (m == 0)
      {
          output[blockIdx.x] = inputv;
      }
  }
  117.  
  /**
   * Host wrapper that runs FCactivateCUDA for a fully-connected layer.
   *
   * Flattens the weights as one contiguous group per output (output n, input m
   * taken from weights(m, n, 0)), uploads input and weights, launches
   * out.size.x blocks shaped like the input tensor, copies the contents of the
   * device output buffer into out.data and returns a copy of `out`.
   *
   * The device output buffer is zeroed before the launch, so any cell the
   * kernel does not store to reads back as 0.
   *
   * If a CUDA call fails (for example because the input has more elements than
   * a thread block may hold), the failure is reported on std::cerr, all device
   * memory is released and `out` is returned unmodified. Nothing is launched
   * when the input or the output is empty.
   *
   * @param in      input tensor; its x/y/z extents become the block dimensions.
   * @param out     output tensor; out.data is overwritten on success.
   * @param weights weight tensor, read through weights(m, n, 0).
   * @return copy of `out` (the original fell off the end without returning).
   */
  tensor_t<float> FC_activate_CUDA(tensor_t<float> &in, tensor_t<float> &out, tensor_t<float> &weights )
  {
      // Reports a failed CUDA call and turns its status into a bool.
      auto cuda_ok = [](cudaError_t status, const char *what) -> bool {
          if (status == cudaSuccess)
              return true;
          std::cerr << "FC_activate_CUDA: " << what << " failed: "
                    << cudaGetErrorString(status) << std::endl;
          return false;
      };

      const auto inputs_per_output = in.size.x*in.size.y*in.size.z;

      std::vector<float> weights_elements;
      for (int n = 0; n < out.size.x; n++)
          for (int m = 0; m < inputs_per_output; m++)
          {
              weights_elements.push_back(weights(m, n, 0));
          }

      const size_t in_count = static_cast<size_t>(in.size.x) * in.size.y * in.size.z;
      const size_t out_count = static_cast<size_t>(out.size.x) * out.size.y * out.size.z;
      // Size the device buffer from what is actually uploaded rather than from
      // weights.size, so the copy can never read past the flattened vector.
      const size_t weight_count = weights_elements.size();

      // A zero-sized grid or block is an invalid launch configuration.
      if (in_count == 0 || out_count == 0 || weight_count == 0)
          return out;

      float *d_input = nullptr;
      float *d_output = nullptr;
      float *d_weights = nullptr;
      std::vector<float> h_output(out_count, 0.0f);

      bool ok =
          cuda_ok(cudaMalloc((void **)&d_input, in_count * sizeof(float)), "cudaMalloc(d_input)") &&
          cuda_ok(cudaMalloc((void **)&d_output, out_count * sizeof(float)), "cudaMalloc(d_output)") &&
          cuda_ok(cudaMalloc((void **)&d_weights, weight_count * sizeof(float)), "cudaMalloc(d_weights)") &&
          // All-zero bytes are 0.0f, giving the read-back a defined starting value.
          cuda_ok(cudaMemset(d_output, 0, out_count * sizeof(float)), "cudaMemset(d_output)") &&
          cuda_ok(cudaMemcpy(d_input, &in.data[0], in_count * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy(d_input)") &&
          cuda_ok(cudaMemcpy(d_weights, weights_elements.data(), weight_count * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy(d_weights)");

      if (ok) {
          dim3 grid_size = dim3(out.size.x, 1, 1);
          dim3 block_size = dim3(in.size.x, in.size.y, in.size.z);

          FCactivateCUDA<<<grid_size, block_size>>>(d_input,d_output,d_weights);

          // The blocking copy also surfaces errors raised while the kernel ran.
          ok = cuda_ok(cudaGetLastError(), "FCactivateCUDA launch") &&
               cuda_ok(cudaMemcpy(h_output.data(), d_output, out_count * sizeof(float),
                                  cudaMemcpyDeviceToHost), "cudaMemcpy(d_output)");
      }

      // cudaFree accepts null pointers, so this is safe after a partial failure.
      cudaFree(d_input);
      cudaFree(d_output);
      cudaFree(d_weights);

      if (ok)
          out.data.assign(h_output.data(), h_output.data() + out_count);

      return out;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement