Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cstdint>
- #include <cassert>
- #include <cmath>
- #include "../gradient_t.h"
- #include "../layer_t.h"
- #include "../optimization_method.h"
- #include "../conv_layer_t.h"
- #include "cuda_conv_layer_t.hpp"
- #include <iostream>
/////////////////////////// ACTIVATE ////////////////
// Valid convolution kernel.
// Expected launch layout (see activateCUDA below):
//   grid  = (in.x, in.y, in.z)        — one block per input pixel/channel
//   block = (filter_size, filter_size, 1) — one thread per filter tap
// Each interior block accumulates its filter window into one output slot
// via atomicAdd. `stride` is accepted for interface compatibility but is
// not used by the current layout (stride is effectively 1).
// NOTE(review): input indexing ignores the channel (blockIdx.z) — confirm
// whether multi-channel input is intended to be supported here.
__global__ void activateCUDAkern(float *input,
                                 float *output,
                                 float *filters, int const filter_size,
                                 int const stride)
{
    const int half = filter_size / 2;

    // Border blocks produce no output (valid convolution). The predicate
    // depends only on blockIdx, so all threads of a block take the same
    // branch and the __syncthreads() below is reached by whole blocks only.
    // BUG FIX: the y-dimension test compared against gridDim.x; wrong for
    // non-square inputs.
    if (blockIdx.x < half || blockIdx.x >= gridDim.x - half ||
        blockIdx.y < half || blockIdx.y >= gridDim.y - half)
    {
        return;
    }

    const int out_w = gridDim.x - (filter_size - 1);
    const int out_h = gridDim.y - (filter_size - 1);

    int output_position_x = blockIdx.x - half;
    int output_position_y = blockIdx.y - half;
    int output_position = threadIdx.z * (out_w * out_h) +
                          output_position_y * out_w +
                          output_position_x;

    // Single writer zeroes the shared output slot before accumulation
    // (the original had every thread in the block racing to write it).
    if (threadIdx.x == 0 && threadIdx.y == 0)
        output[output_position] = 0.0f;
    __syncthreads();

    int input_position_x = blockIdx.x - half + threadIdx.x;
    int input_position_y = blockIdx.y - half + threadIdx.y;
    int input_position = input_position_y * gridDim.x + input_position_x;

    int filter_position = threadIdx.z * (blockDim.x * blockDim.y) +
                          threadIdx.y * blockDim.x +
                          threadIdx.x;

    float pixel_value = input[input_position];
    float filter_value = filters[filter_position];

    // BUG FIX: restore the accumulation. The debug statement
    // `output[output_position] = 1;` overwrote every result with 1 and the
    // real atomicAdd was commented out.
    atomicAdd(&output[output_position], pixel_value * filter_value);
}
// Host wrapper for activateCUDAkern: flattens the filter bank, stages data on
// the device, launches one block per input pixel, and copies the result into
// `out`. Returns `out` (also updated in place). Prints the output planes to
// stdout as a debug dump, as the original did.
// NOTE(review): block z is fixed at 1 (the out.size.z variant is commented
// out), so only one filter plane is computed per launch — confirm intended.
tensor_t<float> activateCUDA(tensor_t<float> &in, tensor_t<float> &out,
                             std::vector<tensor_t<float>> &filters, int filter_size,
                             int stride) {
    // Flatten the filter bank as (filter, row, col, channel) to match the
    // single contiguous device buffer.
    std::vector<float> filters_elements;
    for (size_t a = 0; a < filters.size(); a++)
        for (int i = 0; i < filter_size; i++)
            for (int j = 0; j < filter_size; j++)
                for (int z = 0; z < in.size.z; z++)
                    filters_elements.push_back(filters[a].get(i, j, z));

    const size_t in_count   = (size_t)in.size.x * in.size.y * in.size.z;
    const size_t out_count  = (size_t)out.size.x * out.size.y * out.size.z;
    const size_t filt_count = filters_elements.size();

    float *d_input = nullptr;
    float *d_output = nullptr;
    float *d_filters = nullptr;
    cudaMalloc((void **)&d_input, in_count * sizeof(float));
    cudaMalloc((void **)&d_output, out_count * sizeof(float));
    cudaMalloc((void **)&d_filters, filt_count * sizeof(float));

    cudaMemcpy(d_input, in.data.data(), in_count * sizeof(float),
               cudaMemcpyHostToDevice);
    // BUG FIX: the original copy size dropped the in.size.z factor
    // (filters.size()*filter_size*filter_size only), leaving the tail of
    // d_filters uninitialized. Copy the whole flattened bank.
    cudaMemcpy(d_filters, filters_elements.data(), filt_count * sizeof(float),
               cudaMemcpyHostToDevice);

    dim3 grid_size(in.size.x, in.size.y, in.size.z);
    dim3 block_size(filter_size, filter_size, 1); // out.size.z variant disabled, as in original

    activateCUDAkern<<<grid_size, block_size>>>(d_input, d_output, d_filters,
                                                filter_size, stride);
    // Launch-configuration errors surface here, not from the launch itself.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "activateCUDAkern launch failed: "
                  << cudaGetErrorString(err) << std::endl;

    // A blocking D2H memcpy also synchronizes with the kernel, so the data
    // is complete when it lands. Copy straight into out.data — no host
    // staging buffer needed (the original malloc'd one and leaked it).
    out.data.resize(out_count);
    cudaMemcpy(out.data.data(), d_output, out_count * sizeof(float),
               cudaMemcpyDeviceToHost);

    // BUG FIX: all three device buffers were leaked on every call.
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_filters);

    // Debug dump of each output plane (kept from the original).
    for (int j = 0; j < out.size.z; j++) {
        for (int i = 0; i < out.size.x * out.size.y; i++) {
            std::cout << out.data[(j * out.size.x * out.size.y) + i] << " ";
        }
        std::cout << std::endl << std::endl;
    }
    std::cout << "--------------------------" << std::endl;
    return out;
}
// Fully-connected forward kernel.
// Expected launch layout (see FC_activate_CUDA below):
//   grid  = (out.x, 1, 1)            — one block per output neuron
//   block = (in.x, in.y, in.z)       — one thread per input element
// Each block reduces input·weights[row blockIdx.x] into a shared accumulator
// and writes the dot product to output[blockIdx.x].
__global__ void FCactivateCUDA(float *input, float *output, float *weights)
{
    __shared__ float acc;

    // Flat index of this thread's input element within the block.
    int m = threadIdx.z * (blockDim.x * blockDim.y) +
            threadIdx.y * blockDim.x +
            threadIdx.x;

    // BUG FIX: the original had every thread racing to write the shared
    // accumulator and no barrier before the atomicAdds. Single writer + sync.
    if (m == 0)
        acc = 0.0f;
    __syncthreads();

    atomicAdd(&acc, input[m] *
                    weights[blockIdx.x * (blockDim.x * blockDim.y * blockDim.z) + m]);
    __syncthreads();

    // BUG FIX: the original never stored the reduction, so the kernel had no
    // observable effect on global memory.
    if (m == 0)
        output[blockIdx.x] = acc;
}
// Host wrapper for FCactivateCUDA: flattens the weight matrix so the weights
// of each output neuron are contiguous, launches one block per output neuron,
// copies the activations back into `out`, and returns it.
tensor_t<float> FC_activate_CUDA(tensor_t<float> &in, tensor_t<float> &out, tensor_t<float> &weights)
{
    // Flatten weights as (neuron n, input m) — row n is contiguous, matching
    // the kernel's `blockIdx.x * blockSize + m` indexing.
    std::vector<float> weights_elements;
    for (int n = 0; n < out.size.x; n++)
        for (int m = 0; m < in.size.x * in.size.y * in.size.z; m++)
            weights_elements.push_back(weights(m, n, 0));

    const size_t in_count  = (size_t)in.size.x * in.size.y * in.size.z;
    const size_t out_count = (size_t)out.size.x * out.size.y * out.size.z;
    const size_t w_count   = (size_t)weights.size.x * weights.size.y * weights.size.z;

    float *d_input = nullptr;
    float *d_output = nullptr;
    float *d_weights = nullptr;
    cudaMalloc((void **)&d_input, in_count * sizeof(float));
    cudaMalloc((void **)&d_output, out_count * sizeof(float));
    cudaMalloc((void **)&d_weights, w_count * sizeof(float));

    cudaMemcpy(d_input, in.data.data(), in_count * sizeof(float),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_weights, weights_elements.data(), w_count * sizeof(float),
               cudaMemcpyHostToDevice);

    dim3 grid_size(out.size.x, 1, 1);
    dim3 block_size(in.size.x, in.size.y, in.size.z);
    FCactivateCUDA<<<grid_size, block_size>>>(d_input, d_output, d_weights);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "FCactivateCUDA launch failed: "
                  << cudaGetErrorString(err) << std::endl;

    // BUG FIX: the original never copied the result back; the blocking D2H
    // memcpy also synchronizes with the kernel.
    out.data.resize(out_count);
    cudaMemcpy(out.data.data(), d_output, out_count * sizeof(float),
               cudaMemcpyDeviceToHost);

    // BUG FIX: all three device buffers were leaked on every call.
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_weights);

    // BUG FIX: the original fell off the end of a value-returning function
    // (undefined behaviour).
    return out;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement