Jesorx

SGD implementation

Apr 9th, 2020
import numpy

# The neural network class has 4 main attributes:
# self.biases -> the list of bias vectors (one column vector per non-input layer)
# self.weights -> the list of weight matrices (one per connection between consecutive layers)
# self.layers -> a list with the size of each layer ( [5, 3, 2, 1] means 4 layers: 5 neurons for the input, 3 for the first hidden, 2 for the second hidden and 1 for the output layer )
# self.activation_functions -> a list parallel to self.layers that, instead of the number of neurons per layer, holds the name of the activation function to use for that layer (example: [None, "relu", "relu", "sigmoid"])

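# For context, a minimal sketch of how such a class could be set up. The constructor
# signature and the random initialization below are assumptions, not part of the
# original paste; only the attribute names and shapes follow from the comments above.
# The stochastic_gradient_descent function below is meant to be a method of this class.
class NeuralNetwork:
    def __init__(self, layers, activation_functions):
        self.layers = layers                              # e.g. [5, 3, 2, 1]
        self.activation_functions = activation_functions  # e.g. [None, "relu", "relu", "sigmoid"]
        # One (n_out, n_in) weight matrix and one (n_out, 1) bias vector per layer transition
        self.weights = [numpy.random.randn(n_out, n_in)
                        for n_in, n_out in zip(layers[:-1], layers[1:])]
        self.biases = [numpy.random.randn(n_out, 1) for n_out in layers[1:]]
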
def stochastic_gradient_descent(self, input_data, loss_function, learning_rate=0.01, momentum=0.5, minibatch_size=100, relu_leak=0.1):
    # input_data -> list of tuples (X, y) [training sample and expected output]
    # loss_function -> a string used to choose which loss function to use
    # learning_rate -> learning rate for the parameter updates
    # momentum -> momentum coefficient for the velocity updates
    # minibatch_size -> number of samples per minibatch
    # relu_leak -> the leak value passed to the activation function when a layer uses leaky_relu

    weight_velocities = [numpy.zeros_like(weights) for weights in self.weights]
    bias_velocities = [numpy.zeros_like(biases) for biases in self.biases]

    mini_batches = [input_data[i:i + minibatch_size] for i in range(0, len(input_data), minibatch_size)]
    loss = 0
    slope = 0
    for mini_batch in mini_batches:
        temp_weights = [numpy.zeros(weights_layer.shape) for weights_layer in self.weights]
        temp_biases = [numpy.zeros(bias_layer.shape) for bias_layer in self.biases]

        minibatch_loss = 0
        minibatch_slope = 0
        # Cycle for backpropagation
        for input_sample, correct_result in mini_batch:

            # Feed forward
            # network_activated_neurons stores the neuron values of every layer after they were passed through the activation function
            network_activated_neurons = [input_sample]
            # network_not_activated_neurons stores the neuron values of every layer before they were passed through the activation function
            network_not_activated_neurons = [input_sample]
            # Skip the first entry of self.activation_functions (the input layer has no activation),
            # so each weight/bias pair is matched with the activation of the layer it produces
            for biases, weights, activation_function in zip(self.biases, self.weights, self.activation_functions[1:]):
                x = numpy.dot(weights, network_activated_neurons[-1]) + biases
                network_not_activated_neurons.append(x)
                network_activated_neurons.append(self.calculate_activation(activation_function, x, relu_leak))

            activated_output = network_activated_neurons[-1]
            not_activated_output = network_not_activated_neurons[-1]

            # Output layer errors & gradients
            error_gradients = self.calculate_loss_gradient(loss_function, activated_output, correct_result)
            # Delta of the output layer: derivative of the loss w.r.t. the pre-activation output
            final_gradients = self.calculate_activation_prime(self.activation_functions[-1], not_activated_output) * error_gradients

            minibatch_loss += self.calculate_loss(loss_function, activated_output, correct_result)
            minibatch_slope += error_gradients

            delta_biases = final_gradients
            delta_weights = numpy.dot(final_gradients, network_activated_neurons[-2].transpose())

            weight_velocities[-1] = momentum * weight_velocities[-1] + (1 - momentum) * delta_weights
            bias_velocities[-1] = momentum * bias_velocities[-1] + (1 - momentum) * delta_biases

            temp_weights[-1] += weight_velocities[-1]
            temp_biases[-1] += bias_velocities[-1]

            # Hidden layers errors & gradients
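            # layer_index counts backwards from the output layer: index -layer_index is the
            # hidden layer currently being processed, -layer_index + 1 selects the weight
            # matrix that propagates the error back from the layer closer to the output,
            # and -layer_index - 1 selects the previous layer's activations.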
            for layer_index in range(2, len(self.layers)):
                hidden_layer_errors = numpy.dot(self.weights[-layer_index + 1].transpose(), final_gradients)
                activation_derivative = self.calculate_activation_prime(self.activation_functions[-layer_index], network_not_activated_neurons[-layer_index])

                final_gradients = hidden_layer_errors * activation_derivative

                delta_biases = final_gradients
                delta_weights = numpy.dot(final_gradients, network_activated_neurons[-layer_index - 1].transpose())

                weight_velocities[-layer_index] = momentum * weight_velocities[-layer_index] + (1 - momentum) * delta_weights
                bias_velocities[-layer_index] = momentum * bias_velocities[-layer_index] + (1 - momentum) * delta_biases

                temp_weights[-layer_index] += weight_velocities[-layer_index]
                temp_biases[-layer_index] += bias_velocities[-layer_index]

        loss += minibatch_loss / len(mini_batch)
        slope += minibatch_slope / len(mini_batch)
        # Apply the accumulated velocities of this minibatch to the parameters
        self.weights = [weights - learning_rate * weight_update for weights, weight_update in zip(self.weights, temp_weights)]
        self.biases = [biases - learning_rate * bias_update for biases, bias_update in zip(self.biases, temp_biases)]
    return loss / len(mini_batches), slope / len(mini_batches)
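
# The code above relies on several helper methods that are not included in the paste:
# calculate_activation, calculate_activation_prime, calculate_loss and
# calculate_loss_gradient. Below is a minimal sketch of what string-dispatched
# versions could look like; the "mse" loss name and the exact formulas are
# assumptions, only "relu", "sigmoid" and "leaky_relu" are named in the paste.
# Like stochastic_gradient_descent, these are meant to be methods of the class.
def calculate_activation(self, name, x, relu_leak=0.1):
    if name is None:
        return x
    if name == "sigmoid":
        return 1.0 / (1.0 + numpy.exp(-x))
    if name == "relu":
        return numpy.maximum(0, x)
    if name == "leaky_relu":
        return numpy.where(x > 0, x, relu_leak * x)
    raise ValueError(f"Unknown activation: {name}")

def calculate_activation_prime(self, name, x, relu_leak=0.1):
    if name is None:
        return numpy.ones_like(x)
    if name == "sigmoid":
        s = 1.0 / (1.0 + numpy.exp(-x))
        return s * (1 - s)
    if name == "relu":
        return numpy.where(x > 0, 1.0, 0.0)
    if name == "leaky_relu":
        return numpy.where(x > 0, 1.0, relu_leak)
    raise ValueError(f"Unknown activation: {name}")

def calculate_loss(self, name, output, target):
    if name == "mse":
        return 0.5 * numpy.sum((output - target) ** 2)
    raise ValueError(f"Unknown loss: {name}")

def calculate_loss_gradient(self, name, output, target):
    if name == "mse":
        return output - target
    raise ValueError(f"Unknown loss: {name}")

# A hypothetical training call (illustrative only, assuming the functions above are
# attached as methods of the NeuralNetwork class and samples are column vectors):
# net = NeuralNetwork([5, 3, 2, 1], [None, "relu", "relu", "sigmoid"])
# data = [(numpy.random.randn(5, 1), numpy.random.randn(1, 1)) for _ in range(1000)]
# for epoch in range(30):
#     loss, slope = net.stochastic_gradient_descent(data, "mse", learning_rate=0.01)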