# this implementation was given as assignment 3 of the course
# B55.2 WT Ausgewählte Kapitel sozialer Webtechnologien at HTW Berlin

# third party
import numpy as np
import matplotlib.pyplot as plt

# internal
from deep_teaching_commons.data.fundamentals.mnist import Mnist

# create mnist loader from deep_teaching_commons
mnist_loader = Mnist(data_dir='data')

# load all data; labels are one-hot encoded, images are flattened and pixel values are scaled to [0, 1]
train_images, train_labels, test_images, test_labels = mnist_loader.get_all_data(
    one_hot_enc=True, normalized=True)

# shuffle training data
shuffle_index = np.random.permutation(60000)
train_images, train_labels = train_images[shuffle_index], train_labels[shuffle_index]
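
# (illustrative) a quick sanity check on the loaded arrays; with standard MNIST
# one would expect 60000 flattened 784-pixel training images, 10000 test images
# and one-hot labels with 10 columns -- adjust if your loader version differs
print(train_images.shape, train_labels.shape, test_images.shape, test_labels.shape)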
def feed_forward(X, weights):
    """
    Calculates the forward pass of our neural network with ReLU as the activation function of every neuron.

    Args:
        X: input data of our neural network (in our case: our images)
        weights: the learnable parameters of our network

    Returns:
        a list of matrices, one activation matrix per layer of the forward pass
    """
    a = [X]
    for w in weights:
        # the last item of our list is always the latest activation that was calculated,
        # which is why a[-1] is always used
        a.append(np.maximum(a[-1].dot(w), 0))
    return a
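
# (illustrative) a minimal sketch of how feed_forward can be exercised: with random
# weights for a 784-200-100-10 network, the returned list should contain one
# activation matrix per layer, starting with the input itself;
# the _demo_* names exist only for this example
_demo_weights = [np.random.randn(*s) * 0.1 for s in [(784, 200), (200, 100), (100, 10)]]
_demo_activations = feed_forward(train_images[:5], _demo_weights)
print([act.shape for act in _demo_activations])  # expected: [(5, 784), (5, 200), (5, 100), (5, 10)]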
def grads(X, Y, weights):
    """
    Calculates the gradient of our network using an algorithm called backpropagation.

    Args:
        X: input data of our neural network (in our case: our images)
        Y: labels of our input data
        weights: the learnable parameters of our network

    Returns:
        the gradient of our loss function with respect to each weight matrix
    """
    grads = [np.empty_like(w) for w in weights]
    a = feed_forward(X, weights)
    # calculating the gradient, starting at the output layer and moving backwards
    delta = a[-1] - Y
    grads[-1] = a[-2].T.dot(delta)
    for i in range(len(a) - 2, 0, -1):
        delta = (a[i] > 0) * delta.dot(weights[i].T)
        grads[i - 1] = a[i - 1].T.dot(delta)
    return [g / len(X) for g in grads]
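
# (illustrative) one manual SGD step on a small batch, following the update rule
# W_new = W_old - learning_rate * grad(L) that the training loop below uses;
# the _chk_* names and the batch size of 32 exist only for this example
_chk_weights = [np.random.randn(*s) * 0.1 for s in [(784, 200), (200, 100), (100, 10)]]
_chk_grads = grads(train_images[:32], train_labels[:32], _chk_weights)
_chk_weights = [w - 0.1 * g for w, g in zip(_chk_weights, _chk_grads)]
print([g.shape for g in _chk_grads])  # each gradient matches the shape of its weight matrix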
# To test whether our implementation works, we first initialize our neural network with
# 784 input neurons, two hidden layers with 200 and 100 neurons, and 10 output neurons.
# The 784 input neurons stand for the pixels of one image (every image has a resolution of 28x28) and
# the 10 output neurons stand for every possible digit the image could show (every image shows a digit from 0-9).
# We also set up variables for our train and test datasets.
trX, trY, teX, teY = train_images, train_labels, test_images, test_labels
weights = [np.random.randn(*w) * 0.1
           for w in [(784, 200), (200, 100), (100, 10)]]
# After initializing our network we train it and then see how accurately it performs.
# The number of epochs stands for how many times we repeat the training over the whole dataset.
#
# In order to train our network/minimize our loss we use stochastic gradient descent,
# which is the same as gradient descent but uses only a part of the whole data
# - a so-called "mini-batch" - to calculate the gradient in each iteration.
#
# Gradient descent tries to minimize our loss function
# by subtracting the gradient of our loss function from our current weights,
# so we have W_new = W_old - grad(L) * learning_rate
num_epochs, batch_size, learn_rate = 10, 50, 0.1
for i in range(num_epochs):
    for j in range(0, len(trX), batch_size):
        # creating a mini-batch with the size of batch_size
        X, Y = trX[j:j+batch_size], trY[j:j+batch_size]
        # apply the update rule W_new = W_old - grad(L) * learning_rate to every weight matrix
        for w, g in zip(weights, grads(X, Y, weights)):
            w -= learn_rate * g
    prediction_test = np.argmax(feed_forward(teX, weights)[-1], axis=1)
    # prints our accuracy on the test data after each epoch
    print(i, np.mean(prediction_test == np.argmax(teY, axis=1)))
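
# (illustrative) after training, the network can classify individual images; this
# usage example is not part of the assignment and assumes the flattened images
# are 28x28 pixels
sample_pred = np.argmax(feed_forward(teX[:5], weights)[-1], axis=1)
sample_true = np.argmax(teY[:5], axis=1)
print('predicted:', sample_pred, 'actual:', sample_true)
# matplotlib is imported above but otherwise unused; as a sketch, visualize the
# first test image together with its prediction
plt.imshow(teX[0].reshape(28, 28), cmap='gray')
plt.title('predicted: %d, actual: %d' % (sample_pred[0], sample_true[0]))
plt.show()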