import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

def softmax(class_scores):
    """
    Calculate the class probability distribution for each digit from the given class scores.
    :param class_scores: raw class scores, shape (m, num_classes)
    :return: probability distribution over the classes, same shape
    """
    class_scores -= np.max(class_scores, axis=1, keepdims=True)  # subtract the row-wise max for numerical stability
    return np.exp(class_scores) / np.sum(np.exp(class_scores), axis=1, keepdims=True)
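
# loss() further below calls class_scores(X, theta), which is not part of this paste.
# A minimal sketch of what it is assumed to do (plain linear scores, no bias term):
def class_scores(X, theta):
    """
    Compute raw class scores of a linear classifier.
    :param X: data matrix with shape (m, d)
    :param theta: weight matrix with shape (d, num_classes)
    :return: score matrix with shape (m, num_classes)
    """
    return np.dot(X, theta)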

def onehot_encode_label(label):
    """
    Support function to convert a label vector into a one-hot encoding matrix.
    :param label: array with shape (D,), D can be whatever you want
    :return: one-hot encoding matrix
    """
    onehot_encoder = OneHotEncoder(sparse=False)  # newer scikit-learn versions use sparse_output=False instead
    label = label.reshape(len(label), 1)
    onehot_encoded_label = onehot_encoder.fit_transform(label)
    return onehot_encoded_label
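
# Small illustrative check (labels chosen arbitrarily): every row of the encoding
# contains exactly one 1, in the column that corresponds to its label.
_demo_onehot = onehot_encode_label(np.array([0, 2, 1]))
assert _demo_onehot.shape == (3, 3) and np.all(_demo_onehot.sum(axis=1) == 1)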

def data_loss(class_probabilities, onehot_encoded_label):
    """
    Compute the data loss L_i for the correct class with a one-hot encoded label.
    :param class_probabilities: probabilities from the softmax function
    :param onehot_encoded_label: correct labels in one-hot encoded form
    :return: the data loss L_i
    """
    return onehot_encoded_label * -np.log(class_probabilities)
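
# The element-wise product with the one-hot labels keeps only -log(p_correct) for each
# sample and zeros out every other class. Illustration with made-up numbers:
_demo_probs = np.array([[0.7, 0.2, 0.1]])
_demo_label = np.array([[1.0, 0.0, 0.0]])
assert np.isclose(data_loss(_demo_probs, _demo_label).sum(), -np.log(0.7))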

def loss(X, y, theta, lam):
    """
    :param X: data
    :param y: labels of the data
    :param theta: learnable parameters
    :param lam: regularization factor
    :return: loss and gradient as a tuple
    """
    encoded_labels = onehot_encode_label(y)                # also needed for the gradient, therefore computed separately
    probabilities = softmax(class_scores(X, theta))        # also needed for the gradient, therefore computed separately
    loss_Li = data_loss(probabilities, encoded_labels)
    m = X.shape[0]                                         # number of training samples for normalization
    l2_regularization = (lam / 2) * np.sum(theta * theta)  # regularization loss
    loss = np.sum(loss_Li) / m + l2_regularization
    dl2 = lam * theta                                      # gradient of the regularization term
    dloss = np.dot(X.T, (probabilities - encoded_labels) / m)  # gradient of the data loss
    gradient = dloss + dl2
    return loss, gradient
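
# Optional debugging sketch (not in the original paste): compare the analytic gradient of
# loss() against a centred finite-difference estimate on a few random entries of theta.
def gradient_check(X, y, theta, lam, num_checks=5, eps=1e-5):
    _, analytic_grad = loss(X, y, theta, lam)
    for _ in range(num_checks):
        idx = tuple(np.random.randint(s) for s in theta.shape)
        theta[idx] += eps
        loss_plus, _ = loss(X, y, theta, lam)
        theta[idx] -= 2 * eps
        loss_minus, _ = loss(X, y, theta, lam)
        theta[idx] += eps  # restore the original value
        numeric_grad = (loss_plus - loss_minus) / (2 * eps)
        print('analytic: %f, numeric: %f' % (analytic_grad[idx], numeric_grad))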

def sgd(training_data, training_label, theta, lam=0.5, iterations=100, learning_rate=1e-5, batch_size=256):
    """
    Mini-batch stochastic gradient descent: in every iteration, draw a random batch,
    compute loss and gradient on it, and take a step against the gradient.
    """
    losses = []
    for i in range(iterations):
        shuffle_index = np.random.permutation(training_data.shape[0])
        data, label = training_data[shuffle_index], training_label[shuffle_index]
        data, label = data[:batch_size], label[:batch_size]   # take a random mini-batch
        l, grad = loss(data, label, theta, lam)
        losses.append(l)
        theta -= learning_rate * grad                          # gradient descent step
    return theta, losses
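
# Helper for a quick accuracy check at the end of the script (not in the original paste):
# the predicted class is the argmax of the linear scores.
def predict(X, theta):
    return np.argmax(np.dot(X, theta), axis=1)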

# Initialize the learnable parameters theta with zeros.
# X_train / y_train (flattened 28x28 digit images and their labels) are assumed to be loaded already.
theta = np.zeros([X_train.shape[1], len(np.unique(y_train))])
# Start the optimization with the training data, theta and optional hyperparameters
opt_model, loss_history = sgd(X_train, y_train, theta, iterations=250)

# Evaluation
print('last iteration loss:', loss_history[-1])
print('first iteration loss:', loss_history[0])
# With theta initialized to zeros, the softmax is uniform over the 10 classes, so the very
# first loss should be -log(1/10) = ln(10). If this prints False, there is an implementation error.
print('Is the first loss equal to ln(10)?', np.abs(np.log(10) - loss_history[0]) < 1e-6)

# plot a loss curve
plt.plot(loss_history)
plt.ylabel('loss')
plt.xlabel('iterations')
plt.show()

# plot weights
plt.figure(figsize=(20, 20))
num_classes = 10
for c in range(num_classes):
    f = plt.subplot(10, num_classes, 1 * num_classes + c + 1)
    f.axis('off')
    plt.imshow(np.reshape(opt_model[:, c], [28, 28]))
plt.show()
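
# Rough accuracy check on the training data using the predict() sketch above
# (assumes X_train / y_train are the same arrays used for the optimization):
print('training accuracy:', np.mean(predict(X_train, opt_model) == y_train))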