Untitled

from scipy.special import erf
import numpy as np

def logistic(x):
    return 0.5 * (1 + erf(x))

# Computes gradient (possibly, the Jacobian) of y with respect to xs
def grad(y, xs):
    dx = 0.0001
    return np.stack(
        (y(xs + dxs) - y(xs - dxs)) / (2 * dx)
        for dxs in dx * np.eye(len(xs.T))
    ).T

def forward_pure(weights, inputs):
    return logistic(weights @ inputs)

class Layer:
    def __init__(self, input_size, output_size, learning_rate=1):
        # weights column 0 is bias
        self.weights = np.random.normal(size=(output_size, input_size + 1))
        self.learning_rate = learning_rate

    def forward(self, inputs):
        self.inputs = np.array([1, *inputs])
        self.outputs = forward_pure(self.weights, self.inputs)
        return self.outputs

    def backward(self, error_grad):
        # Derivative of output w.r.t. weights
        self.dout_dw = grad(lambda w: forward_pure(w, self.inputs), self.weights)
        # Derivative of output w.r.t. inputs
        # (input 0 is constant, so differentiating wrt. it is useless)
        self.dout_din = grad(lambda i: forward_pure(self.weights, i), self.inputs)[:, 1:]

        # Adjust weights
        # We note that according to the chain rule:
        # d[Error]/d[weights] = d[Error]/d[outputs] @ d[outputs]/d[weights]
        # self.derror_dw = error_grad @ self.dout_dw
        self.derror_dw = error_grad @ self.dout_dw
        self.weights -= self.learning_rate * self.derror_dw

        # Return gradient propagated backwards
        return self.dout_din

def loss(actual, predicted):
    return np.sum((predicted - actual) ** 2)

# Gradient of loss w.r.t. predictions
def loss_grad(actual, predicted):
    return grad(lambda xs: loss(actual, xs), predicted)

class NN:
    def __init__(self, layers):
        self.layers = layers

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)

        self.outputs = inputs
        return inputs

    def backward(self, actual):
        # derivative of error w.r.t. output
        de_do = loss_grad(actual, self.outputs)

        for layer in reversed(self.layers):
            de_do = layer.backward(de_do)

        return de_do

def train(nn, num_epochs, Xs, ys):
    for i in range(num_epochs):
        samples = list(zip(Xs, ys))
        np.random.shuffle(samples)

        for X, y in samples:
            pred = nn.forward(X)
            err = nn.backward(y)

    for X, y in zip(Xs, ys):
        pred = nn.forward(X)
        print(X, y, np.round(pred, 3))

print("Learning logical AND")
train(
    nn = NN(layers=[
        Layer(input_size=2, output_size=1),
    ]),
    num_epochs=10000,
    Xs = np.array([
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ]),
    ys = np.array([
        [0],
        [0],
        [0],
        [1],
    ])
)
print("Learning logical OR")
train(
    nn = NN(layers=[
        Layer(input_size=2, output_size=1),
    ]),
    num_epochs=10000,
    Xs = np.array([
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ]),
    ys = np.array([
        [0],
        [1],
        [1],
        [1],
    ])
)
print("Learning both at the same time")
train(
    nn = NN(layers=[
        Layer(input_size=2, output_size=2),
    ]),
    num_epochs=10000,
    Xs = np.array([
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ]),
    ys = np.array([
        [0, 0],
        [0, 1],
        [0, 1],
        [1, 1],
    ])
)