Untitled

# Train a two-layer fully connected classifier (ReLU hidden layer, softmax
# output) with full-batch gradient descent on the mean cross-entropy loss.
#
# Names this fragment reads but does not define:
#   X            -- input data; used as a 2-D array, X.shape[1] is the input width
#   Y_           -- labels, passed to data.create_onehot_matrix
#   H, C         -- hidden-layer width and number of output classes
#   param_niter  -- number of gradient-descent iterations
#   param_delta  -- learning rate
#   data         -- helper module (create_onehot_matrix, matrix_stable_softmax)
#   np           -- numpy
#
# After the loop, W1, b1, W2, b2 hold the trained parameters.

# Number of rows of X; used to average the gradient the same way the loss is.
N = X.shape[0]

W1 = np.random.randn(H, X.shape[1])
b1 = np.random.randn(H)

W2 = np.random.randn(C, H)
b2 = np.random.randn(C)

# presumably an (N, C) one-hot encoding of Y_ -- confirm against the data module
Y = data.create_onehot_matrix(Y_)

for i in range(param_niter):
    # ---- forward pass ----
    s1 = np.dot(X, W1.T) + b1
    h1 = np.maximum(s1, 0)  # ReLU

    # The output layer must consume the ReLU activations h1, not the raw
    # pre-activations s1; otherwise the network is purely linear and the
    # gradients below (which are taken w.r.t. h1) do not match the forward pass.
    s2 = np.dot(h1, W2.T) + b2

    probs = data.matrix_stable_softmax(s2)
    # Select the probability at every position where Y is 1.
    logprobs = np.log(probs[Y == 1])

    loss = -np.mean(logprobs)

    if i % 10 == 0:
        print("iteration {}: loss {}".format(i, loss))

    # ---- backward pass ----
    # Gradient of the *mean* cross-entropy w.r.t. s2; the 1/N factor keeps the
    # update consistent with the loss reported above.
    Gs2 = (probs - Y) / N

    grad_W2 = np.dot(Gs2.T, h1)
    grad_b2 = np.sum(Gs2, axis=0)

    Gh1 = np.dot(Gs2, W2)

    # ReLU passes the gradient through only where its input was positive;
    # this is an element-wise mask per sample and per hidden unit.
    Gs1 = Gh1 * (s1 > 0)

    grad_W1 = np.dot(Gs1.T, X)
    grad_b1 = np.sum(Gs1, axis=0)

    # ---- parameter update ----
    W2 -= param_delta * grad_W2
    b2 -= param_delta * grad_b2
    W1 -= param_delta * grad_W1
    b1 -= param_delta * grad_b1