using Images
using PyPlot
using FileIO
using Optim
using Random
using Statistics

include("Common.jl")

# Debug helper for printing array shapes (TODO: remove)
function printarr(name, arr; show=false)
  println("Array: $name, shape: $(size(arr))")
  if show
    display(arr)
  end
end

#---------------------------------------------------------
# Load features and labels from file.
#---------------------------------------------------------
function loaddata(path::String)
  data = load(path)
  features, labels = data["features"], data["labels"]
  @assert length(labels) == size(features,1)
  return features::Array{Float64,2}, labels::Array{Float64,1}
end

#---------------------------------------------------------
# Show a 2-dimensional plot for the given features with
# different colors according to the labels.
#---------------------------------------------------------
function showbefore(features::Array{Float64,2},labels::Array{Float64,1})

  # Create masks for class 0 and 1
  mask0 = labels .== 0.0
  mask1 = labels .== 1.0

  # Plot
  figure()
  scatter(features[mask0, 1], features[mask0, 2], c="red", label="class 0")
  scatter(features[mask1, 1], features[mask1, 2], c="blue", label="class 1")

  # Setup and show
  title("Data before")
  xlabel("x1")
  ylabel("x2")
  legend()
  show()
  return nothing::Nothing
end


#---------------------------------------------------------
# Show a 2-dimensional plot for the given features along
# with the decision boundary.
#---------------------------------------------------------
function showafter(features::Array{Float64,2},labels::Array{Float64,1},Ws::Vector{Any}, bs::Vector{Any})

  return nothing::Nothing
end
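
# A possible implementation of showafter (in Julia, this later definition
# replaces the empty stub above when the file is loaded). It assumes predict
# (defined below) returns class scores in [0,1] and draws the 0.5 level set
# as the decision boundary; the 200x200 grid resolution is an arbitrary choice.
function showafter(features::Array{Float64,2},labels::Array{Float64,1},Ws::Vector{Any}, bs::Vector{Any})
  # Scatter the data points as in showbefore
  showbefore(features, labels)

  # Evaluate the network scores on a regular grid covering the data range
  x1 = range(minimum(features[:,1]), stop=maximum(features[:,1]), length=200)
  x2 = range(minimum(features[:,2]), stop=maximum(features[:,2]), length=200)
  grid = hcat(vec([a for a in x1, b in x2]), vec([b for a in x1, b in x2]))
  p, _ = predict(grid, Ws, bs)

  # Draw the decision boundary as the 0.5 contour of the predicted class score
  contour(x1, x2, permutedims(reshape(vec(p), length(x1), length(x2))), levels=[0.5])
  show()
  return nothing::Nothing
end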


#---------------------------------------------------------
# Implements the sigmoid function.
#---------------------------------------------------------
function sigmoid(z)
  # Note: .+ is needed so that the function also works elementwise on arrays
  s = 1 ./ (1 .+ exp.(-z))
  return s
end
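
# Quick sanity check (illustrative, not part of the assignment): the sigmoid
# is exactly 0.5 at zero and maps everything into the open interval (0, 1).
@assert sigmoid(0.0) == 0.5
@assert all(0 .< sigmoid.([-5.0, 0.0, 5.0]) .< 1)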


#---------------------------------------------------------
# Implements the derivative of the sigmoid function.
#---------------------------------------------------------
function dsigmoid_dz(z)
  s = sigmoid.(z)
  ds = s.*(1 .- s)
  return ds
end
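
# Optional numerical check (illustrative, not part of the assignment): the
# analytic derivative should agree with a central finite difference of sigmoid.
let h = 1e-6
  for z in (-2.0, 0.0, 1.5)
    fd = (sigmoid(z + h) - sigmoid(z - h)) / (2h)
    @assert isapprox(dsigmoid_dz(z), fd; atol=1e-6)
  end
end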


#---------------------------------------------------------
# Evaluates the loss function of the MLP.
#---------------------------------------------------------
function nnloss(theta::Array{Float64,1}, X::Array{Float64,2}, y::Array{Float64,1}, netdefinition::Array{Int, 1})
  # Get weights and biases
  Ws, bs = thetaToWeights(theta, netdefinition)

  # Forward pass with the training samples along the columns,
  # consistent with feedforward below
  a = X'
  for i = 1:length(Ws)
    a = sigmoid.(Ws[i] * a .+ bs[i])
  end

  # Class probabilities from the single sigmoid output unit
  # (a softmax over a single output is degenerate, so the last layer is
  # kept as a sigmoid, matching feedforward and nnlossgrad)
  probs = vec(a)

  # Binary cross-entropy for one sample: p is the prediction, t the true label
  L(p, t) = -t * log(p) - (1 - t) * log(1 - p)

  # Mean loss over all training samples
  loss = sum(L.(probs, y)) / length(y)

  return loss::Float64
end
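
# Optional sanity check (illustrative; the name checknnloss is not part of the
# assignment): with near-zero initial weights every output is close to 0.5,
# so the mean binary cross-entropy should be close to log(2) ≈ 0.693.
function checknnloss()
  nd = [2, 4, 1]
  Ws0, bs0 = initWeights(nd, 0.01, 0.001)
  X0 = randn(10, 2)
  y0 = Float64.(rand(0:1, 10))
  return nnloss(weightsToTheta(Ws0, bs0), X0, y0, nd)
end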


#---------------------------------------------------------
# Softmax activation function to get probabilities
#---------------------------------------------------------
function softmax(x)
  exps = exp.(x)
  sums = sum(exps)
  return exps / sums
end
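
# Note (an addition, not part of the original paste): subtracting the maximum
# before exponentiating avoids overflow for large inputs and does not change
# the result; the name softmax_stable is only used here for illustration.
function softmax_stable(x)
  exps = exp.(x .- maximum(x))
  return exps ./ sum(exps)
end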

#---------------------------------------------------------
# Feed forward pass. Returns activations for each layer.
#---------------------------------------------------------
function feedforward(theta::Array{Float64,1}, X::Array{Float64,2}, y::Array{Float64,1}, netdefinition::Array{Int, 1})
  println("Entered feedforward")
  # Get weights and biases
  Ws, bs = thetaToWeights(theta, netdefinition)

  # Forward pass
  activations = []
  zs = []

  # Put training samples along columns for efficient multiplication
  a = X'
  push!(activations, a)
  for i=1:length(Ws)
    println("forward loop iteration $i")
    println("z = Ws[$i] * a .+ bs[$i]")
    println("with shape(Ws[$i]) = $(size(Ws[i])), shape(bs[$i]) = $(length(bs[i]))")
    println("a has shape: $(size(a))")
    z = Ws[i] * a .+ bs[i]
    a = sigmoid.(z)
    push!(zs, z)
    push!(activations, a)
  end
  return activations, zs
end
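
# Optional shape check (illustrative; checkfeedforward is not part of the
# assignment): for netdefinition [2,4,1] and N samples the activations should
# have sizes 2xN, 4xN and 1xN, confirming the samples-along-columns convention.
function checkfeedforward(N=5)
  nd = [2, 4, 1]
  Ws0, bs0 = initWeights(nd, 0.01, 0.001)
  acts, zs = feedforward(weightsToTheta(Ws0, bs0), randn(N, 2), zeros(N), nd)
  return [size(a) for a in acts]
end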

#---------------------------------------------------------
# Evaluate the gradient of the MLP loss w.r.t. Ws and Bs
# The gradient should be stored in the vector 'storage'
#---------------------------------------------------------
function nnlossgrad(storage::Array{Float64,1}, theta::Array{Float64,1}, X::Array{Float64,2}, y::Array{Float64,1}, netdefinition::Array{Int, 1})
  # Get weights and biases
  Ws, bs = thetaToWeights(theta, netdefinition)
  nlayers = length(netdefinition) - 1
  N = length(y)

  # Compute activations (samples along the columns)
  activations, zs = feedforward(theta, X, y, netdefinition)
  probs = activations[end]           # 1 x N output probabilities

  # Gradient containers with the same shapes as Ws and bs
  dWs = Any[zeros(size(W)) for W in Ws]
  dbs = Any[zeros(size(b)) for b in bs]

  # Output layer: for a sigmoid output with binary cross-entropy the error
  # signal is simply (p - y), scaled by 1/N because the loss is a mean.
  delta = (probs .- y') ./ N         # 1 x N
  dbs[end] = vec(sum(delta, dims=2))
  dWs[end] = delta * activations[end-1]'

  # Hidden layers: propagate the error signal backwards
  for l = 2:nlayers
    z = zs[end - l + 1]
    W = Ws[end - l + 2]
    delta = (W' * delta) .* dsigmoid_dz.(z)
    dbs[end - l + 1] = vec(sum(delta, dims=2))
    dWs[end - l + 1] = delta * activations[end - l]'
  end

  # Write the flattened gradient into 'storage' in place, as expected by Optim
  # (rebinding storage to a new array would not be visible to the caller)
  storage[:] .= weightsToTheta(dWs, dbs)
  return storage::Array{Float64,1}
end
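
# Optional gradient check (illustrative; checkgrad and h are not part of the
# assignment): the analytic gradient should agree with central finite
# differences of nnloss on a small random network.
function checkgrad(; h=1e-6)
  nd = [2, 3, 1]
  Ws0, bs0 = initWeights(nd, 0.1, 0.1)
  theta0 = weightsToTheta(Ws0, bs0)
  X0 = randn(7, 2)
  y0 = Float64.(rand(0:1, 7))

  # Analytic gradient
  g = zeros(length(theta0))
  nnlossgrad(g, theta0, X0, y0, nd)

  # Finite-difference gradient
  gfd = similar(g)
  for k in eachindex(theta0)
    tp = copy(theta0); tp[k] += h
    tm = copy(theta0); tm[k] -= h
    gfd[k] = (nnloss(tp, X0, y0, nd) - nnloss(tm, X0, y0, nd)) / (2h)
  end

  # Should be close to zero
  return maximum(abs.(g .- gfd))
end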

# Debug helper: print the size of every array in a collection
function printshapes(name, arr)
  println(name)
  for (i, a) in enumerate(arr)
    println("$i: $(size(a))")
  end
end

#---------------------------------------------------------
# Use LBFGS to optimize the MLP loss
#---------------------------------------------------------
function train(trainfeatures::Array{Float64,2}, trainlabels::Array{Float64,1}, netdefinition::Array{Int, 1})
  # Initialize weights and biases with small random values
  sigma_w = 0.01
  sigma_b = 0.001
  Ws, bs = initWeights(netdefinition, sigma_w, sigma_b)
  theta = weightsToTheta(Ws, bs)
  # Round-trip sanity check of the flattening helpers (result is unused)
  Wsp, bsp = thetaToWeights(theta, netdefinition)

  # Loss and in-place gradient for Optim
  L(theta) = nnloss(theta, trainfeatures, trainlabels, netdefinition)
  Lgrad!(storage, theta) = nnlossgrad(storage, theta, trainfeatures, trainlabels, netdefinition)

  # Minimize with LBFGS
  res = optimize(L, Lgrad!, theta, LBFGS())
  mintheta = Optim.minimizer(res)

  # Unpack the optimal parameters back into weight matrices and bias vectors
  Ws, bs = thetaToWeights(mintheta, netdefinition)
  return Ws::Vector{Any},bs::Vector{Any}
end


#---------------------------------------------------------
# Predict the classes of the given data points using Ws and bs.
# p, N x 1 Array{Float64,2}, contains the output class score (continuous value) for each input feature.
# c, N x 1 Array{Float64,2}, contains the output class label (either 0 or 1) for each input feature.
#---------------------------------------------------------
function predict(X::Array{Float64,2}, Ws::Vector{Any}, bs::Vector{Any})

  return p::Array{Float64,2}, c::Array{Float64,2}
end
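
# A possible implementation of predict (in Julia, this later definition
# replaces the empty stub above when the file is loaded). It runs the same
# forward pass as feedforward and assumes the usual 0.5 threshold on the
# sigmoid output for the class decision.
function predict(X::Array{Float64,2}, Ws::Vector{Any}, bs::Vector{Any})
  # Forward pass with the samples along the columns
  a = X'
  for i = 1:length(Ws)
    a = sigmoid.(Ws[i] * a .+ bs[i])
  end

  # Class scores as an N x 1 matrix, class labels by thresholding at 0.5
  p = reshape(vec(a), :, 1)
  c = Float64.(p .>= 0.5)
  return p::Array{Float64,2}, c::Array{Float64,2}
end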


#---------------------------------------------------------
# A helper function which concatenates weights and biases into a variable theta
#---------------------------------------------------------
function weightsToTheta(Ws::Vector{Any}, bs::Vector{Any})
  # Init theta as a growable vector
  theta = Float64[]
  for i=1:length(Ws)
    # Flatten Ws column-major and append, followed by the biases
    append!(theta, vec(Float64.(Ws[i])))
    append!(theta, Float64.(bs[i]))
  end
  return theta::Vector{Float64}
end


#---------------------------------------------------------
# A helper function which decomposes and reshapes weights and biases from the variable theta
#---------------------------------------------------------
function thetaToWeights(theta::Vector{Float64}, netdefinition::Array{Int,1})
  # Init weight and bias vectors
  nlayers = length(netdefinition) - 1
  Ws = Vector{Any}(missing, nlayers)
  bs = Vector{Any}(missing, nlayers)

  # Alias
  nd = netdefinition

  # Offset into the theta vector
  offset = 1

  # For each layer
  for i=1:nlayers
    # Length of the current layer's weight and bias blocks
    size_wi = nd[i] * nd[i + 1]
    size_bi = nd[i + 1]

    # Get current weights
    wi = theta[offset:offset+size_wi - 1]

    # Shift offset
    offset += size_wi

    # Get current bias
    bi = theta[offset:offset+size_bi - 1]

    # Shift offset
    offset += size_bi

    # Collect weights and biases
    Ws[i] = reshape(wi, nd[i+1], nd[i])
    bs[i] = bi
  end
  return Ws::Vector{Any}, bs::Vector{Any}
end


#---------------------------------------------------------
# Initialize weights and biases from Gaussian distributions
#---------------------------------------------------------
function initWeights(netdefinition::Array{Int,1}, sigmaW::Float64, sigmaB::Float64)
  nlayers = length(netdefinition) - 1
  Ws = Vector{Any}(missing, nlayers)
  bs = Vector{Any}(missing, nlayers)

  nd = netdefinition
  # For each layer: init weight matrix and bias vector
  for i=1:nlayers
    # W has nd[i] as input and nd[i+1] as output
    Ws[i] = randn(nd[i+1], nd[i]) * sigmaW
    bs[i] = randn(nd[i+1]) * sigmaB
  end

  return Ws::Vector{Any}, bs::Vector{Any}
end
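
# Optional round-trip check (illustrative, not part of the assignment):
# flattening weights into theta and unpacking them again should reproduce
# exactly the same weight matrices and bias vectors.
let nd = [2, 4, 1]
  Ws0, bs0 = initWeights(nd, 0.1, 0.1)
  Ws1, bs1 = thetaToWeights(weightsToTheta(Ws0, bs0), nd)
  @assert all(Ws0[i] == Ws1[i] for i in 1:length(Ws0))
  @assert all(bs0[i] == bs1[i] for i in 1:length(bs0))
end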


# Problem 2: Multilayer Perceptron

function problem2()
  # make results reproducible
  Random.seed!(10)

  # LINEARLY SEPARABLE DATA
  # load data
  features,labels = loaddata("separable.jld2")

  # show data points
  showbefore(features,labels)
  title("Data for Separable Case")

  # train MLP
  Ws,bs = train(features,labels, [2,4,1])

  # show optimum and plot decision boundary
  showafter(features,labels,Ws,bs)
  title("Learned Decision Boundary for Separable Case")


  ## LINEARLY NON-SEPARABLE DATA
  # load data
  features2,labels2 = loaddata("nonseparable.jld2")

  # show data points
  showbefore(features2,labels2)
  title("Data for Non-Separable Case")

  # train MLP
  Ws,bs = train(features2,labels2, [2,4,1])

  # show optimum and plot decision boundary
  showafter(features2,labels2,Ws, bs)
  title("Learned Decision Boundary for Non-Separable Case")

  # PLANE-BIKE-CLASSIFICATION FROM PROBLEM 2
  # load data
  trainfeatures,trainlabels = loaddata("imgstrain.jld2")
  testfeatures,testlabels = loaddata("imgstest.jld2")

  # train MLP and predict classes
  Ws,bs = train(trainfeatures,trainlabels, [50,40,30,1])
  _,trainpredictions = predict(trainfeatures, Ws, bs)
  _,testpredictions = predict(testfeatures, Ws, bs)

  # show error
  trainerror = sum(trainpredictions.!=trainlabels)/length(trainlabels)
  testerror = sum(testpredictions.!=testlabels)/length(testlabels)
  println("Training Error Rate: $(round(100*trainerror,digits=2))%")
  println("Testing Error Rate: $(round(100*testerror,digits=2))%")

  return
end
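
# To run the experiments (assuming the .jld2 data files from the assignment
# are available in the working directory), call:
# problem2()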