Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
function [w,b,out] = ALM_SVM(X,y,lam,opts)
%ALM_SVM  Augmented Lagrangian method for the soft-margin linear SVM
%
%   min_{w,b,t}  sum(t) + lam/2*norm(w)^2
%   s.t.         y(i)*(w'*X(:,i)+b) >= 1 - t(i)
%                t(i) >= 0,  i = 1,...,N
%
% input:
%   X:   training data, each column is one sample (p x N)
%   y:   label vector (N x 1)
%   lam: model (regularization) parameter
%   opts.tol:      stopping tolerance on max(primal residual, dual residual)
%   opts.maxit:    maximum number of outer iterations
%   opts.subtol:   stopping tolerance for the inner loop
%   opts.maxsubit: maximum number of iterations for the inner loop
%   opts.w0:       initial w
%   opts.b0:       initial b
%   opts.t0:       initial t
%   opts.beta:     penalty parameter of the augmented Lagrangian
%
% output:
%   w: learned w
%   b: learned b
%   out.hist_pres:  historical primal residuals
%   out.hist_dres:  historical dual residuals
%   out.hist_subit: historical inner-loop iteration counts

%% get size of problem: p is dimension; N is number of data points
[p,N] = size(X);

%% set parameters (defaults used when a field is absent)
if isfield(opts,'tol'),      tol      = opts.tol;      else, tol      = 1e-4; end
if isfield(opts,'maxit'),    maxit    = opts.maxit;    else, maxit    = 500;  end
if isfield(opts,'subtol'),   subtol   = opts.subtol;   else, subtol   = 1e-4; end
if isfield(opts,'maxsubit'), maxsubit = opts.maxsubit; else, maxsubit = 5000; end
if isfield(opts,'w0'),       w0       = opts.w0;       else, w0 = randn(p,1); end
if isfield(opts,'b0'),       b0       = opts.b0;       else, b0 = 0;          end
if isfield(opts,'t0'),       t0       = opts.t0;       else, t0 = zeros(N,1); end
if isfield(opts,'beta'),     beta     = opts.beta;     else, beta = 1;        end

alpha0 = 0.5;      % warm-started step size for the inner line search
alpha  = 0.5;
inc_ratio = 2;     % step-size increase factor at the start of each line search
dec_ratio = 0.6;   % backtracking decrease factor

w = w0; b = b0; t = max(0,t0);   % project t0 so the start point is feasible

% initialize dual variable (multiplier of the margin constraints)
u = zeros(N,1);
onevec = ones(N,1);

% conservative global Lipschitz bound for the AL gradient; its inverse
% (scaled) gives a floor on the backtracked step size
Lip = lam + beta*max(1,norm(X))*norm([eye(p), X; eye(p), X]);
alpha_min = 1.9/Lip;

%% compute the initial primal residual
% NOTE: the residual vector gets its own name `primal` so it does not
% clobber the dimension variable p (the original code overwrote p here)
primal = max(0, onevec - t - y.*(X'*w+b));
pres = norm(primal);
hist_pres = pres;   % historical primal residuals

% gradient of the ordinary Lagrangian function about (w,b,t)
grad_w = lam*w - X*(u.*y);
grad_b = -u'*y;
grad_t = onevec - u;

% dual residual: stationarity in (w,b), and for t the KKT conditions
% require grad_t = 0 where t > 0 and grad_t >= 0 where t = 0
id1 = t > 0;
id2 = t == 0;
dres = norm(grad_w) + norm(grad_b) ...
    + norm(grad_t(id1)) + norm(min(0,grad_t(id2)));
hist_dres = dres;
hist_subit = 0;

iter = 0; subit = 0;
%% start of outer loop
while max(pres,dres) > tol && iter < maxit
    iter = iter + 1;
    % update the primal variables (w,b,t) from the current point
    w0 = w;
    b0 = b;
    t0 = t;
    [w,b,t] = subsolver(w0,b0,t0,subtol,maxsubit);
    hist_subit = [hist_subit; subit];   % subit is set inside subsolver (shared workspace)
    % update multiplier u (kept nonnegative)
    primal = max(0, onevec - t - y.*(X'*w+b));
    u = max(0, u + beta*primal);
    % compute primal residual and save to hist_pres
    pres = norm(primal);
    hist_pres = [hist_pres; pres];
    % gradient of the ordinary Lagrangian function about (w,b,t)
    grad_w = lam*w - X*(u.*y);
    grad_b = -u'*y;
    grad_t = onevec - u;
    % compute the dual residual and save to hist_dres
    id1 = t > 0;
    id2 = t == 0;
    dres = norm(grad_w) + norm(grad_b) ...
        + norm(grad_t(id1)) + norm(min(0,grad_t(id2)));
    hist_dres = [hist_dres; dres];
    fprintf('out iter = %d, pres = %5.4e, dres = %5.4e, subit = %d\n',iter,pres,dres,subit);
end

out.hist_pres = hist_pres;
out.hist_dres = hist_dres;
out.hist_subit = hist_subit;

%% =====================================================
% nested subsolver for the primal subproblem; shares X, y, lam, u, beta,
% onevec, alpha0, inc_ratio, dec_ratio, alpha_min, subit with the parent
    function [w,b,t] = subsolver(w0,b0,t0,subtol,maxsubit)
        % projected gradient with backtracking line search on the
        % augmented Lagrangian, over (w, b, t >= 0)
        w = w0;
        b = b0;
        t = t0;
        % gradient of the augmented Lagrangian function at (w,b,t)
        primal = max(0, onevec - t - y.*(X'*w+b));
        grad_w = lam*w - X*(u.*y) - beta*X*(y.*primal);
        grad_b = -u'*y - beta*y'*primal;
        grad_t = onevec - u - beta*primal;
        % gradient error (projected-stationarity measure)
        id1 = t > 0;
        id2 = t == 0;
        grad_err = norm(grad_w) + norm(grad_b) ...
            + norm(grad_t(id1)) + norm(min(0,grad_t(id2)));
        subit = 0;
        % start of inner loop
        while grad_err > subtol && subit < maxsubit
            subit = subit + 1;
            % gradient of the augmented Lagrangian at the anchor (w0,b0,t0)
            primal_0 = max(0, onevec - t0 - y.*(X'*w0+b0));
            grad0_w = lam*w0 - X*(u.*y) - beta*X*(y.*primal_0);
            grad0_b = -u'*y - beta*y'*primal_0;
            grad0_t = onevec - u - beta*primal_0;
            % value of the augmented Lagrangian at (w0,b0,t0)
            augObj0 = sum(t0) + lam/2*norm(w0)^2 + u'*(onevec-t0-y.*(X'*w0+b0)) ...
                + beta/2*norm(primal_0)^2;
            % line search by checking local Lipschitz continuity; start
            % from an increased step so alpha can grow between iterations
            alpha = alpha0*inc_ratio;
            while true
                % keep alpha above alpha_min
                alpha = max(alpha_min, dec_ratio*alpha);
                % trial projected-gradient step from (w0,b0,t0)
                w = w0 - alpha*grad0_w;
                b = b0 - alpha*grad0_b;
                t = max(0, t0 - alpha*grad0_t);
                % value of the augmented Lagrangian at the trial point
                augObj = sum(t) + lam/2*norm(w)^2 + u'*(onevec-t-y.*(X'*w+b)) ...
                    + beta/2*norm(max(0, onevec-t-y.*(X'*w+b)))^2;
                if augObj <= augObj0 + (w-w0)'*grad0_w + 0.5/alpha*norm(w-w0)^2 ...
                        + (b-b0)'*grad0_b + 0.5/alpha*norm(b-b0)^2 ...
                        + (t-t0)'*grad0_t + 0.5/alpha*norm(t-t0)^2
                    break;   % sufficient decrease achieved
                end
                if alpha <= alpha_min
                    % BUGFIX: the step-size floor was reached, so alpha (and
                    % hence the trial point) can no longer change; accept the
                    % step to avoid the infinite loop the original code had
                    break;
                end
            end
            alpha0 = alpha;   % warm-start the next line search
            w0 = w; b0 = b; t0 = t;
            % gradient of the augmented Lagrangian at the new (w,b,t)
            primal = max(0, onevec - t - y.*(X'*w+b));
            grad_w = lam*w - X*(u.*y) - beta*X*(y.*primal);
            grad_b = -u'*y - beta*y'*primal;
            grad_t = onevec - u - beta*primal;
            % recompute grad_err for the stopping test
            id1 = t > 0;
            id2 = t == 0;
            grad_err = norm(grad_w) + norm(grad_b) ...
                + norm(grad_t(id1)) + norm(min(0,grad_t(id2)));
        end
    end
%=====================================================
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement