function [theta, inmodel] = WLSBackwardElimination(XAll,y,phi,inmodel,keepcols)
%
%
% [theta, inmodel] = WLSBackwardElimination(XAll,y,phi,inmodel,keepcols)
%
%
% Description:
%
% Backward elimination for local models that are estimated with weighted least squares (WLS).
% For each local model the most significant regressor terms are selected by an iterative
% t-test. In every iteration the two-sided p-value of the null hypothesis "coefficient is
% zero" is computed for each term that is currently in the model. The removable term with
% the largest p-value is dropped if that p-value exceeds pRemove (0.1). The loop stops when
% no removable term exceeds the threshold or only a single term is left.
%
%
% Outputs:
%
% theta:    Regression coefficients after subset selection (one row per column of XAll;
%           rows of terms that are not in the model are zero)
% inmodel:  Logical vector that indicates which model terms are included after subset
%           selection
%
% Inputs:
%
% XAll:     x-regressor matrix (unweighted)
% y:        Output vector (unweighted)
% phi:      LM weights (one weight per row of XAll)
% inmodel:  Logical vector that indicates which model terms are initially included
%           (optional, default: all terms)
% keepcols: Logical vector that specifies which regression terms must be included, i.e.
%           are never removed (optional, default: none)
%
% XAll must be scaled to unit variance and zero mean!
%
% Requires tcdf.m. Some content is partly taken from stepwisefit.m.
%
% Benjamin Hartmann, 23-October-2012
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles


nxAll = size(XAll,2);
[N, nOut] = size(y);

% Define the threshold probability for rejection of regressors.
% The amount of terms will be low for low pRemove values.
pRemove = 0.1;

% With the logical vector keepcols the user can specify regression terms that must be included.
if nargin < 5
    keepcols = false(1,nxAll);
end

% Check for subset selection index
if nargin < 4
    inmodel = true(1,nxAll);
end

% Give keepcols the same orientation as inmodel. Otherwise "inmodel & ~keepcols" would
% expand a row/column pair into a matrix instead of an element-wise mask.
keepcols = reshape(keepcols,size(inmodel));

% Nothing to estimate if no term is in the model (R would be empty below).
if ~any(inmodel)
    theta = zeros(nxAll,nOut);
    return
end

% Scale rows of X matrix and response y with sqrt(phi).
% All columns are kept here so that the full-length mask inmodel stays a valid column
% index for XAll inside the loop.
phi = phi(:);
XAll = bsxfun(@times,sqrt(phi),XAll);
y = bsxfun(@times,sqrt(phi),y);

% Loop for backward elimination
go_on = true;
while go_on
    
    % Select current regressors
    X = XAll(:,inmodel);
    
    % QR decomposition (economy size, with column pivoting)
    [Q,R,perm] = qr(X,0);
    
    % Transform output vector
    z = Q'*y;
    
    % Check rank of R in order to remove dependent columns of X
    nx = sum(inmodel);
    tol = max(N,nx)*eps(class(X));
    keepCols = (abs(diag(R)) > tol*abs(R(1)));
    rankX = sum(keepCols);
    if rankX < nx
        R = R(keepCols,keepCols);
        Q = Q(:,keepCols);
        z = z(keepCols,:);
        perm = perm(keepCols);
    end
    z( abs(z) < tol*max(abs(z)) ) = 0;
    
    % Estimate parameters (coefficients of dependent columns stay zero)
    theta = zeros(nx,nOut);
    theta(perm,1:nOut) = R \ z;
    
    % Remove weights that are almost zero
    theta(abs(theta)<tol) = 0;
    
    % Calculate leverage: diag(S) = diag( W*X*inv(X'*W*X)*X'*W ) , where yHat = S*y
    leverage = phi.*sum(abs(Q).^2,2);
    
    % Calculate effective number of observations and effective DOF nEff: trace(S), where yHat = S*y;
    nEffi = sum(leverage);
    Ni = sum(phi);
    
    % Compute the mean squared error MSE = phi'*(y-X*bStandard).^2./(sum(phi)-nEffi)
    NRMSE = norm(y-X*theta)^2/norm(y-mean(y))^2;
    if NRMSE > tol
        MSE = ( y'*y - z'*z )./(Ni-nEffi);
    else % if Ni = nEffi, we have the exact solution for X*theta == y with zero error
        MSE = zeros(1,nOut);
    end
    
    % Calculate the standard deviations of the weighted LS estimates
    Rinv = R\eye(size(R));
    stdTheta = zeros(nx,nOut);
    stdTheta(perm,1:nOut) = sqrt(sum(Rinv.^2,2) * MSE);
    % The equivalent formulation would be:
    % stdTheta = sqrt( diag(covTheta) ) = sqrt( diag( inv(X'*diag(phi)*X) ) * MSE )
    
    % Compute t-values and p-values.
    % Terms with zero coefficient and zero standard deviation yield 0/0 = NaN and are
    % handled as "bad terms" below.
    tVal = zeros(nxAll,nOut);
    tVal(inmodel,1:nOut) = theta./stdTheta;
    pVal = 2*tcdf( -abs(tVal) , Ni-nEffi );
    
    % Store back zeros for coefficients that were removed
    thetaAll = zeros(nxAll,nOut);
    thetaAll(inmodel,:) = theta;
    theta = thetaAll;
    
    % Check which terms should be removed (removeIdx == 0 means: remove nothing)
    removeIdx = 0;
    termsin = find(inmodel & ~keepcols);
    if ~isempty(termsin)
        badterms = termsin(isnan(pVal(termsin)));
        if ~isempty(badterms)
            % Apparently we have a perfect fit but it is also overdetermined.
            % Terms with NaN coefficients may as well be removed.
            removeIdx = isnan(theta(badterms));
            if any(removeIdx)
                removeIdx = badterms(removeIdx);
                removeIdx = removeIdx(1);
            else
                % If there are many terms contributing to a perfect fit, we
                % may as well remove the term that contributes the least.
                % For convenience we'll pick the one with the smallest coeff.
                [~,removeIdx] = min(abs(theta(badterms)));
                removeIdx = badterms(removeIdx);
            end
        else
            [pmax,kmax] = max(pVal(termsin));
            if pmax>pRemove
                removeIdx = termsin(kmax(1));
            end
        end
    end
    
    % Stop if nothing is to be removed or if only one term is left in the model
    if removeIdx == 0 || sum(inmodel)==1
        go_on = false;
    else
        inmodel(removeIdx) = false;
        go_on = true;
    end
    
    
end
end