function [J, gradient, parameter, center, phi, outputModelNew, pseudoInv] = ...
    obliqueGlobalLossFunction(...
    wRed, w0, xRegressor, zRegressor, output, outputModel, ...
    weightedOutputWorstLM, phiWorstLM, smoothness, optGrad, ...
    dataWeighting, LOOCV, optLOOCV, SiiAll,...
    kStepPrediction, xInputDelay, zInputDelay, xOutputDelay, zOutputDelay,...
    leafModels, localModels, worstLM)
% OBLIQUEGLOBALLOSSFUNCTION splits the worst LM of the current net with an oblique sigmoid,
% estimates the parameters of the two newly generated LMs, evaluates the resulting model
% and returns the residuals that the split optimization minimizes.
%
%
% [J, gradient, parameter, center, phi, outputModelNew, pseudoInv] = ...
%     obliqueGlobalLossFunction(wRed, w0, xRegressor, zRegressor, output, outputModel, ...
%     weightedOutputWorstLM, phiWorstLM, smoothness, optGrad, dataWeighting, ...
%     LOOCV, optLOOCV, SiiAll, kStepPrediction, ...
%     xInputDelay, zInputDelay, xOutputDelay, zOutputDelay, ...
%     leafModels, localModels, worstLM)
%
%
% INPUT
%
%   wRed:                   (nz x 1)   Weight vector for sigmoid direction.
%   w0:                     (1 x 1)    Sigmoid offset value. Kept constant during optimization.
%   xRegressor:             (N x nx)   Regression matrix for rule consequents (LM estimation).
%   zRegressor:             (N x nz)   Regression matrix for rule premises (splitting).
%   output:                 (N x q)    Measured training data output.
%   outputModel:            (N x q)    Current model output.
%   weightedOutputWorstLM:  (N x q)    Weighted model output of the current worst LM that is split.
%   phiWorstLM:             (N x 1)    Validity function values of the worst LM.
%   smoothness:             (1 x 1)    Value for interpolation smoothness.
%   optGrad:                (1 x 1)    Flag for the application of an analytical gradient, if true.
%   dataWeighting:          (N x 1)    Weighting of the data samples. Needed for LM estimation.
%   LOOCV:                             Not used inside this function.
%   optLOOCV:               (1 x 1)    If true, the parameters are estimated with the pseudo
%                                      inverse and J holds leave-one-out residuals.
%   SiiAll:                 (N x ?)    Only used if optLOOCV is true. Its row sums are added to
%                                      the diagonal smoothing-matrix entries of the two new LMs
%                                      (presumably the contributions of the remaining LMs -
%                                      confirm against the caller).
%   kStepPrediction:        (1 x 1)    If <= 1 the model output is updated incrementally,
%                                      otherwise the model is simulated in parallel.
%   xInputDelay, zInputDelay,
%   xOutputDelay, zOutputDelay:  cell  Delay definitions, forwarded to simulateParallelSub
%                                      (only used if kStepPrediction > 1).
%   leafModels:             (1 x M)    Logical flags of the active (leaf) LMs
%                                      (only used if kStepPrediction > 1).
%   localModels:            (1 x M)    Array of local model objects
%                                      (only used if kStepPrediction > 1).
%   worstLM:                (1 x 1)    Index of the worst LM.
%
% OUTPUT
%
%   J:                      (N x q)    Residuals after splitting: output - outputModelNew, or the
%                                      leave-one-out residuals if optLOOCV is true.
%   gradient:               (nz x 1)   Derivatives of the normalized root mean squared training
%                                      error with respect to wRed. Empty if optGrad is false.
%                                      Note that this is the gradient of a scalar loss, not the
%                                      Jacobian of the residual matrix J.
%   parameter:              cell(1,2)  New LM parameter values after splitting.
%   center:                 (2 x nz)   New LM center values after splitting.
%   phi:                    (N x 2)    Validity functions for the two newly generated LMs.
%   outputModelNew:         (N x q)    Model output including the two newly generated LMs.
%   pseudoInv:              cell(1,2)  Pseudo inverses of the weighted regression matrices
%                                      (filled only if optLOOCV is true, otherwise empty cells).
%
%
% HiLoMoT - Nonlinear System Identification Toolbox
% Benjamin Hartmann, 04-April-2012
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles

%% 1) Calculate the validity functions for the two newly generated LMs with splitting parameter w

% Parameter vector assembly, because only w0 is kept constant during optimization
w  = [wRed; w0];

% Get some constants
numberOfInputs      = size(zRegressor,2);
numberOfxRegressors = size(xRegressor,2);
[numberOfSamples, numberOfOutputs]= size(output);
zLowerBound         = min(zRegressor);
zUpperBound         = max(zRegressor);


%
% 1.1) Calculate centers with crisp validity functions
%

% Normalize and correct weight vector
deltaCenter = 0.01*(zUpperBound - zLowerBound);             % Choose a small value (sharp transition)
if any(deltaCenter<eps); deltaCenter=deltaCenter+eps; end   % Avoid a zero norm (division by zero below)
kappa       = 20/(norm(w)*norm(deltaCenter)*smoothness);    % Factor that normalizes the sigmoid parameters


% Splitting functions
psi      = 1./(1+exp( kappa * ( w0 + zRegressor*wRed )));
psiComp  = 1-psi;

% Validity functions
phi = bsxfun(@times, phiWorstLM, [psi psiComp]);
% bsxfun for better performance, equal to:
% phi      = zeros(numberOfSamples,2);
% phi(:,1) = phiWorstLM.*psi;
% phi(:,2) = phiWorstLM.*psiComp;


%
% 1.2) Calculate validity functions with correctly smoothed transitions
%

% Calculate centers from validity functions and data distribution (crisp transitions)
center = zeros(2,numberOfInputs);
for dim = 1:numberOfInputs
    center(1,dim) = zRegressor(:,dim)'*phi(:,1)/(sum(phi(:,1))+eps);
    center(2,dim) = zRegressor(:,dim)'*phi(:,2)/(sum(phi(:,2))+eps);
end

% Update phi with center info (correctly smoothed transition)
deltaCenter = center(1,:) - center(2,:);                    % Updated distance between LM-centers
% NOTE(review): deltaCenter is signed here, so this guard also fires for any negative
% component. Only the norm is used afterwards, hence the shift by eps is numerically
% negligible; the behavior is kept unchanged on purpose.
if any(deltaCenter<eps); deltaCenter=deltaCenter+eps; end
kappa       = 20/(norm(w)*norm(deltaCenter)*smoothness);    % Factor that normalizes the sigmoid parameters

% Splitting functions
psi      = 1./(1+exp( kappa * ( w0 + zRegressor*wRed )));
psiComp  = 1-psi;

% Validity functions
phi = bsxfun(@times, phiWorstLM, [psi psiComp]);



%% 2) Estimate parameters of the newly generated LMs

% Initialize cells for coefficient and pseudo inverse storage
parameter = cell(1,2);
pseudoInv = cell(1,2);

% Loop over the two newly generated LMs
for lm = 1:2
    
    % Define vectors/matrices for local estimation
    r = sqrt(phi(:,lm).*dataWeighting);             % Weighting vector for data
    rMat = r(:,ones(numberOfxRegressors,1));        % Generate an N x nx matrix [r r ... r]
    xRegressorWeighted = xRegressor.*rMat;          % Weighted regression matrix
    rMat = r(:,ones(numberOfOutputs,1));            % Generate an N x q matrix [r r ... r]
    outputWeighted = output.*rMat;                  % Weighted output matrix
    
    % Weighted LS estimation, generates an nx x q parameter matrix
    if ~optLOOCV
        parameter{1,lm} = xRegressorWeighted\outputWeighted; % using the Q-R-factorization for fast computation
    else
        % The pseudo inverse is kept, it is needed for the leave-one-out error in section 4
        pseudoInv{1,lm} = pinv(xRegressorWeighted);
        parameter{1,lm} = pseudoInv{1,lm}*outputWeighted;
    end
    
end



%% 3) Calculate weighted output for the two newly generated LMs

% Procedure explanation:
%
% Due to the hierarchical model structure, it is possible to subtract the
% weighted output of the worstLM from the overall model output in order to
% add the weighted local model outputs that result from splitting the worst
% LM. This approach accelerates the split optimization.
%
% outputModel    = sum_i( phi_i*yHat_i ) + phi_worstLM*yHat_worstLM
% outputModelNew = sum_i( phi_i*yHat_i ) +  phi_newLM1*yHat_newLM1   + phi_newLM2*yHat_newLM2
%
% Difference:
% outputModelNew - outputModel = { phi_newLM1*yHat_newLM1 + phi_newLM2*yHat_newLM2 } - phi_worstLM*yHat_worstLM
%
%                                                        ||                                       ||
% This leads to:
% outputModelNew - outputModel =                 weightedOutputNewLM                 -   weightedOutputWorstLM

if kStepPrediction <= 1
    weightedOutputNewLM = zeros(numberOfSamples,numberOfOutputs);
    if numberOfOutputs == 1
        weightedOutputNewLM = sum(phi.*(xRegressor * [parameter{:}]),2);
    else
        for out = 1:numberOfOutputs % Loop over all outputs
            para_out = cell2mat(cellfun(@(x) x(:,out), parameter, 'UniformOutput', false));
            weightedOutputNewLM(:,out) = sum(phi.*(xRegressor * para_out),2);
        end
    end
    
    % Update model output
    outputModelNew = outputModel - weightedOutputWorstLM + weightedOutputNewLM;
    
else
    % The incremental update is not used here. The two new LMs are appended to the
    % local copy of the model and the whole model is simulated in parallel.
    
    % Update active models: the worst LM is replaced by its two children
    leafModels(worstLM) = false;
    leafModels = [leafModels  true(1,2)];
    
    % add child no.1 (splitting parameters are normalized with kappa)
    localModels(end+1) = sigmoidLocalModel(worstLM,kappa*w,center(1,:),parameter{1});
    % get child idx
    childIdx = length(localModels);
    % add the child index to the parent model
    localModels(worstLM).children(end+1) = childIdx;
    % use the smoothness of the parent
    localModels(childIdx).localSmoothness = localModels(worstLM).localSmoothness;
    
    % add child no.2 (complementary sigmoid, therefore the negative sign)
    localModels(end+1) = sigmoidLocalModel(worstLM,-kappa*w,center(2,:),parameter{2});
    % get child idx
    childIdx = length(localModels);
    % add the child index to the parent model
    localModels(worstLM).children(end+1) = childIdx;
    % use the smoothness of the parent
    localModels(childIdx).localSmoothness = localModels(worstLM).localSmoothness;

    % for kStepPrediction > 1, e.g. dynamic models
    outputModelNew = simulateParallelSub(xRegressor,zRegressor,...
    numberOfOutputs, xInputDelay, zInputDelay, xOutputDelay, zOutputDelay,...
    leafModels, localModels);
end


%% 4) Calculate global loss function value

if ~optLOOCV  % Optimization wrt. training error (default)
    
    % The residuals are returned instead of a scalar loss value.
    J = output-outputModelNew;
    
else          % Optimization wrt. leave-one-out cross-validation error
    
    [N, nx] = size(xRegressor);
    q      = size(output,2);
    % Reserve the first two columns for the two newly generated LMs
    SiiAll = [zeros(N,2) SiiAll];
    for k = 1:2
        psInv = pseudoInv{1,k};
        r2    = phi(:,k).^(3/2);
        XW    = xRegressor.*r2(:,ones(1,nx));
        % Accumulates sum_m XW(n,m)*psInv(m,n), i.e. the diagonal of XW*psInv
        for m = 1:nx
            SiiAll(:,k) = SiiAll(:,k) + XW(:,m).*psInv(m,:)';
        end
    end
    Sii  = sum(SiiAll,2);
    LOOE = (output-outputModelNew)./(ones(N,q)-Sii(:,ones(1,q)));
    J = LOOE;
end



%% 5) Gradient calculation in order to speed up optimization (optional)

if optGrad
    
    % Support quantities of the normalized root mean squared error
    %   NRMSE = sqrt( sum(error2) / sum(outputDifference2) ).
    % They have to be computed here, because section 4 only returns the residuals.
    error1            = output-outputModelNew;
    error2            = error1.^2;                                             % N x q squared error matrix
    outputMean        = mean(output,1);
    outputDifference2 = (output-outputMean(ones(numberOfSamples,1),:)).^2;    % N x q squared deviation from the mean
    
    % NOTE(review): the expressions below appear to assume a single output (q = 1),
    % since each column of g holds one N x 1 vector.
    F1           = 1./(sqrt(sum(sum(error2,2))).*sqrt(sum(sum(outputDifference2,2))));       	   % Support variable F1
    kappa        = 20/(norm(w)*norm(deltaCenter)*smoothness);                                      % Important parameter for the derivation
    outputModel1 = xRegressor*parameter{1,1};                                                      % Output of the first new LM (global)
    outputModel2 = xRegressor*parameter{1,2};                                                      % Output of the second new LM (global)
    exponent     = exp(kappa*(w0 + zRegressor*wRed))./((1+ exp(kappa*(w0 + zRegressor*wRed))).^2); % e-function, used for calculating the sigmoid-/splitting-function
    F2           = -20/(smoothness*(norm(w)*norm(deltaCenter)).^2);
    
    % Initialize some variables
    g  = zeros(numberOfSamples,size(wRed,1));
    H1 = zeros(numberOfSamples,numberOfInputs);
    H2 = zeros(numberOfSamples,1);
    
    for k = 1:numberOfInputs % for-loop over all input-dimensions
        E  = 2*norm(w)/norm(deltaCenter).*deltaCenter(k).*1./numberOfSamples * zRegressor(:,k).*phiWorstLM.*exponent.*kappa;
        H1 = H1 + E(:,ones(numberOfInputs,1)').*zRegressor;
        H2 = H2 + 2*norm(w)/norm(deltaCenter).*deltaCenter(k).*1./numberOfSamples.* zRegressor(:,k).*phiWorstLM.*exponent.*(w0 + zRegressor*wRed);
    end
    
    H1 = sum(H1,1);
    H2 = sum(H2,1);
    
    for l = 1:size(wRed,1) % for-loop over vector w
        dPsi = - exponent.*(kappa.*zRegressor(:,l) + (wRed(l).*norm(deltaCenter)./norm(w) - H1(l))./(1/F2 + H2) .* (w0 + zRegressor*wRed));
        dQ = phiWorstLM.*dPsi;
        
        % Use Q-R-Factorization for better performance
        A = (xRegressor'.*psi(:,ones(size(xRegressor,2),1))')*xRegressor;
        a = (xRegressor'.*dQ(:,ones(size(xRegressor,2),1))')*(output-outputModel1);
        AQR = A\a;
        B = (xRegressor'.*psiComp(:,ones(size(xRegressor,2),1))')*xRegressor;
        b = (xRegressor'.*dQ(:,ones(size(xRegressor,2),1))')*(output-outputModel2);
        BQR = B\b;
        
        g(:,l) = - F1.*error1.*phiWorstLM.*( (outputModel1 - outputModel2).*dPsi + xRegressor*AQR.*psi - xRegressor*BQR.*psiComp );
    end
    
    gradient = sum(g,1)';
    
else
    
    gradient = [];
    
end
end

%% ------------------------------------------------------------------------
% Subfunction for parallel simulation
% -------------------------------------------------------------------------
function [outputModel,phi] = simulateParallelSub(xRegressor,zRegressor,...
    numberOfOutputs, xInputDelay, zInputDelay, xOutputDelay, zOutputDelay,...
    leafModels, localModels)
% SIMULATEPARALLELSUB  Sample-by-sample simulation of a local model network in
% which delayed model outputs are fed back into the regression matrices.
%
% [outputModel,phi] = simulateParallelSub(xRegressor,zRegressor,numberOfOutputs, ...
%     xInputDelay,zInputDelay,xOutputDelay,zOutputDelay,leafModels,localModels)
%
% Output:
%   outputModel:    (N x q)    Matrix of model outputs
%   phi:            (N x M)    Validity function values of the leaf models
%
% Input:
%   xRegressor:      Consequent regression matrix. Its output-regressor columns
%                    are overwritten with simulated outputs during the run.
%   zRegressor:      Premise regression matrix, treated the same way.
%   numberOfOutputs: Number of model outputs q.
%   xInputDelay, zInputDelay:    Cells whose entry lengths are summed to get
%                                the number of input regressor columns.
%   xOutputDelay, zOutputDelay:  Cells with the output delays, one per output.
%   leafModels:      Logical flags selecting the leaf models in localModels.
%   localModels:     Array of local models.

% LMNtool - Local Model Network Toolbox
% Tobias Ebert, 24-April-2012
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles

nSamples = size(xRegressor,1);

% First column that holds a fed-back output.
% x: the offset is the first regressor, so the output regressors start after
%    the delayed inputs plus the offset column (hence +2).
% z: the output regressors follow directly after the delayed inputs (+1).
xFeedbackStart = sum(cellfun(@length,xInputDelay)) + 2;
zFeedbackStart = sum(cellfun(@length,zInputDelay)) + 1;

% Allocate the results
outputModel = zeros(nSamples,numberOfOutputs);
phi = zeros(nSamples,sum(leafModels));

% Collect the parameters of all leaf models once, up front
leafParameter = arrayfun(@(cobj) cobj.parameter,localModels(leafModels),'UniformOutput',false);

for t = 1:nSamples
    
    % --- feed simulated outputs back into row t of the x regressor ---
    col = xFeedbackStart;
    for iOut = 1:numberOfOutputs
        lags = xOutputDelay{iOut};
        for iLag = 1:length(lags)
            src = t - lags(iLag);
            if src > 0
                % only samples that were already simulated can be fed back
                xRegressor(t,col) = outputModel(src,iOut);
            end
            col = col + 1;
        end
    end
    
    % --- same for row t of the z regressor ---
    col = zFeedbackStart;
    for iOut = 1:numberOfOutputs
        lags = zOutputDelay{iOut};
        for iLag = 1:length(lags)
            src = t - lags(iLag);
            if src > 0
                zRegressor(t,col) = outputModel(src,iOut);
            end
            col = col + 1;
        end
    end
    
    % Validity values of the leaf models for this single sample
    rowPhi = calculatePhiSub(localModels,zRegressor(t,:),leafModels);
    phi(t,:) = rowPhi;
    
    % Model output for this sample
    outputModel(t,:) = calcYhatSub(xRegressor(t,:),rowPhi,leafParameter);
    
end

end

%% ------------------------------------------------------------------------
% Subfunction for validity calculation
% -------------------------------------------------------------------------

function [phi,phiMemory] = calculatePhiSub(localModels,zRegressor,idx,phiMemory)
% CALCULATEPHISUB calculates the validity function values (phi) of the
% selected local models for ONE sample of the z regressor.
%
% [phi,phiMemory] = calculatePhiSub(localModels,zRegressor,idx,phiMemory)
%
% Input:
%   localModels:  Array of local models. The first entry is the root; all
%                 further entries must provide splittingParameter (column
%                 vector [direction; offset]), localSmoothness and parent.
%   zRegressor:   (1 x nz) One row of the premise regression matrix.
%   idx:          Index (logical or numeric) of the local models whose
%                 validity values are returned.
%   phiMemory:    Optional. Not used by this implementation; it is passed
%                 through unchanged (empty if it was not supplied).
%
% Output:
%   phi:          (1 x numel(selected)) Validity values of the selected models.
%   phiMemory:    The phiMemory input, or [] if none was given.

% LMNtool - Local Model Network Toolbox
% Tobias Ebert, 24-April-2012
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles

% Make sure the second output is always defined, even if the caller did
% not hand over a memory.
if nargin < 4
    phiMemory = [];
end

if numel(localModels) < 2
    % Only the root exists, there are no splits to evaluate
    psi = 1;
else
    % get all splitting parameters (one column per non-root local model)
    splittingPara = [localModels(2:end).splittingParameter];
    smoothness = [localModels(2:end).localSmoothness];
    
    % vector of psi values, including root to simplify indexing
    psi = [1 1./(1+exp( ...
        (1./smoothness) .* (zRegressor*splittingPara(1:end-1,:) + splittingPara(end,:))...
        ))];
end

% Parents are processed before their children (smaller index), so psi of
% the parent already holds the complete product along the path to the root.
for childIdx = (1:length(psi))
   if localModels(childIdx).parent > 1
       % multiply with parent psi if necessary
       psi(childIdx) = psi(childIdx) * psi(localModels(childIdx).parent);
   end
end

phi = psi(idx);

end

function [phi,phiMemory] = calculatePhiSubOld(localModels,zRegressor,idx,phiMemory)
% CALCULATEPHISUBOLD calculates the validity function values (phi) of every z
% regressor sample for the selected local models (previous, recursive
% implementation with memory).
%
% [phi,phiMemory] = calculatePhiSubOld(localModels,zRegressor,idx,phiMemory)
%
% Input:
%   localModels:  Array of local models. Each entry must provide parent and a
%                 callable calculatePsi(zRegressor) that returns one value per
%                 row of zRegressor.
%   zRegressor:   (N x nz) Premise regression matrix.
%   idx:          Logical or numeric index of the local models to evaluate.
%   phiMemory:    Optional. Validity values that were calculated before,
%                 either as cell (one entry per local model) or as matrix
%                 (one column per local model).
%
% Output:
%   phi:          (N x numel(idx)) Validity values of the selected models.
%   phiMemory:    Cell with all validity values known after this call.

% LMNtool - Local Model Network Toolbox
% Tobias Ebert, 24-April-2012
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles

% prevent errors: always work with a cell memory
if nargin < 4 || isempty(phiMemory)
    phiMemory = {};
elseif ~iscell(phiMemory)
    % convert to cells, one cell per column
    phiMemory = mat2cell(phiMemory,size(phiMemory,1),ones(1,size(phiMemory,2)));
end

if islogical(idx)
    % convert to non-logical index, makes things easier
    idx = find(idx);
end

% no error message for an empty splittingParameter here, the check is too
% time consuming

% loop over all idx
phi = zeros(size(zRegressor,1),length(idx));
for k = 1:length(idx)
    
    % the root itself is never requested, so it needs no special treatment
    
    if size(phiMemory,2)<idx(k) || isempty(phiMemory{1,idx(k)}) % test if phi must be calculated
        
        % phi has not been calculated already, calculate now
        phi(:,k) = localModels(idx(k)).calculatePsi(zRegressor);
        % if there is a parent AND the parent is NOT the root, then multiply with it
        if ~isempty(localModels(idx(k)).parent) && ~isempty(localModels(localModels(idx(k)).parent).parent)
            % calculate phi of the parent with THIS implementation, so that
            % multi-sample regressors and the memory are handled consistently
            [rootPhi,phiMemory] = calculatePhiSubOld(localModels,zRegressor,localModels(idx(k)).parent,phiMemory);
            % multiply with parent to get phi of idx(k)
            phi(:,k) = phi(:,k) .* rootPhi;
        end    
        
        % write phi to memory, it may be used later
        % NOTE(review): the sparse storage is only chosen if ALL values are
        % zero (same as the former "any(phi(:,k))==0"); the former comment
        % suggested "if any value is zero" - confirm the intent.
        if ~any(phi(:,k))
            phiMemory{1,idx(k)} = sparse(phi(:,k));
        else
            phiMemory{1,idx(k)} = phi(:,k);
        end
        
    else
        
        % phi has been calculated, read from memory
        phi(:,k) = phiMemory{1,idx(k)};
        
    end
    
end

end

%% ------------------------------------------------------------------------
% Subfunction for output calculation
% -------------------------------------------------------------------------
function outputModel = calcYhatSub(xRegressor,validityFunctionValue,parameter)
% CALCYHATSUB evaluates the model output as the validity-weighted sum of all
% local model outputs for a given consequent regression matrix.
%
% outputModel = calcYhatSub(xRegressor,validityFunctionValue,parameter)
%
%
% OUTPUTS:
%
%   outputModel:        (N x q)     Matrix of model outputs
%
%
% INPUTS:
%
% xRegressor:           (N x nx)    Consequent regression matrix
%
% validityFunctionValue (N x M) matrix or {1 x M} cell array with the
%                                   normalized validity function values of
%                                   each local model
%
% parameter:            {1 x M}     cell array containing the (nx x q)
%                                   parameters of each local model
%
%
% SYMBOLS AND ABBREVIATIONS:
%
% LM:  Local model
%
% q:   Number of outputs
% N:   Number of data samples
% M:   Number of LMs
% nx:  Number of regressors (x)
%
%
% LMNtool - Local Model Network Toolbox
% Tobias Ebert, 16-November-2011
% Institute of Mechanics & Automatic Control, University of Siegen, Germany
% Copyright (c) 2012 by Prof. Dr.-Ing. Oliver Nelles

% Problem dimensions
nSamples = size(xRegressor, 1);
nOut     = size(parameter{1},2);

% The validity values may arrive as a cell, work with a plain matrix
if iscell(validityFunctionValue)
    validityFunctionValue = cell2mat(validityFunctionValue);
end

if nOut ~= 1
    
    % General case: treat one output after the other
    outputModel = zeros(nSamples,nOut);
    for iOut = 1:nOut
        pickColumn      = @(x) x(:,iOut);                                  % parameters of output iOut
        thetaOut        = cell2mat(cellfun(pickColumn,parameter,'UniformOutput',false));
        localPrediction = xRegressor * thetaOut;                           % one column per LM
        outputModel(:,iOut) = sum(validityFunctionValue.*localPrediction,2);
    end
    return
    
end

% Single output: all LM parameters can be stacked side by side at once,
% which is faster than the loop above
localPrediction = xRegressor * [parameter{:}];
outputModel     = sum(validityFunctionValue.*localPrediction,2);

end
