function [model] = classf_lr_fw_tr(X,Y,param)
% Feature-weighted regularized logistic regression. You can assign small
% penalty weights to features that are known beforehand to be important.
% Uses one-vs-all for multi-class problems. Y should start from 1.
% X is nSmp-by-nFt and Y is nSmp-by-1.
% PARAM is a struct with possible fields: ftPenal, lambda, and nIter.
% They are parameters for the algorithm; their meanings are explained in the
% default parameters below.

% Example:
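% A minimal usage sketch (Xtr, Ytr, Xte, and myPenal are placeholder
% variables; parameter values are illustrative):
%   model = classf_lr_fw_tr(Xtr, Ytr, struct('ftPenal',myPenal, 'lambda',.1));
%   % multi-class prediction: pick the class with the highest linear score
%   [~, pred] = max([ones(size(Xte,1),1), Xte] * model.thetas, [], 2);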

% Please refer to Ke Yan et al., Improving the transfer ability of
% prediction models for electronic noses, Sensors and Actuators B: Chemical, 2015.
% Copyright 2015 YAN Ke, Tsinghua Univ. http://yanke23.tk, xjed09@gmail.com


[nSmp,nFt] = size(X);

% default parameters
ftPenal = ones(1,nFt); % penalization weight of each feature. If ftPenal(i)
% is large, the i'th feature will be relied on less in the model
lambda = 0; % regularization parameter to all features
nIter = 50; % number of iterations. Should be larger if nSmp or nFt is large

defParam % toolbox script, assumed to overwrite the defaults above with the fields of param

X = [ones(nSmp,1),X]; % add constant column
nCls = max(Y);
initTheta = zeros(nFt+1,1);
options = optimset('GradObj','on','MaxIter',nIter);
fmin = @fmincg; % fmincg: external conjugate-gradient minimizer, must be on the path; fminunc is very slow

if nCls == 2 % binary case: fit only one theta, which is faster
	f = @(t)lrCostFunction(t,X,(Y==1),lambda*ftPenal); % the 3rd arg must be a 0/1 vector
	thetas = fmin(f,initTheta,options);
else
	thetas = zeros(nFt+1,nCls); % one vs all
	for p = 1:nCls
		f = @(t)lrCostFunction(t,X,(Y==p),lambda*ftPenal); % the 3rd arg must be a 0/1 vector;
		% lambda*ftPenal keeps the per-feature weighting consistent with the binary case
		thetas(:,p) = fmin(f,initTheta,options);
	end
end

model.thetas = thetas;

end

function [J, grad] = lrCostFunction(theta, X, Y, reguTerm)
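% Regularized logistic cost with per-feature penalty weights reguTerm
% (1-by-nFt). With m = length(Y), h = sigmoid(X*theta), and w = reguTerm:
%   J = -1/m * sum( Y.*log(h) + (1-Y).*log(1-h) ) + 1/(2m) * sum( w .* theta(2:end)'.^2 )
% The bias term theta(1) is left unpenalized.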

smpNum = length(Y);
hyp = sigmoid(X*theta); % hypothesis
penalTerm = sum(theta(2:end).^2 .* reguTerm')/2/smpNum;
J = -sum(Y.*log(hyp)+(1-Y).*log(1-hyp)) / smpNum + penalTerm;
penalTermG = theta.*[0;reguTerm']/smpNum; % penalty for theta(1) is 0
grad = X'*(hyp-Y)/smpNum + penalTermG;

end
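
function g = sigmoid(z)
% SIGMOID is assumed to be provided elsewhere in the original toolbox; a
% minimal local definition is included here so this file runs stand-alone.
g = 1 ./ (1 + exp(-z));
end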