cosmo pca

function [pca_samples, params] = cosmo_pca(samples, retain)
    % Principal Component Analysis
    %
    % [pca_samples,params]=cosmo_pca(samples[,retain])
    %
    % Input:
    %   samples                 M x N  numeric matrix
    %   retain                  (optional) number of components to retain;
    %                           must be less than or equal to N. Default: N
    %
    % Output:
    %   pca_samples             M x retain samples in Principal Component
    %                           space, after samples have been centered
    %   params                  struct with fields:
    %     .coef                 M x retain Principal Component coefficients
    %     .mu                   M x 1 column-wise average of samples
    %                           It holds that:
    %                             samples=bsxfun(@plus,params.mu,...
    %                                                 pca_samples*params.coef')
    %     .explained            1 x N Percentage of explained variance
    %
    %
    % Examples:
    %     samples=[      2.0317   -0.8918   -0.8258;...
    %                    0.5838    1.8439    1.1656;...
    %                   -1.4437   -0.2617   -1.9207;...
    %                   -0.5177    2.3387    0.4412;...
    %                    1.1908   -0.2040   -0.2088;...
    %                   -1.3265    2.7235    0.1476];
    %     %
    %     % apply PCA, keeping two dimensions
    %     [pca_samples,params]=cosmo_pca(samples,2);
    %     %
    %     % show samples in PC space
    %     cosmo_disp(pca_samples);
    %     %|| [  -2.64     0.654
    %     %||    0.923      1.43
    %     %||   -0.723     -2.48
    %     %||     1.64     0.265
    %     %||    -1.46     0.569
    %     %||     2.27    -0.438 ]
    %     %
    %     % show parameters
    %     cosmo_disp(params);
    %     %|| .coef
    %     %||   [ -0.512     0.744
    %     %||      0.794     0.219
    %     %||      0.328     0.632 ]
    %     %|| .mu
    %     %||   [ 0.0864     0.925      -0.2 ]
    %     %|| .explained
    %     %||   [    66
    %     %||      33.3
    %     %||     0.676 ]
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    if nargin < 2
        retain = [];
    end

    verify_parameters(samples, retain);
    ndim = get_number_of_components(samples, retain);

    % subtract mean
    mu = mean(samples, 1);
    samples_demu = bsxfun(@minus, samples, mu);

    % singular value decomposition
    [u, s, w] = svd(samples_demu, 'econ');

    % extract eigen values
    [nrow, ncol] = size(samples);
    samples_is_vector = nrow == 1 || ncol == 1;
    if samples_is_vector
        % single eigen value
        eigvals = s(1);
    else
        % take diagonal
        eigvals = diag(s);
    end

    if ndim == 0
        % separate case for zero dimensions
        pca_samples = zeros(1, 0);
        coef = zeros(ncol, 0);
    else
        pca_samples_rand_sign = bsxfun(@times, u(:, 1:ndim), eigvals(1:ndim)');
        [coef, sgn] = max_abs_positive_columnwise(w(:, 1:ndim));
        pca_samples = bsxfun(@times, pca_samples_rand_sign, sgn);
    end

    % store coefficients
    params = struct();
    params.coef = coef;
    params.mu = mu;

    nexpl = min([nrow - 1, ncol]);
    if nrow == 1 || ncol == 0
        % special case for empty vecto with explained variance
        params.explained = zeros(1, 0);
    else
        explained_ratio = eigvals'.^2;
        params.explained = 100 * explained_ratio(1:nexpl) / sum(explained_ratio);
    end

function ncomp = get_number_of_components(samples, retain)
    max_retain = size(samples, 2);

    if isempty(retain)
        retain = max_retain;
    end

    if retain > max_retain
        error('retain argument %d must be less than %d', ...
              retain, max_retain);
    end

    [nrow, ncol] = size(samples);
    ncomp = min([nrow - 1, ncol, retain]);

function [coef_pos, sgn] = max_abs_positive_columnwise(coef)
    % swap sign for each column in which the maximum absolute value is
    % negative. sgn contains the sign used in each column (-1 or 1)
    [unused, i] = max(abs(coef), [], 1);
    [nrows, ncols] = size(coef);
    mx_idx = (0:(ncols - 1)) * nrows + i;
    mx = coef(mx_idx);

    sgn = (mx > 0) * 2 - 1;
    coef_pos = bsxfun(@times, coef, sgn);

function verify_parameters(samples, retain)
    if ~isnumeric(samples)
        error('samples argument must be numeric');
    end

    if numel(size(samples)) > 2
        error('samples argument must be a matrix');
    end

    if ~(isempty(retain) || ...
         (isscalar(retain) && ...
          retain > 0 && ...
          isequal(round(retain), retain)))
        error('retain argument must be positive integer');
    end