cosmo normalize

function [ds, params] = cosmo_normalize(ds, params, dim)
    % normalize dataset either by estimating or applying estimated parameters
    %
    % [ds, est_params]=cosmo_normalize(ds, norm_type[, dim])
    %
    % Inputs
    %   ds            a dataset struct with field .samples of size PxQ, or a
    %                 numeric array of that size
    %   params        either the type of normalization:
    %                   - 'demean'     (mean of zero)
    %                   - 'zscore'     (demean and unit variance)
    %                   - 'scale_unit' (scale to interval [-1,1])
    %                 -or-
    %                 previously estimated parameters using the 'params'
    %                 output result from a previous call to this function.
    %   dim           1 or 2, indicating along with dimension of ds to
    %                 normalize (if params it not a string and ends with '1' or
    %                 2').
    %
    % Output
    %   ds_norm       a dataset struct similar to ds, but with .samples data
    %                 normalized. If the input was a numeric array then ds_norm
    %                 is a numeric array as well.
    %   params        estimated parameters for normalization. These can be
    %                 reused for a second normalization step of an independent
    %                 dataset. For example, parameters can be estimated from a
    %                 training dataset and then applied to a testing dataset
    %
    %
    % Examples:
    %     ds=struct();
    %     ds.samples=reshape(1:15,5,3)*2;
    %     cosmo_disp(ds);
    %     %|| .samples
    %     %||   [  2        12        22
    %     %||      4        14        24
    %     %||      6        16        26
    %     %||      8        18        28
    %     %||     10        20        30 ]
    %     %
    %     % demean along first dimension
    %     dsn=cosmo_normalize(ds,'demean',1);
    %     cosmo_disp(dsn);
    %     %|| .samples
    %     %||   [ -4        -4        -4
    %     %||     -2        -2        -2
    %     %||      0         0         0
    %     %||      2         2         2
    %     %||      4         4         4 ]
    %     %
    %     % demean along second dimension
    %     dsn=cosmo_normalize(ds,'demean',2);
    %     cosmo_disp(dsn);
    %     %|| .samples
    %     %||   [ -10         0        10
    %     %||     -10         0        10
    %     %||     -10         0        10
    %     %||     -10         0        10
    %     %||     -10         0        10 ]
    %     %
    %     % scale to range [-1,1] along first dimension
    %     dsn=cosmo_normalize(ds,'scale_unit',1);
    %     cosmo_disp(dsn);
    %     %|| .samples
    %     %||   [   -1        -1        -1
    %     %||     -0.5      -0.5      -0.5
    %     %||        0         0         0
    %     %||      0.5       0.5       0.5
    %     %||        1         1         1 ]
    %     %
    %     % z-score along first dimension
    %     dsn=cosmo_normalize(ds,'zscore',1);
    %     cosmo_disp(dsn);
    %     %|| .samples
    %     %||   [  -1.26     -1.26     -1.26
    %     %||     -0.632    -0.632    -0.632
    %     %||          0         0         0
    %     %||      0.632     0.632     0.632
    %     %||       1.26      1.26      1.26 ]
    %     %
    %     % z-score along second dimension
    %     dsn=cosmo_normalize(ds,'zscore',2);
    %     cosmo_disp(dsn);
    %     %|| .samples
    %     %||   [ -1         0         1
    %     %||     -1         0         1
    %     %||     -1         0         1
    %     %||     -1         0         1
    %     %||     -1         0         1 ]
    %     %
    %     % use samples 1, 3, and 4 to estimate parameters ('training set'),
    %     % and apply these to samples 2 and 5
    %     ds_train=cosmo_slice(ds,[1 3 4]);
    %     ds_test=cosmo_slice(ds,[2 5]);
    %     [dsn_train,params]=cosmo_normalize(ds_train,'scale_unit', 1);
    %     cosmo_disp(dsn_train);
    %     %|| .samples
    %     %||   [    -1        -1        -1
    %     %||     0.333     0.333     0.333
    %     %||         1         1         1 ]
    %     %
    %     % show estimated parameters (min and max for each column, in this
    %     % case)
    %     cosmo_disp(params);
    %     %|| .method
    %     %||   'scale_unit'
    %     %|| .dim
    %     %||   [ 1 ]
    %     %|| .min
    %     %||   [ 2        12        22 ]
    %     %|| .max
    %     %||   [ 8        18        28 ]
    %     %
    %     % apply parameters to test dataset
    %     dsn_test=cosmo_normalize(ds_test,params);
    %     cosmo_disp(dsn_test);
    %     %|| .samples
    %     %||   [ -0.333    -0.333    -0.333
    %     %||       1.67      1.67      1.67 ]
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    if isempty(params) || (ischar(params) && strcmp(params, 'none'))
        return
    end

    if nargin < 3
        dim = [];
    end

    apply_params = isstruct(params);
    if apply_params
        if ~isempty(dim) && params.dim ~= dim
            error('Dim specified as %d, but estimates used %d', dim, params.dim);
        end
    elseif ischar(params)
        params = get_params_from_string(params, dim);
    else
        error('norm_spec must be struct or string');
    end

    method = params.method;
    dim = params.dim;

    is_ds = isstruct(ds) && isfield(ds, 'samples');

    if is_ds
        samples = ds.samples;
    else
        samples = ds;
    end

    switch method
        case 'demean'
            if apply_params
                mu = params.mu;
            else
                mu = mean(samples, dim);
                params.mu = mu;
            end
            samples = bsxfun(@minus, samples, mu);

        case 'zscore'
            if apply_params
                mu = params.mu;
                sigma = params.sigma;
            else
                n = size(samples, dim);
                mu = sum(samples, dim) / n;
                params.mu = mu;
            end

            d = bsxfun(@minus, samples, mu);

            if ~apply_params
                sigma = sqrt(sum(d.^2, dim) / (n - 1));
                params.sigma = sigma;
            end

            if isempty(sigma)
                % Octave 3.8.2 on Ubuntu gives it the wrong size
                sigma = zeros(size(d));
            end

            % In Octave, std can give non-NaN even with NaN input
            nan_mask = any(isnan(samples), dim);
            sigma(nan_mask) = NaN;

            samples = bsxfun(@rdivide, d, sigma);

        case 'scale_unit'
            if apply_params
                min_ = params.min;
                max_ = params.max;
            else
                min_ = min(samples, [], dim);
                max_ = max(samples, [], dim);
                params.min = min_;
                params.max = max_;

            end
            samples = bsxfun(@times, 1 ./ (max_ - min_), ...
                             bsxfun(@minus, samples, min_)) * 2 - 1;

        otherwise
            error('Unsupported normalization: %s', method);

    end

    nan_msk = ~isfinite(samples);
    if any(nan_msk(:))
        cosmo_warning('%d samples are NaN or Inf after normalization', ...
                      sum(nan_msk(:)));
    end

    if is_ds
        ds.samples = samples;
    else
        ds = samples;
    end

function params = get_params_from_string(norm_method, dim_arg)
    persistent cached_params
    persistent cached_norm_method
    persistent cached_dim_arg

    if ~(isequal(norm_method, cached_norm_method) && ...
         isequal(dim_arg, cached_dim_arg))

        assert(ischar(norm_method));
        assert(numel(norm_method) > 0);

        dim = [];
        method = norm_method;

        if isempty(dim_arg)
            if isempty(dim)
                dim = 1;
            end
        else
            if isempty(dim)
                dim = dim_arg;
            else
                error(['Cannot have third argument when second argument '...
                       'ends with a number']);
            end
        end

        cached_params = struct();
        cached_params.method = method;
        cached_params.dim = dim;

        cached_norm_method = norm_method;
        cached_dim_arg = dim_arg;
    end

    params = cached_params;