cosmo independent samples partitioner skl

function partitions = cosmo_independent_samples_partitioner(ds, varargin)
    % Compute partitioning scheme based on dataset with independent samples
    %
    % partitions=cosmo_independent_samples_partitioner(ds,...)
    %
    % Inputs:
    %   ds                          dataset structure with fields .samples,
    %                               .sa.targets and .sa.chunks. Since this
    %                               function is intended for the case that all
    %                               rows (patterns) in .samples are
    %                               independent, it is required that all values
    %                               in .sa.chunks are unique.
    %   'fold_count',c              Return partitions with c folds.
    %   'test_count', tc            } Return partitions so that in each test
    %   'test_ratio', tr            } set there are tc samples (or (tr*100%)
    %                               } per unique target. These two options are
    %                               } mutually exclusive
    %   'seed',s                    (optional) use seed s for pseudo-random
    %                               number generator (default: s=1). If
    %                               provided, then this function behaves
    %                               pseudo-ranomly but deterministically, and
    %                               different calls return the same output.
    %                               If s=0, then repeated calls to this
    %                               function gives different outputs.
    %   'max_fold_count'            Safety limit to the maximum number of folds
    %                               that can be returned (default: 10000). When
    %                               this number is set to a larger value, this
    %                               may result in too much memory being
    %                               required and slowing down or crashing the
    %                               machine.
    % Output:
    %   partitions                  Cell with fields .train_indices and
    %                               .test_indices, both of size c x 1.
    %                               Each element in .test_indices has tc (when
    %                               using 'test_count') or tr * min_count
    %                               (when using 'test_ratio'; min_count is the
    %                               minimum number of samples over classes)
    %                               elements; each element in .train_indices
    %                               has min_count-tc elements.
    %                               In other words, the resulting partitions
    %                               are balanced for both training and test
    %                               set.
    %
    % Examples:
    %     % make simple dataset with 9 samples, 3 features
    %     ds=struct();
    %     ds.samples=randn(9,3);
    %     ds.sa.targets=[1 1 1 1 1 2 2 2 2]';
    %     ds.sa.chunks=(1:9)';
    %     %
    %     % Partition scheme with 5 folds, each in which 1 target in each chunk
    %     % is used for testing
    %     partitions=cosmo_independent_samples_partitioner(ds,...
    %                                             'test_count',1,...
    %                                             'fold_count',5);
    %     cosmo_disp(partitions)
    %     %|| .test_indices
    %     %||   { [ 2    [ 4    [ 2    [ 5    [ 1
    %     %||       6 ]    8 ]    7 ]    8 ]    7 ] }
    %     %|| .train_indices
    %     %||   { [ 1    [ 1    [ 1    [ 1    [ 3
    %     %||       3      3      3      2      4
    %     %||       5      5      5      4      5
    %     %||       7      6      6      6      6
    %     %||       8      7      8      7      8
    %     %||       9 ]    9 ]    9 ]    9 ]    9 ] }
    %     %
    %     % As above, but now with 2 targets in each chunk used for testing
    %     partitions=cosmo_independent_samples_partitioner(ds,...
    %                                             'test_count',2,...
    %                                             'fold_count',5);
    %     cosmo_disp(partitions)
    %     %|| .test_indices
    %     %||   { [ 1    [ 4    [ 2    [ 1    [ 1
    %     %||       2      5      3      5      5
    %     %||       6      7      6      6      6
    %     %||       7 ]    8 ]    7 ]    8 ]    7 ] }
    %     %|| .train_indices
    %     %||   { [ 3    [ 1    [ 1    [ 2    [ 3
    %     %||       5      3      5      4      4
    %     %||       8      6      8      7      8
    %     %||       9 ]    9 ]    9 ]    9 ]    9 ] }
    %     %
    %     % Now use 30% of the targets in each chunk for testing,
    %     % and return 20 chunks.
    %     partitions=cosmo_independent_samples_partitioner(ds,...
    %                                             'test_ratio',0.3,...
    %                                             'fold_count',20);
    %     cosmo_disp(partitions)
    %     %|| .test_indices
    %     %||   { [ 3    [ 4    [ 1   ... [ 5    [ 1    [ 4
    %     %||       6 ]    7 ]    8 ]       6 ]    9 ]    7 ]   }@1x20
    %     %|| .train_indices
    %     %||   { [ 1    [ 2    [ 2   ... [ 1    [ 3    [ 1
    %     %||       2      3      4         2      4      2
    %     %||       4      5      5         3      5      5
    %     %||       7      6      6         7      6      6
    %     %||       8      8      7         8      7      8
    %     %||       9 ]    9 ]    9 ]       9 ]    8 ]    9 ]   }@1x20
    %
    % Notes:
    % - Unless the number of targets and chunks is very small, the number of
    %   partitions returned by this function (=c) is less than the total number
    %   of possible partitions. In these cases, a random subset of possible
    %   partitions is chosen, with the constraint that no combination of train
    %   and test indices is repeated in partitions. No attempt is made to
    %   balance the number of times each sample is used for training and/or
    %   testing.
    % - This function behaves, by default, pseudo-randomly and
    %   deterministically; different calls to this function, with the same
    %   inputs, result in the same output. To get different outputs for
    %   different calls, set the 'seed' option to 0.
    %
    % See also: cosmo_nfold_partitions, cosmo_nchoosek_partitioner
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    % process input
    defaults = struct();
    defaults.max_fold_count = 10000;
    defaults.seed = 1;

    opt = cosmo_structjoin(defaults, varargin);
    check_input(ds, opt);

    % see which classes there are
    [class_idx, classes] = cosmo_index_unique(ds.sa.targets);
    class_counts = cellfun(@numel, class_idx);

    % see how many possible unique folds there are
    test_count = get_test_count(class_counts, classes, opt);
    train_count = min(class_counts) - test_count;

    test_combi_count = get_unique_fold_count(class_counts, ...
                                             test_count);
    train_combi_count = get_unique_fold_count(class_counts - test_count, ...
                                              train_count);

    unique_fold_count = test_combi_count * train_combi_count;

    fold_count = get_fold_count(unique_fold_count, opt);

    % Use case 1: single-subject analysis with single trial MEEG data,
    % hundreds of trials; require crossvalidation scheme with 10 or so
    % folds.
    % The total number of possible folds is 'large', generating all
    % possible folds would require too much memory. Instead we generate
    % folds randomly. Since it is almost always the case that there are
    % some duplicates, these have to be removed. We keep on generating new
    % random folds - and remove duplicates - until we have generated
    % the target count, opt.fold_count.
    %
    % Use case 2: between-subject analysis, trying to discriminate between
    % participants groups (such as patients versus healthy controls).
    % In this case, use a crossvalidation scheme with
    % take-one-participant-per-group-for-testing (i.e. test_count=1), and
    % the total number of folds may not be so large. For example, with 2
    % groups, N=20 in each group, there are F=N*(N-1)/2=190 possible folds.
    % If the target fold count is near this number F, then randomly
    % generating folds until enough folds are generated will take a long
    % time, because many folds will be duplicates of existing ones. In such
    % a scenario it is more efficient to generate all possible folds first,
    % then select the required number of folds randomly

    fold_enumerate_ratio = .1;

    if fold_count > fold_enumerate_ratio * unique_fold_count
        func = @enumerated_partitions;
    else
        func = @sampled_partitions;
    end

    if isfield(opt, 'seed')
        seed = opt.seed;
    else
        seed = 0;
    end

    partitions = func(class_idx, test_count, fold_count, seed);

function fold_count = get_fold_count(unique_fold_count, opt)
    if ~isfield(opt, 'fold_count')
        error(['the option ''fold_count'', indicating how many '...
               'cross-validation folds this function should '...
               'generate, is required. It should have a value '...
               'between 1 and %d.\n'...
               'If in doubt, a value between 10 '...
               'and 1000 may be quite adequate for most use cases; '...
               'with the lower value more appropriate for cases of '...
               'within-subject analysis with many trials in '...
               'different conditions of interest (e.g. different '...
               'stimuli, or seen versus unseen stimulus), and the '...
               'upper value more appropriate for classification ' ...
               'of participants in different groups (such as '...
               'patient versus control).'], ...
              min(unique_fold_count, opt.max_fold_count));
    end

    fold_count = opt.fold_count;

    if fold_count > opt.max_fold_count
        error(['The number of requested folds, fold_count=%d, '...
               'exceeds the safety limit max_fold_count=%d. '...
               'You can either reduce fold_count, or '...
               'increase max_fold_count. In the latter case, note '...
               'that higher values of max_fold_count may result in '...
               'significant processor and memory usage; for very '...
               'large values, it may result in the computer becoming '...
               'unresponsive or crashing'], ...
              fold_count, opt.max_fold_count);
    end

    if fold_count > unique_fold_count
        error(['Cannot generate fold_count=%d folds as there are only '...
               '%d possible folds'], ...
              fold_count, unique_fold_count);
    end

function count = get_unique_fold_count(test_class_counts, test_count)
    % see how often each class occurs
    class_counts = arrayfun(@(c)nchoosek_lower_limit(c, test_count), ...
                            test_class_counts);
    count = prod(class_counts);

function c = nchoosek_lower_limit(n, k)
    % for many combinations return a lower limit of nchoosek. This avoids a
    % warning by nchoosek if n and k are large, and still properly
    % deals with cases of small n and k.
    if n - k > 10 && k > 10
        % it holds that nchoosek(n,k)<1e5 if n-k>10 and k>10
        c = 1e5;
    else
        c = nchoosek(n, k);
    end

function test_count = get_test_count(class_counts, classes, opt)
    % get number of samples in each class for testing.

    [min_class_count, min_idx] = min(class_counts);

    if isfield(opt, 'test_count')
        test_count = opt.test_count;
    else
        assert(isfield(opt, 'test_ratio'));
        test_ratio = opt.test_ratio;
        test_count = round(test_ratio * min_class_count);
    end

    if test_count < 1 || test_count >= min_class_count
        error(['here are not enough '...
               'samples in class %d to have at least %d '...
               'samples in the train and test set'], ...
              classes(min_idx), test_count);
    end

function partitions = sampled_partitions(class_idx, test_count, ...
                                         fold_count, seed)

    class_counts = cellfun(@numel, class_idx);
    safety_max_repeats_limit = 100;

    folds_cell = cell(safety_max_repeats_limit, 1);

    iter_seeds = ceil(cosmo_rand(1, safety_max_repeats_limit, ...
                                 'seed', seed) * 1e8);
    has_enough_folds = false;
    for i_gen = 1:safety_max_repeats_limit
        iter_seed = iter_seeds(i_gen);
        folds_cell{i_gen} = generate_random_folds(fold_count, ...
                                                  class_counts, test_count, iter_seed);
        folds_so_far = cat(3, folds_cell{1:i_gen});
        folds_no_dupl = remove_duplicate_fold_indices(folds_so_far);
        n_folds = size(folds_no_dupl, 3);
        has_enough_folds = n_folds >= fold_count;

        if has_enough_folds
            break
        end
    end

    if ~has_enough_folds
        error(['Unable to generate %d folds. This may be a bug. '...
               'Consider contacting the CoSMoMVPA developers'], ...
              nfolds);
    end

    partitions = fold_indices2partitions(folds_no_dupl, class_idx);

function fold_indices = generate_random_folds(fold_count, ...
                                              class_counts, test_count, seed)
    nclasses = numel(class_counts);
    min_class_count = min(class_counts);
    train_count = min_class_count - test_count;

    % do just a single call to cosmo_rand, so that different calls to this
    % function with different seeds will almost certainly result in
    % different folds
    rand_arr = cosmo_rand(nclasses, max(class_counts), fold_count, 'seed', seed);

    % allocate space for output
    fold_indices = cell(nclasses, 2, fold_count);

    for i_fold = 1:fold_count
        for i_class = 1:nclasses
            r = rand_arr(i_class, 1:class_counts(i_class), i_fold);
            [unused, idx] = sort(r);
            te_idx = idx(1:test_count);
            tr_idx = idx(test_count + (1:train_count));

            fold_indices{i_class, 1, i_fold} = te_idx;
            fold_indices{i_class, 2, i_fold} = tr_idx;
        end
    end

function fold_indices = remove_duplicate_fold_indices(fold_indices)
    [nclasses, two, nfolds] = size(fold_indices);
    assert(two == 2);

    for test_train_idx = 1:2
        % assuming training and test sets same size, everywhere
        counts = cellfun(@numel, fold_indices(:, test_train_idx, :));
        assert(all(counts(1) == counts(:)));
    end

    test_count = numel(fold_indices{1, 1, 1});
    train_count = numel(fold_indices{1, 2, 1});
    total_count = train_count + test_count;

    max_index = max(cellfun(@max, fold_indices(:)));

    fold_mat = zeros(nfolds, total_count * nclasses);
    first_row = 0;
    for f_i = 1:nfolds
        for c_i = 1:nclasses
            te = fold_indices{c_i, 1, f_i};
            fold_mat(f_i, first_row + (1:test_count)) = te;
            first_row = first_row + test_count;

            tr = fold_indices{c_i, 2, f_i};
            % discriminate test and train indices by ensuring:
            % test_indices  <= max_index
            % train_indices >  max_index;
            tr_offset = tr + max_index;
            fold_mat(f_i, first_row + (1:train_count)) = tr_offset;
            first_row = first_row + train_count;
        end
    end

    % find and remove duplicates
    [s, i] = sortrows(fold_mat);

    is_dupl = false(nfolds, 1);
    for row = 2:nfolds
        is_dupl(i(row)) = isequal(s(row - 1, :), s(row, :));
    end

    fold_indices = fold_indices(:, :, ~is_dupl);

function partitions = enumerated_partitions(class_idx, test_count, ...
                                            fold_count, seed)
    nclasses = numel(class_idx);
    class_counts = cellfun(@numel, class_idx);

    train_count = min(class_counts) - test_count;

    test_idx = cell(1, nclasses);
    rem_train_idx = cell(1, nclasses);

    for c_i = 1:nclasses
        test_idx{c_i} = nchoosek(1:class_counts(c_i), test_count);
        rem_train_idx{c_i} = nchoosek(1:class_counts(c_i) - test_count, ...
                                      train_count);
    end

    % take all combinations, with first (nclasses*test_count) columns
    % for the test index positions, and the remaining
    % (nclasses*train_count) columns for indices of remaining test sets
    % Thus, possible test indices for class c are in all_idx{c,1},
    % and indices for remaining train indices are in all_idx{c,2}.

    all_idx = [test_idx; rem_train_idx];

    ref_all_idx = cellfun(@(x)1:size(x, 1), all_idx, 'UniformOutput', false);
    combis = cosmo_cartprod(ref_all_idx);

    nfolds = size(combis, 1);

    % allocate space for fold indices.
    %
    %   fold_indices{c,i,f}=[x1,...xN]
    %
    % for fold f means that the x1-th, ... xN-th sample
    % in class c are used for testing (i=1) or training (i=2).
    %
    fold_indices = cell(nclasses, 2, nfolds); % test and train

    for f_i = 1:nfolds
        for c_i = 1:nclasses
            % use linear indexing in all_idx
            te_pos = c_i * 2 - 1;
            rem_tr_pos = te_pos + 1;

            ref_te = combis(f_i, te_pos);
            ref_rem_tr = combis(f_i, rem_tr_pos);

            te = all_idx{te_pos}(ref_te, :);

            tr_rem = setdiff(1:class_counts(c_i), te);
            tr_rem_ref = all_idx{rem_tr_pos}(ref_rem_tr, :);
            tr = tr_rem(tr_rem_ref);

            fold_indices{c_i, 1, f_i} = te;
            fold_indices{c_i, 2, f_i} = tr;
        end
    end

    if fold_count < nfolds
        rp = cosmo_randperm(nfolds, fold_count, 'seed', seed);
        fold_indices = fold_indices(:, :, rp);
    end

    partitions = fold_indices2partitions(fold_indices, class_idx);

function partitions = fold_indices2partitions(fold_indices, class_idx)
    % For the inputs,
    %
    %   fold_indices{c,i,f}=[x1,...xN]
    %
    % for fold f means that the x1-th, ... xN-th sample
    % in class c are used for testing (i=1) or training (i=2).
    %
    %   class_idx{c_i}=p1,...,pN
    %
    % means that the p1-th,...,pN-th row in .samples are in class c_i.

    partitions = struct();
    [nclasses, two, nfolds] = size(fold_indices);
    assert(two == 2);

    keys = {'test_indices', 'train_indices'};
    for k = 1:numel(keys)
        key = keys{k};

        all_idx = cell(1, nfolds);
        for f_i = 1:nfolds

            fold_idx = cell(1, nclasses);

            for c_i = 1:nclasses
                ref = fold_indices{c_i, k, f_i}(:);
                fold_idx{c_i} = class_idx{c_i}(ref);
            end

            all_idx{f_i} = sort(cat(1, fold_idx{:}));
        end

        partitions.(key) = all_idx;
    end

function check_input(ds, opt)
    cosmo_check_dataset(ds);

    raise = true;
    cosmo_isfield(ds, {'sa.targets', 'sa.chunks'}, raise);

    chunks = ds.sa.chunks;
    unq_chunks = unique(chunks);
    if numel(unq_chunks) ~= numel(chunks)
        h = histc(chunks, unq_chunks);
        i = find(h >= 2, 1);
        assert(numel(i) == 1);
        error(['.sa.chunks==%d occurs %d times, whereas all values '...
               'must be unique'], ...
              unq_chunks(i), h(i));
    end

    if isfield(opt, 'test_count')
        if isfield(opt, 'test_ratio')
            error(['the options ''test_count'' and ''test_ratio'''...
                   'are mutually exclusive']);
        end
    elseif ~isfield(opt, 'test_ratio')
        error(['one of the options ''test_count'' and '...
               '''test_ratio'' is required']);
    end