cosmo check dataset

function is_ok = cosmo_check_dataset(ds, ds_type, error_if_not_ok)
    % Check consistency of a dataset.
    %
    %
    % is_ok=cosmo_dataset_check(ds, [ds_type,][,error_if_not_ok])
    %
    % Inputs:
    %   ds                     dataset struct.
    %   ds_type                string indicating the specific type of dataset.
    %                          Currently  supports 'fmri' and 'meeg'.
    %   error_if_not_ok        if true (the default) or a string, an error is
    %                          raised if the dataset is not kosher (see below).
    %                          If a string, then it is prefixed in the error
    %                          message. If false, then no error is raised.
    %
    % Returns:
    %   is_ok                  boolean indicating kosherness of ds.
    %                          It is consider ok if:
    %                          - it has a field .samples with a PxQ array.
    %                          - if it has a field .features [.samples], then
    %                            it should be a struct, and each field in it
    %                            should have P [Q] elements along the first
    %                            [second] dimension or be empty.
    %                          - .sa.{targets,chunks} are numeric vectors with
    %                            integers (if present)
    %                          - if ds_type is provided, then some more tests
    %                            (depending on ds_type) are performed.
    %
    % Examples:
    %     cosmo_check_dataset([])
    %     %|| error('dataset not a struct')
    %
    %     cosmo_check_dataset(struct())
    %     %|| error('dataset has no field .samples')
    %
    %     % this (very minimal) dataset is kosher
    %     cosmo_check_dataset(struct('samples',zeros(2)))
    %     %|| true
    %
    %     % error can be silenced
    %     cosmo_check_dataset('this is not ok',false)
    %     %|| false
    %
    %     % run some more tests
    %     ds=cosmo_synthetic_dataset('type','fmri');
    %     cosmo_check_dataset(ds)
    %     %|| true
    %     ds.sa.chunks=[2;3]; % wrong size
    %     cosmo_check_dataset(ds)
    %     %|| error('sa.chunks has 2 values in dimension 1, expected 6')
    %     ds.sa.chunks={'a','b','c','a','b','c'}';
    %     cosmo_check_dataset(ds)
    %     %|| error('.sa.chunks must be numeric vector with integers')
    %
    %     % set illegal dimension values
    %     ds=cosmo_synthetic_dataset('type','fmri');
    %     ds.a.fdim.values{1}=[1 2];
    %     cosmo_check_dataset(ds)
    %     %|| error('.fa.i must be vector with integers in range 1..2')
    %
    %     % check for specific type of dataset
    %     ds=cosmo_synthetic_dataset('type','fmri');
    %     cosmo_check_dataset(ds,'meeg')
    %     %|| error('missing field .a.meeg for meeg-dataset');
    %
    %     % destroy crucial information in fmri dataset
    %     % this error is only caught if explicit checking for fmri dataset is
    %     % enabled, because the dataset remains valid when considered as a
    %     % non-fmri dataset
    %     ds=cosmo_synthetic_dataset('type','fmri');
    %     % destroy volume information
    %     ds.a=rmfield(ds.a,'vol');
    %     cosmo_check_dataset(ds)
    %     %|| true  % error not caught
    %     cosmo_check_dataset(ds,'fmri')
    %     %|| error('missing field .a.vol for fmri-dataset')
    %
    %     % check meeg dataset
    %     ds=cosmo_synthetic_dataset('type','meeg');
    %     cosmo_check_dataset(ds,'meeg')
    %     %|| true
    %     ds.fa.chan=ds.fa.chan+6; % outside range
    %     cosmo_check_dataset(ds)
    %     %|| error('.fa.chan must be vector with integers in range 1..3')
    %
    % Notes:
    %  - if the second argument is a boolean then its value is used for
    %    error_if_not_ok, and ds_type is not used
    %  - this function throws one error at most, even if it is inconsistent for
    %    several reasons.
    %  - it is good practice to use this function when a new dataset is created
    %    to ensure consistency of the data
    %
    % #   For CoSMoMVPA's copyright information and license terms,   #
    % #   see the COPYING file distributed with CoSMoMVPA.           #

    % deal with input arguments
    if nargin < 3
        error_if_not_ok = true;
    end
    if nargin >= 2
        if islogical(ds_type)
            error_if_not_ok = ds_type;
            ds_type = [];
        end
    else
        ds_type = [];
        error_if_not_ok = true;
    end

    if ischar(error_if_not_ok)
        error_prefix = error_if_not_ok;
        error_if_not_ok = true;
    else
        error_prefix = '';
    end

    % list check functions
    checkers = {@check_fields, ...
                @check_samples, ...
                @check_targets, ...
                @check_chunks, ...
                @check_attributes, ...
                @check_dim_legacy, ...
                @check_dim, ...
                []}; % space for check_with_type

    if ~isempty(ds_type)
        % add checker for specific type (fmri, meeg, surface)
        checkers{end} = @(x) check_with_type(x, ds_type);
    end

    msg = run_checkers(checkers, ds);
    is_ok = isempty(msg);

    if ~is_ok && error_if_not_ok
        error('%s: %s', error_prefix, msg);
    end

function msg = run_checkers(checkers, ds)
    n = numel(checkers);
    msg = '';
    for k = 1:n
        checker = checkers{k};
        if isempty(checker)
            continue
        end
        msg = checker(ds);
        if ~isempty(msg)
            return
        end
    end

function msg = check_with_type(ds, ds_type)
    % additional checks for fmri, surface or meeg dataset

    % note: check_dim should have already checked that
    % all fields are present in .fa

    msg = '';
    switch ds_type
        case 'fmri'
            required_dim_labels = {'i', 'j', 'k'};
            a_fields = {'vol'};
        case 'surface'
            required_dim_labels = {'node_indices'};
            a_fields = {};
        case 'meeg'
            required_dim_labels = {};
            a_fields = {'meeg'};
        otherwise
            error('Unsupported ds_type=%s', ds_type);
    end

    % check present of .a.fdim field
    if ~cosmo_isfield(ds, 'a.fdim', false)
        msg = 'missing field .a.fdim';
        return
    end

    m = cosmo_match(required_dim_labels, ds.a.fdim.labels);
    if any(~m)
        i = find(~m, 1);
        msg = sprintf('missing value %s in .a.fdim.values for %s-dataset', ...
                      required_dim_labels{i}, ds_type);
        return
    end

    a_fns = fieldnames(ds.a);
    m = cosmo_match(a_fields, a_fns);
    if any(~m)
        i = find(~m, 1);
        msg = sprintf('missing field .a.%s for %s-dataset', ...
                      a_fields{i}, ds_type);
        return
    end

function tf = is_int_vector(x)
    tf = isnumeric(x) && isvector(x) && all(round(x) == x | isnan(x));

function msg = check_dim_legacy(ds)
    msg = '';

    if cosmo_isfield(ds, 'a.dim')
        msg = sprintf(['***CoSMoMVPA legacy***\n'...
                       'Feature dimension information is now stored '...
                       'in .a.fdim, whereas earlier versions used .a.dim. '...
                       'To adapt a existing dataset struct ''ds'', run:\n'...
                       '  ds.a.fdim=ds.a.dim;\n'...
                       '  ds.a=rmfield(ds.a,''dim'')\n']);
        return
    end

function msg = check_fields(ds)
    msg = '';

    if ~isstruct(ds)
        msg = 'input must be a struct';
        return
    end

    delta = setdiff(fieldnames(ds), {'samples', 'fa', 'sa', 'a'});
    if ~isempty(delta)
        msg = sprintf('illegal field .%s', delta{1});
        return
    end

function msg = check_targets(ds)
    msg = '';

    if cosmo_isfield(ds, 'sa.targets') && ~is_int_vector(ds.sa.targets)
        msg = ['.sa.targets must be numeric vector with integers '...
               '(.sa.labels can be used to store string labels)'];
    end

function msg = check_chunks(ds)
    msg = '';

    if cosmo_isfield(ds, 'sa.chunks') && ~isnumeric(ds.sa.chunks)
        msg = '.sa.chunks must be numeric vector with integers';
    end

function msg = check_samples(ds)
    msg = '';

    if  ~isfield(ds, 'samples')
        msg = 'dataset has no field .samples';
        return
    end

    % has samples, so check the rest
    ds_size = size(ds.samples);
    if numel(ds_size) ~= 2
        msg = sprintf('.samples should be 2D, found %dD', numel(ds_size));
        return
    end

function msg = check_attributes(ds)
    msg = '';
    attrs_fns = {'sa', 'fa'};
    ds_size = size(ds.samples);

    % check sample and feature attributes
    for dim = 1:2
        attrs_fn = attrs_fns{dim};
        if isfield(ds, attrs_fn)

            % get feature/sample attributes
            attrs = ds.(attrs_fn);
            fns = fieldnames(attrs);
            n = numel(fns);

            % check each one
            for j = 1:n
                fn = fns{j};
                attr = attrs.(fn);
                if isempty(attr)
                    continue
                end
                attr_size = size(attr);
                if numel(attr_size) ~= 2
                    msg = sprintf('%s.%s should be 2D', attrs_fn, fn);
                    return
                end
                if attr_size(dim) ~= ds_size(dim)
                    msg = sprintf(['%s.%s has %d values in dimension '...
                                   '%d, expected %d'], attrs_fn, fn, ...
                                  attr_size(dim), dim, ds_size(dim));
                    if attr_size(3 - dim) == ds_size(dim)
                        msg = [msg ' (maybe the data was intended '...
                               'to be transposed?)'];
                    end
                    return
                end
            end
        end
    end

function msg = check_dim(ds)
    % helper function to check dataset with dimensions
    % (i.e., .a.{s,f}dim is present)
    msg = '';

    suffixes = 'sf';

    for dim = 1:2
        suffix = suffixes(dim);
        dim_attrs_str = sprintf('a.%sdim', suffix);

        if ~cosmo_isfield(ds, dim_attrs_str)
            continue
        end

        attrs_str = [suffix 'a'];
        if ~isfield(ds, attrs_str)
            msg = sprintf('Missing field .%s', attrs_str);
            return
        end

        attrs = ds.(attrs_str);
        dim_attrs = ds.a.([suffix 'dim']);
        msg = check_dim_helper(attrs, dim_attrs, attrs_str, dim_attrs_str);

        if ~isempty(msg)
            return
        end
    end

function msg = check_dim_helper(attrs, dim_attrs, attrs_str, dim_attrs_str)
    msg = '';
    % attrs is from .sa or .fa; dim_attrs from .a.sdim or .a.fdim
    % the *_str arguments contain a string representation
    if ~all(cosmo_isfield(dim_attrs, {'labels', 'values'}))
        msg = sprintf('Missing field .%s.{labels,values}', dim_attrs_str);
        return
    end

    labels = dim_attrs.labels;
    values = dim_attrs.values;

    if ~iscellstr(labels)
        msg = sprintf('.%s.labels must be a cell', dim_attrs_str);
        return
    end

    if ~iscell(values)
        msg = sprintf('.%s.values must be a cell', dim_attrs_str);
        return
    end

    ndim = numel(labels);
    if numel(values) ~= ndim
        msg = sprintf('size mismatch between .%s.labels and .%s.values', ...
                      dim_attrs_str, dim_attrs_str);
        return
    end

    for dim = 1:ndim
        label = labels{dim};
        if ~isfield(attrs, label)
            msg = sprintf('Missing field .%s.%s', attrs_str, label);
            return
        end
        v = attrs.(label);

        % empty vectors are allowed (in empty datasets)
        if isempty(v)
            continue
        end

        vmax = numel(values{dim});
        all_int = is_int_vector(v);
        if ~all_int || min(v) < 1 || max(v) > vmax
            msg = sprintf(['.%s.%s must be vector with integers in '...
                           'range 1..%d'], attrs_str, label, vmax);
            if all_int && min(v) == 0
                % could be mistaken base-0 indexing
                msg = sprintf(['%s\nThe lowest index is 0, which may '...
                               'indicate base-0 indexing (the first '...
                               'element is indexed by 0). Note that '...
                               'Matlab (and CoSMoMVPA) use base-1 '...
                               'indexing.\n'...
                               '- Manual conversion from base-0 to '...
                               'base-1 can be achieved by increasing '...
                               'the values in .%s.%s by 1.\n'...
                               '- If this dataset was exported from '...
                               'PyMVPA and contains fMRI volumetric '...
                               'or surface-based data, consider using '...
                               'cosmo_fmri_dataset or '...
                               'cosmo_surface_dataset (respectively) '...
                               'to convert PyMVPA''s base-0 indexing to '...
                               'CoSMoMVPA''s base-1 indexing'], ...
                              msg, attrs_str, label);
            end

            return
        end
    end