function is_ok = cosmo_check_dataset(ds, ds_type, error_if_not_ok)
% Check consistency of a dataset.
%
% is_ok=cosmo_check_dataset(ds[, ds_type][, error_if_not_ok])
%
% Inputs:
%   ds                  dataset struct.
%   ds_type             string indicating the specific type of dataset.
%                       Currently supports 'fmri', 'surface' and 'meeg'.
%   error_if_not_ok     if true (the default) or a string, an error is
%                       raised if the dataset is not kosher (see below).
%                       If a string, then it is prefixed in the error
%                       message. If false, then no error is raised.
%
% Returns:
%   is_ok               boolean indicating kosherness of ds.
%                       It is considered ok if:
%                       - it has a field .samples with a PxQ array.
%                       - if it has a field .sa [.fa], then
%                         it should be a struct, and each field in it
%                         should have P [Q] elements along the first
%                         [second] dimension or be empty.
%                       - .sa.{targets,chunks} are numeric vectors with
%                         integers (if present)
%                       - if ds_type is provided, then some more tests
%                         (depending on ds_type) are performed.
%
% Examples:
%     cosmo_check_dataset([])
%     %|| error('input must be a struct')
%
%     cosmo_check_dataset(struct())
%     %|| error('dataset has no field .samples')
%
%     % this (very minimal) dataset is kosher
%     cosmo_check_dataset(struct('samples',zeros(2)))
%     %|| true
%
%     % error can be silenced
%     cosmo_check_dataset('this is not ok',false)
%     %|| false
%
%     % run some more tests
%     ds=cosmo_synthetic_dataset('type','fmri');
%     cosmo_check_dataset(ds)
%     %|| true
%     ds.sa.chunks=[2;3]; % wrong size
%     cosmo_check_dataset(ds)
%     %|| error('sa.chunks has 2 values in dimension 1, expected 6')
%     ds.sa.chunks={'a','b','c','a','b','c'}';
%     cosmo_check_dataset(ds)
%     %|| error('.sa.chunks must be numeric vector with integers')
%
%     % set illegal dimension values
%     ds=cosmo_synthetic_dataset('type','fmri');
%     ds.a.fdim.values{1}=[1 2];
%     cosmo_check_dataset(ds)
%     %|| error('.fa.i must be vector with integers in range 1..2')
%
%     % check for specific type of dataset
%     ds=cosmo_synthetic_dataset('type','fmri');
%     cosmo_check_dataset(ds,'meeg')
%     %|| error('missing field .a.meeg for meeg-dataset');
%
%     % destroy crucial information in fmri dataset
%     % this error is only caught if explicit checking for fmri dataset is
%     % enabled, because the dataset remains valid when considered as a
%     % non-fmri dataset
%     ds=cosmo_synthetic_dataset('type','fmri');
%     % destroy volume information
%     ds.a=rmfield(ds.a,'vol');
%     cosmo_check_dataset(ds)
%     %|| true % error not caught
%     cosmo_check_dataset(ds,'fmri')
%     %|| error('missing field .a.vol for fmri-dataset')
%
%     % check meeg dataset
%     ds=cosmo_synthetic_dataset('type','meeg');
%     cosmo_check_dataset(ds,'meeg')
%     %|| true
%     ds.fa.chan=ds.fa.chan+6; % outside range
%     cosmo_check_dataset(ds)
%     %|| error('.fa.chan must be vector with integers in range 1..3')
%
% Notes:
%  - if the second argument is a boolean then its value is used for
%    error_if_not_ok, and ds_type is not used
%  - this function throws one error at most, even if it is inconsistent for
%    several reasons.
%  - it is good practice to use this function when a new dataset is created
%    to ensure consistency of the data
%
% #   For CoSMoMVPA's copyright information and license terms,   #
% #   see the COPYING file distributed with CoSMoMVPA.           #

    % deal with input arguments
    if nargin < 3
        error_if_not_ok = true;
    end

    if nargin < 2
        ds_type = [];
    elseif islogical(ds_type)
        % a boolean in second position takes the role of error_if_not_ok
        error_if_not_ok = ds_type;
        ds_type = [];
    end

    % a string for error_if_not_ok enables errors and acts as a prefix
    if ischar(error_if_not_ok)
        error_prefix = error_if_not_ok;
        error_if_not_ok = true;
    else
        error_prefix = '';
    end

    % list check functions; they are run in this order and the first
    % one that reports a problem determines the outcome
    checkers = {@check_fields, ...
                @check_samples, ...
                @check_targets, ...
                @check_chunks, ...
                @check_attributes, ...
                @check_dim_legacy, ...
                @check_dim, ...
                []}; % space for check_with_type

    if ~isempty(ds_type)
        % add checker for specific type (fmri, meeg, surface)
        checkers{end} = @(x) check_with_type(x, ds_type);
    end

    msg = run_checkers(checkers, ds);
    is_ok = isempty(msg);

    if ~is_ok && error_if_not_ok
        if isempty(error_prefix)
            % no prefix given: avoid a message starting with a stray ': '
            error('%s', msg);
        else
            error('%s: %s', error_prefix, msg);
        end
    end
function msg = run_checkers(checkers, ds)
% Apply each non-empty checker in turn to ds and stop at the first one
% that returns a non-empty message.
%
% Inputs:
%   checkers    cell with function handles (empty entries are skipped);
%               each is called with ds as its only argument
%   ds          value passed on to every checker
%
% Returns:
%   msg         first non-empty checker output; empty if no checker
%               reported anything
    msg = '';

    for pos = 1:numel(checkers)
        current = checkers{pos};

        if ~isempty(current)
            msg = current(ds);

            if ~isempty(msg)
                % first problem found; later checkers are not run
                break
            end
        end
    end
function msg = check_with_type(ds, ds_type)
% Additional checks for an fmri, surface or meeg dataset.
%
% Inputs:
%   ds          dataset struct
%   ds_type     one of 'fmri', 'surface' or 'meeg'; any other value
%               raises an error
%
% Returns:
%   msg         description of the first problem found, or '' if none
%
% Note: check_dim should have already checked that all fields are
%       present in .fa
    msg = '';

    % per type: labels required in .a.fdim.labels and fields required in .a
    switch ds_type
        case 'fmri'
            needed_labels = {'i', 'j', 'k'};
            needed_a_fields = {'vol'};

        case 'surface'
            needed_labels = {'node_indices'};
            needed_a_fields = {};

        case 'meeg'
            needed_labels = {};
            needed_a_fields = {'meeg'};

        otherwise
            error('Unsupported ds_type=%s', ds_type);
    end

    % check presence of .a.fdim field
    if ~cosmo_isfield(ds, 'a.fdim', false)
        msg = 'missing field .a.fdim';
        return
    end

    % the required labels are matched against .a.fdim.labels
    % (the message text below refers to .a.fdim.values)
    label_missing = ~cosmo_match(needed_labels, ds.a.fdim.labels);
    first_missing = find(label_missing, 1);
    if ~isempty(first_missing)
        msg = sprintf('missing value %s in .a.fdim.values for %s-dataset', ...
                      needed_labels{first_missing}, ds_type);
        return
    end

    % required fields directly under .a
    field_missing = ~cosmo_match(needed_a_fields, fieldnames(ds.a));
    first_missing = find(field_missing, 1);
    if ~isempty(first_missing)
        msg = sprintf('missing field .a.%s for %s-dataset', ...
                      needed_a_fields{first_missing}, ds_type);
    end
function tf = is_int_vector(x)
% Return true if x is a numeric vector in which every element is either
% equal to its rounded value or NaN; false otherwise.
    tf = false;

    if ~isnumeric(x) || ~isvector(x)
        return
    end

    whole_or_nan = (x == round(x)) | isnan(x);
    tf = all(whole_or_nan);
function msg = check_dim_legacy(ds)
% Report datasets that still carry the legacy field .a.dim.
%
% Returns a multi-line message explaining how to move .a.dim to .a.fdim
% if ds has a field .a.dim, and '' otherwise.
    if ~cosmo_isfield(ds, 'a.dim')
        msg = '';
        return
    end

    parts = {'***CoSMoMVPA legacy***\n', ...
             'Feature dimension information is now stored ', ...
             'in .a.fdim, whereas earlier versions used .a.dim. ', ...
             'To adapt a existing dataset struct ''ds'', run:\n', ...
             ' ds.a.fdim=ds.a.dim;\n', ...
             ' ds.a=rmfield(ds.a,''dim'')\n'};

    % sprintf expands the \n escapes in the joined template
    msg = sprintf([parts{:}]);
function msg = check_fields(ds)
% Verify that ds is a struct whose top-level fields are all among
% samples, fa, sa and a.
%
% Returns '' if so; otherwise a message describing the problem. When
% several fields are illegal, the first one in the order returned by
% setdiff is reported.
    msg = '';

    if isstruct(ds)
        allowed = {'samples', 'fa', 'sa', 'a'};
        unexpected = setdiff(fieldnames(ds), allowed);

        if ~isempty(unexpected)
            msg = sprintf('illegal field .%s', unexpected{1});
        end
    else
        msg = 'input must be a struct';
    end
function msg = check_targets(ds)
% If ds has a field .sa.targets, require it to pass is_int_vector.
%
% Returns '' if .sa.targets is absent or passes; a message otherwise.
    msg = '';

    if ~cosmo_isfield(ds, 'sa.targets')
        % nothing to check
        return
    end

    if ~is_int_vector(ds.sa.targets)
        msg = ['.sa.targets must be numeric vector with integers '...
               '(.sa.labels can be used to store string labels)'];
    end
function msg = check_chunks(ds)
% If ds has a field .sa.chunks, require it to be a numeric vector with
% integer values (as judged by is_int_vector, the same test that is
% applied to .sa.targets).
%
% Returns '' if .sa.chunks is absent or passes; a message otherwise.
    msg = '';

    % the previous test only required the value to be numeric, which let
    % through matrices and non-integer values that the message rules out
    if cosmo_isfield(ds, 'sa.chunks') && ~is_int_vector(ds.sa.chunks)
        msg = '.sa.chunks must be numeric vector with integers';
    end
function msg = check_samples(ds)
% Verify that ds has a field .samples whose size vector has exactly two
% elements.
%
% Returns '' if so; otherwise a message describing the problem.
    if isfield(ds, 'samples')
        n_dims = numel(size(ds.samples));

        if n_dims == 2
            msg = '';
        else
            msg = sprintf('.samples should be 2D, found %dD', n_dims);
        end
    else
        msg = 'dataset has no field .samples';
    end
function msg = check_attributes(ds)
% Check the sample attributes (.sa) and feature attributes (.fa) of ds
% against the size of ds.samples.
%
% For .sa [.fa], if present, the value must be a struct and each
% non-empty field in it must be 2D with as many elements along the
% first [second] dimension as ds.samples has along that dimension.
%
% Returns '' if all is fine; otherwise a message describing the first
% problem found.
    msg = '';
    attrs_fns = {'sa', 'fa'};
    ds_size = size(ds.samples);

    % check sample and feature attributes
    for dim = 1:2
        attrs_fn = attrs_fns{dim};
        if ~isfield(ds, attrs_fn)
            continue
        end

        % get feature/sample attributes
        attrs = ds.(attrs_fn);

        % fieldnames would raise an error for a non-struct; report it as
        % a regular check failure so that it can be silenced by the caller
        if ~isstruct(attrs)
            msg = sprintf('.%s must be a struct', attrs_fn);
            return
        end

        fns = fieldnames(attrs);
        n = numel(fns);

        % check each one
        for j = 1:n
            fn = fns{j};
            attr = attrs.(fn);

            % empty attributes are always allowed
            if isempty(attr)
                continue
            end

            attr_size = size(attr);
            if numel(attr_size) ~= 2
                msg = sprintf('%s.%s should be 2D', attrs_fn, fn);
                return
            end

            if attr_size(dim) ~= ds_size(dim)
                msg = sprintf(['%s.%s has %d values in dimension '...
                               '%d, expected %d'], attrs_fn, fn, ...
                              attr_size(dim), dim, ds_size(dim));

                % 3-dim is the other dimension; if that one has the
                % expected size, the attribute may just be transposed
                if attr_size(3 - dim) == ds_size(dim)
                    msg = [msg ' (maybe the data was intended '...
                           'to be transposed?)'];
                end
                return
            end
        end
    end
function msg = check_dim(ds)
% Check dimension information of a dataset, i.e. the contents of
% .a.sdim against .sa and of .a.fdim against .fa (each only if the
% .a.{s,f}dim field is present).
%
% Returns '' if all is fine; otherwise a message describing the first
% problem found.
    msg = '';

    for prefix = 'sf'
        dim_name = [prefix 'dim'];      % 'sdim' or 'fdim'
        dim_path = sprintf('a.%sdim', prefix);

        if cosmo_isfield(ds, dim_path)
            attr_name = [prefix 'a'];   % 'sa' or 'fa'

            % dimension information requires the matching attributes
            if ~isfield(ds, attr_name)
                msg = sprintf('Missing field .%s', attr_name);
                return
            end

            msg = check_dim_helper(ds.(attr_name), ds.a.(dim_name), ...
                                   attr_name, dim_path);
            if ~isempty(msg)
                return
            end
        end
    end
function msg = check_dim_helper(attrs, dim_attrs, attrs_str, dim_attrs_str)
% Check one set of dimension attributes against the matching attributes.
%
% Inputs:
%   attrs           contents of .sa or .fa
%   dim_attrs       contents of .a.sdim or .a.fdim
%   attrs_str       string representation of attrs ('sa' or 'fa')
%   dim_attrs_str   string representation of dim_attrs
%                   ('a.sdim' or 'a.fdim')
%
% Returns:
%   msg             '' if dim_attrs has fields .labels and .values of
%                   matching length and, for each label, attrs has a
%                   field with that name that is either empty or a
%                   vector of integers indexing into the corresponding
%                   .values element; otherwise a message describing the
%                   first problem found
    msg = '';

    has_required = cosmo_isfield(dim_attrs, {'labels', 'values'});
    if ~all(has_required)
        msg = sprintf('Missing field .%s.{labels,values}', dim_attrs_str);
        return
    end

    dim_labels = dim_attrs.labels;
    dim_values = dim_attrs.values;

    if ~iscellstr(dim_labels)
        msg = sprintf('.%s.labels must be a cell', dim_attrs_str);
        return
    end

    if ~iscell(dim_values)
        msg = sprintf('.%s.values must be a cell', dim_attrs_str);
        return
    end

    n_labels = numel(dim_labels);
    if numel(dim_values) ~= n_labels
        msg = sprintf('size mismatch between .%s.labels and .%s.values', ...
                      dim_attrs_str, dim_attrs_str);
        return
    end

    for pos = 1:n_labels
        name = dim_labels{pos};

        if ~isfield(attrs, name)
            msg = sprintf('Missing field .%s.%s', attrs_str, name);
            return
        end

        indices = attrs.(name);

        % empty vectors are allowed (in empty datasets)
        if isempty(indices)
            continue
        end

        % indices must point into the values for this dimension
        upper = numel(dim_values{pos});
        is_ints = is_int_vector(indices);

        % short-circuiting keeps min/max from being applied to
        % anything that is not an integer vector
        is_bad = ~is_ints || min(indices) < 1 || max(indices) > upper;
        if ~is_bad
            continue
        end

        msg = sprintf(['.%s.%s must be vector with integers in '...
                       'range 1..%d'], attrs_str, name, upper);

        if is_ints && min(indices) == 0
            % could be mistaken base-0 indexing
            hint = sprintf(['\nThe lowest index is 0, which may '...
                            'indicate base-0 indexing (the first '...
                            'element is indexed by 0). Note that '...
                            'Matlab (and CoSMoMVPA) use base-1 '...
                            'indexing.\n'...
                            '- Manual conversion from base-0 to '...
                            'base-1 can be achieved by increasing '...
                            'the values in .%s.%s by 1.\n'...
                            '- If this dataset was exported from '...
                            'PyMVPA and contains fMRI volumetric '...
                            'or surface-based data, consider using '...
                            'cosmo_fmri_dataset or '...
                            'cosmo_surface_dataset (respectively) '...
                            'to convert PyMVPA''s base-0 indexing to '...
                            'CoSMoMVPA''s base-1 indexing'], ...
                           attrs_str, name);
            msg = [msg hint];
        end

        return
    end