diff --git a/tools/schemacode/bidsschematools/tests/test_validator.py b/tools/schemacode/bidsschematools/tests/test_validator.py
index d44265e34d..222e94a732 100644
--- a/tools/schemacode/bidsschematools/tests/test_validator.py
+++ b/tools/schemacode/bidsschematools/tests/test_validator.py
@@ -3,266 +3,111 @@
 import pytest
 
+from .. import validator
+from ..types import Namespace
 from .conftest import BIDS_ERROR_SELECTION, BIDS_SELECTION
 
 
-def test__add_entity():
-    from bidsschematools.validator import _add_entity
+def test_path_rule():
+    rule = Namespace.build({"path": "dataset_description.json", "level": "required"})
+    assert validator._path_rule(rule) == {"regex": r"dataset_description\.json", "mandatory": True}
 
-    # Test empty input and directory creation and required entity
-    regex_entities = ""
-    entity = "subject"
-    entity_shorthand = "sub"
-    variable_field = "[0-9a-zA-Z]+"
-    requirement_level = "required"
+    rule = Namespace.build({"path": "LICENSE", "level": "optional"})
+    assert validator._path_rule(rule) == {"regex": "LICENSE", "mandatory": False}
 
-    _regex_entities = _add_entity(
-        regex_entities,
-        entity,
-        entity_shorthand,
-        variable_field,
-        requirement_level,
-    )
-
-    assert _regex_entities == "sub-(?P=subject)"
-
-    # Test append input and optional entity
-    regex_entities = (
-        "sub-(?P=subject)(|_ses-(?P=session))"
-        "(|_task-(?P<task>[0-9a-zA-Z]+))(|_trc-(?P<tracer>[0-9a-zA-Z]+))"
-        "(|_rec-(?P<reconstruction>[0-9a-zA-Z]+))"
-        "(|_run-(?P<run>[0-9a-zA-Z]+))"
-    )
-    entity = "recording"
-    entity_shorthand = "recording"
-    variable_field = "[0-9a-zA-Z]+"
-    requirement_level = "optional"
-
-    _regex_entities = _add_entity(
-        regex_entities,
-        entity,
-        entity_shorthand,
-        variable_field,
-        requirement_level,
-    )
-    assert (
-        _regex_entities == "sub-(?P=subject)(|_ses-(?P=session))"
-        "(|_task-(?P<task>[0-9a-zA-Z]+))(|_trc-(?P<tracer>[0-9a-zA-Z]+))"
-        "(|_rec-(?P<reconstruction>[0-9a-zA-Z]+))"
-        "(|_run-(?P<run>[0-9a-zA-Z]+))"
-        "(|_recording-(?P<recording>[0-9a-zA-Z]+))"
-    )
-
-
-def test__add_extensions():
-    from bidsschematools.validator import _add_extensions
-
-    # Test single extension
-    regex_string = (
-        "sub-(?P=subject)(|_ses-(?P=session))"
-        "_sample-(?P<sample>[0-9a-zA-Z]+)"
-        "(|_acq-(?P<acquisition>[0-9a-zA-Z]+))_photo"
-    )
-    variant = {
-        "suffixes": ["photo"],
-        "extensions": [".jpg"],
-        "entities": {
-            "subject": "required",
-            "session": "optional",
-            "sample": "required",
-            "acquisition": "optional",
-        },
+def test_stem_rule():
+    rule = Namespace.build({"stem": "README", "level": "required", "extensions": ["", ".md"]})
+    assert validator._stem_rule(rule) == {
+        "regex": r"README(?P<extension>|\.md)",
+        "mandatory": True,
     }
-    _regex_string = _add_extensions(regex_string, variant)
-    assert (
-        _regex_string == "sub-(?P=subject)(|_ses-(?P=session))"
-        "_sample-(?P<sample>[0-9a-zA-Z]+)"
-        "(|_acq-(?P<acquisition>[0-9a-zA-Z]+))_photo\\.jpg"
+    rule = Namespace.build(
+        {"stem": "participants", "level": "optional", "extensions": [".tsv", ".json"]}
     )
-
-    # Test multiple extensions
-    regex_string = (
-        "sub-(?P=subject)(|_ses-(?P=session))"
-        "_sample-(?P<sample>[0-9a-zA-Z]+)"
-        "(|_acq-(?P<acquisition>[0-9a-zA-Z]+))_photo"
-    )
-    variant = {
-        "suffixes": ["photo"],
-        "extensions": [".jpg", ".png", ".tif"],
-        "entities": {
-            "subject": "required",
-            "session": "optional",
-            "sample": "required",
-            "acquisition": "optional",
-        },
+    assert validator._stem_rule(rule) == {
+        "regex": r"participants(?P<extension>\.tsv|\.json)",
+        "mandatory": False,
     }
-    _regex_string = _add_extensions(regex_string, variant)
-
-    assert (
-        _regex_string == "sub-(?P=subject)(|_ses-(?P=session))"
-        "_sample-(?P<sample>[0-9a-zA-Z]+)"
-        "(|_acq-(?P<acquisition>[0-9a-zA-Z]+))"
"_photo(\\.jpg|\\.png|\\.tif)" - ) - -def test__add_subdirs(): - from bidsschematools.validator import _add_subdirs - regex_string = "sub-(?P=subject)_sessions\\.(tsv|json)" - variant = { - "suffixes": ["sessions"], - "extensions": [".tsv", ".json"], - "entities": {"subject": "required"}, - } - datatype = "tabular_metadata" - entity_definitions = { - "acquisition": { - "display_name": "Acquisition", - "name": "acq", - "type": "string", - "format": "label", - }, - "session": { - "display_name": "Session", - "name": "ses", - "type": "string", - "format": "label", - }, - "subject": { - "display_name": "Subject", - "name": "sub", - "type": "string", - "format": "label", - }, - } - formats = { - "label": { - "pattern": "[0-9a-zA-Z]+", +def test_entity_rule(schema_obj): + # Simple + rule = Namespace.build( + { + "datatypes": ["anat"], + "entities": {"subject": "required", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".nii"], } - } - modality_datatypes = [ - "anat", - "dwi", - "fmap", - "func", - "perf", - "eeg", - "ieeg", - "meg", - "beh", - "pet", - "micr", - ] - _regex_string = _add_subdirs( - regex_string, variant, datatype, entity_definitions, formats, modality_datatypes ) - - assert _regex_string == "/sub-(?P[0-9a-zA-Z]+)/sub-(?P=subject)_sessions\\.(tsv|json)" - - -def test__add_suffixes(): - from bidsschematools.validator import _add_suffixes - - # Test single expansion - regex_entities = "sub-(?P=subject)" - variant = { - "suffixes": ["sessions"], - "extensions": [ - ".tsv", - ".json", - ], - "entities": {"subject": "required"}, + assert validator._entity_rule(rule, schema_obj) == { + "regex": ( + r"sub-(?P[0-9a-zA-Z]+)/" + r"(?:ses-(?P[0-9a-zA-Z]+)/)?" + r"(?Panat)/" + r"sub-(?P=subject)_" + r"(?:ses-(?P=session)_)?" + r"(?PT1w)" + r"(?P\.nii)" + ), + "mandatory": False, } - regex_string = "sub-(?P=subject)_sessions" - _regex_string = _add_suffixes(regex_entities, variant) - - assert _regex_string == regex_string - - # Test multiple expansions - regex_entities = ( - "sub-(?P=subject)(|_ses-(?P=session))" - "(|_acq-(?P[0-9a-zA-Z]+))" - "(|_rec-(?P[0-9a-zA-Z]+))" - "(|_dir-(?P[0-9a-zA-Z]+))(|_run-(?P[0-9a-zA-Z]+))" - "(|_recording-(?P[0-9a-zA-Z]+))" + # Sidecar entities are optional + rule = Namespace.build( + { + "datatypes": ["anat", ""], + "entities": {"subject": "optional", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".json"], + } ) - variant = { - "suffixes": [ - "physio", - "stim", - ], - "extensions": [ - ".tsv.gz", - ".json", - ], - "entities": { - "subject": "required", - "session": "optional", - "acquisition": "optional", - "reconstruction": "optional", - "direction": "optional", - "run": "optional", - "recording": "optional", - }, + assert validator._entity_rule(rule, schema_obj) == { + "regex": ( + r"(?:sub-(?P[0-9a-zA-Z]+)/)?" + r"(?:ses-(?P[0-9a-zA-Z]+)/)?" + r"(?:(?Panat)/)?" + r"(?:sub-(?P=subject)_)?" + r"(?:ses-(?P=session)_)?" 
+ r"(?PT1w)" + r"(?P\.json)" + ), + "mandatory": False, } - regex_string = ( - "sub-(?P=subject)(|_ses-(?P=session))" - "(|_acq-(?P[0-9a-zA-Z]+))" - "(|_rec-(?P[0-9a-zA-Z]+))" - "(|_dir-(?P[0-9a-zA-Z]+))(|_run-(?P[0-9a-zA-Z]+))" - "(|_recording-(?P[0-9a-zA-Z]+))" - "_(physio|stim)" - ) - - _regex_string = _add_suffixes(regex_entities, variant) - - assert _regex_string == regex_string -@pytest.mark.parametrize("extension", ["bvec", "json", "tsv"]) -def test__inheritance_expansion(extension): - from bidsschematools.validator import _inheritance_expansion +def test_split_inheritance_rules(): + rule = { + "datatypes": ["anat"], + "entities": {"subject": "required", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".nii", ".json"], + } - # test .json - base_entry = ( - r".*?/sub-(?P[0-9a-zA-Z]+)/" - r"(|ses-(?P[0-9a-zA-Z]+)/)func/sub-(?P=subject)" - r"(|_ses-(?P=session))_task-(?P[0-9a-zA-Z]+)" - r"(|_acq-(?P[0-9a-zA-Z]+))" - r"(|_ce-(?P[0-9a-zA-Z]+))" - r"(|_rec-(?P[0-9a-zA-Z]+))" - r"(|_dir-(?P[0-9a-zA-Z]+))" - r"(|_run-(?P[0-9]*[1-9]+[0-9]*))" - r"(|_echo-(?P[0-9]*[1-9]+[0-9]*))" - r"_phase(\.nii\.gz|\.nii|\.{})$".format(extension) - ) - expected_entries = [ - ".*?/sub-(?P[0-9a-zA-Z]+)/" - "(|ses-(?P[0-9a-zA-Z]+)/)sub-(?P=subject)" - "(|_ses-(?P=session))_task-(?P[0-9a-zA-Z]+)" - "(|_acq-(?P[0-9a-zA-Z]+))" - "(|_ce-(?P[0-9a-zA-Z]+))" - "(|_rec-(?P[0-9a-zA-Z]+))" - "(|_dir-(?P[0-9a-zA-Z]+))" - "(|_run-(?P[0-9]*[1-9]+[0-9]*))" - "(|_echo-(?P[0-9]*[1-9]+[0-9]*))" - "_phase(\\.nii\\.gz|\\.nii|\\.{})$".format(extension), - ".*?/task-(?P[0-9a-zA-Z]+)" - "(|_acq-(?P[0-9a-zA-Z]+))" - "(|_ce-(?P[0-9a-zA-Z]+))" - "(|_rec-(?P[0-9a-zA-Z]+))" - "(|_dir-(?P[0-9a-zA-Z]+))" - "(|_run-(?P[0-9]*[1-9]+[0-9]*))" - "(|_echo-(?P[0-9]*[1-9]+[0-9]*))" - "_phase(\\.nii\\.gz|\\.nii|\\.{})$".format(extension), - ] + main, sidecar = validator.split_inheritance_rules(rule) + assert main == { + "datatypes": ["anat"], + "entities": {"subject": "required", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".nii"], + } + assert sidecar == { + "datatypes": ["", "anat"], + "entities": {"subject": "optional", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".json"], + } - inheritance_expanded_entries = _inheritance_expansion(base_entry, datatype="func") - assert inheritance_expanded_entries == expected_entries + # Can't split again + (main2,) = validator.split_inheritance_rules(main) + assert main2 == { + "datatypes": ["anat"], + "entities": {"subject": "required", "session": "optional"}, + "suffixes": ["T1w"], + "extensions": [".nii"], + } def test_inheritance_examples(): diff --git a/tools/schemacode/bidsschematools/validator.py b/tools/schemacode/bidsschematools/validator.py index efa2349af4..be80040828 100644 --- a/tools/schemacode/bidsschematools/validator.py +++ b/tools/schemacode/bidsschematools/validator.py @@ -2,13 +2,18 @@ import json import os import re +import typing as ty +from collections.abc import Mapping from copy import deepcopy from functools import lru_cache from pathlib import Path -from . import schema, utils +import bidsschematools as bst +import bidsschematools.schema +import bidsschematools.types +import bidsschematools.utils -lgr = utils.get_logger() +lgr = bst.utils.get_logger() # The list of which entities create directories could be dynamically specified by the YAML, but for # now, it is not. 
@@ -90,257 +95,198 @@ def _get_paths(
     return path_list
 
 
-def _add_entity(regex_entities, entity, entity_shorthand, variable_field, requirement_level):
-    """Add entity pattern to filename template based on requirement level."""
+def _capture_regex(name, pattern, backref):
+    """Capture pattern to name or match back-reference to name
 
-    # We need to do this here, although it would be easier to back-reference in the directory.
-    # This is because regex evaluates sequentially and we can not forward-reference a group.
-    if entity in DIR_ENTITIES:
-        variable_regex = f"(?P={entity})"
-    else:
-        variable_regex = f"(?P<{entity}>{variable_field})"
-
-    if requirement_level == "required":
-        if len(regex_entities.strip()):
-            regex_entities += f"_{entity_shorthand}-{variable_regex}"
-        else:
-            # Only the first entity doesn't need an underscore
-            regex_entities += f"{entity_shorthand}-{variable_regex}"
-    else:
-        if len(regex_entities.strip()):
-            regex_entities += f"(|_{entity_shorthand}-{variable_regex})"
-        else:
-            # Only the first entity doesn't need an underscore
-            regex_entities += f"(|{entity_shorthand}-{variable_regex})"
-
-    return regex_entities
-
-
-def _extension_safety(extension):
+    >>> _capture_regex("run", "[0-9]+", False)
+    '(?P<run>[0-9]+)'
+    >>> _capture_regex("run", "[0-9]+", True)
+    '(?P=run)'
+    >>> re.match(_capture_regex("run", "[0-9]+", False), "123_").groupdict()
+    {'run': '123'}
     """
-    Making extensions formatting-safe.
-    Issues covered by this function are listed under “Notes”
-
-    Parameters
-    ----------
-    extension : str
-        Extension string, as present in the BIDS YAML schema.
+    return f"(?P={name})" if backref else f"(?P<{name}>{pattern})"
+
+
+def _optional_regex(regex, optional):
+    """Return an optional version of a regex if optional is True
+
+    A required regex is passed through unchanged:
+
+    >>> pattern = _optional_regex("xyz", False)
+    >>> pattern
+    'xyz'
+    >>> re.match(pattern, "xyz").groups()
+    ()
+    >>> re.match(pattern, "") is None
+    True
+
+    An optional regex uses a non-capturing group, to avoid interfering
+    with existing groups
+
+    >>> pattern = _optional_regex("x(?P<name>[a-z])z", True)
+    >>> pattern
+    '(?:x(?P<name>[a-z])z)?'
+    >>> re.match(pattern, "xyz").groups()
+    ('y',)
+    >>> re.match(pattern, "xyz").groupdict()
+    {'name': 'y'}
+    >>> re.match(pattern, "").groups()
+    (None,)
+    >>> re.match(pattern, "").groupdict()
+    {'name': None}
+    """
+    return f"(?:{regex})?" if optional else regex
 
-    Returns
-    -------
-    str
-        Extension string, safe for use in validator Regex formatting.
 
-    Notes
-    -----
-    * Bash-wildcard safety: https://github.com/bids-standard/bids-specification/issues/990
-    * Period safety: https://github.com/bids-standard/bids-specification/issues/1055
-    * Hopefully this function will be deprecated soon, but it will not break safe entries.
-    """
-    if extension == "None":
+@lru_cache()
+def _format_entity(entity, name, pattern, level, directory=False):
+    if directory and entity not in DIR_ENTITIES:
         return ""
-    if "." in extension:
-        extension = extension.replace(".", "\\.")
-    if "*" in extension:
-        extension = extension.replace("*", ".*?")
-
-    return extension
-
-
-def _add_extensions(regex_string, variant):
-    """Add extensions to a regex string."""
-    fixed_variant_extensions = []
-    for variant_extension in variant["extensions"]:
-        variant_extension = _extension_safety(variant_extension)
-        fixed_variant_extensions.append(variant_extension)
-    if len(fixed_variant_extensions) > 1:
-        regex_extensions = "({})".format("|".join(fixed_variant_extensions))
-    else:
-        regex_extensions = fixed_variant_extensions[0]
-    regex_string = f"{regex_string}{regex_extensions}"
-    return regex_string
+    label = _capture_regex(entity, pattern, not directory and entity in DIR_ENTITIES)
+    post = "/" if directory else "_"
+    return _optional_regex(f"{name}-{label}{post}", level != "required")
 
 
-def _add_subdirs(regex_string, variant, datatype, entity_definitions, formats, modality_datatypes):
-    """Add appropriate subdirectories as required by entities present."""
-    regex_dirs = "/"
-    for dir_entity in DIR_ENTITIES:
-        if dir_entity in variant["entities"].keys():
-            format_selection = formats[entity_definitions[dir_entity]["format"]]
-            variable_field = format_selection["pattern"]
-            shorthand = entity_definitions[dir_entity]["name"]
-            if variant["entities"][dir_entity] == "required":
-                regex_subdir = f"{shorthand}-(?P<{dir_entity}>{variable_field})/"
-            else:
-                regex_subdir = f"(|{shorthand}-(?P<{dir_entity}>{variable_field})/)"
-            regex_dirs = f"{regex_dirs}{regex_subdir}"
-    if datatype in modality_datatypes:
-        regex_dirs = f"{regex_dirs}{datatype}/"
-    regex_string = f"{regex_dirs}{regex_string}"
+def split_inheritance_rules(rule: Mapping) -> ty.List[Mapping]:
+    """Break composite rules into main and sidecar rules
 
-    return regex_string
+    Implements the inheritance principle for file naming.
+    """
+    heritable_exts = {".tsv", ".json", ".bval", ".bvec"}
+    rule_exts = set(rule["extensions"])
+
+    main_exts = rule_exts - heritable_exts
+    # If a rule only has TSV or JSON files, entities can be
+    # made required
+    if not main_exts:
+        if ".tsv" in rule_exts:
+            main_exts = {".tsv"}
+        elif ".json" in rule_exts:
+            main_exts = {".json"}
+
+    sidecar_exts = rule_exts - main_exts
+    if not sidecar_exts:
+        return [rule]
+
+    sidecar_dtypes = [""] + rule.get("datatypes", [])
+    sidecar_entities = {ent: "optional" for ent in rule.get("entities")}
+
+    main_rule = {**rule, **{"extensions": list(main_exts)}}
+    sidecar_rule = {
+        **rule,
+        **{
+            "extensions": list(sidecar_exts),
+            "datatypes": sidecar_dtypes,
+            "entities": sidecar_entities,
+        },
+    }
+    return [main_rule, sidecar_rule]
 
 
-def _add_suffixes(regex_string, variant):
-    """Add suffixes to a regex string."""
-    if len(variant["suffixes"]) == 1:
-        regex_suffixes = variant["suffixes"][0]
-    else:
-        regex_suffixes = "({})".format("|".join(variant["suffixes"]))
-    regex_string = f"{regex_string}_{regex_suffixes}"
-    return regex_string
+def _path_rule(rule: bst.types.Namespace):
+    return {"regex": re.escape(rule.path), "mandatory": rule.level == "required"}
 
 
-def load_top_level(
-    my_schema,
-):
-    """
-    Create full path regexes for top level files, as documented by a target BIDS YAML schema
-    version.
+def _sanitize_extension(ext: str) -> str:
+    if ext == ".*":
+        return r"\.[a-zA-Z0-9.]+"
+    return re.escape(ext)
 
-    Parameters
-    ----------
-    my_schema : dict
-        A nested dictionary, as returned by `bidsschematools.schema.load_schema()`.
 
+def _stem_rule(rule: bst.types.Namespace):
+    stem_regex = re.escape(rule.stem)
+    ext_match = "|".join(_sanitize_extension(ext) for ext in rule.extensions)
+    ext_regex = f"(?P<extension>{ext_match})"
 
-    Returns
-    -------
-    regex_schema : list of dict
-        A list of dictionaries, with keys including 'regex' and 'mandatory'.
-    """
+    return {"regex": stem_regex + ext_regex, "mandatory": rule.level == "required"}
 
-    top_level_files = my_schema["rules"]["top_level_files"]
-    regex_schema = []
-    for top_level_filename in top_level_files.keys():
-        top_level_file = top_level_files[top_level_filename]
-        # None value gets passed as list of strings...
-        extensions = top_level_file["extensions"]
-        if extensions != ["None"]:
-            extensions_regex = "|".join(map(_extension_safety, extensions))
-            regex = f".*?/{top_level_filename}({extensions_regex})$"
+def _entity_rule(rule: Mapping, schema: bst.types.Namespace):
+    dir_regex = []
+    entity_regex = []
+    for ent in schema.rules.entities:
+        if ent not in rule["entities"]:
+            continue
+        ent_obj = rule["entities"][ent]
+        if isinstance(ent_obj, str):
+            ent_obj = {"level": ent_obj}
+        # Allow filename rule to override original entity fields
+        entity = {**schema.objects.entities[ent], **ent_obj}
+
+        if "enum" in entity:
+            pattern = "|".join(entity["enum"])
         else:
-            regex = f".*?/{top_level_filename}$"
-        regex_entry = {
-            "regex": regex,
-            "mandatory": top_level_file["required"],
-        }
-        regex_schema.append(regex_entry)
+            pattern = schema.objects.formats[entity["format"]].pattern
 
-    return regex_schema
+        dir_regex.append(
+            _format_entity(ent, entity["name"], pattern, entity["level"], directory=True)
+        )
+        entity_regex.append(_format_entity(ent, entity["name"], pattern, entity["level"]))
+
+    dtypes = set(rule.get("datatypes", ()))
+    optional_dtype = "" in dtypes
+    if optional_dtype:
+        dtypes.remove("")
+    if dtypes:
+        pattern = f"(?P<datatype>{'|'.join(dtypes)})/"
+        if optional_dtype:
+            pattern = f"(?:{pattern})?"
+        dir_regex += pattern
+
+    # If we move to referring to suffixes by keys in the object table:
+    # suffixes = [schema.objects.suffixes[suffix].value for suffix in rule["suffixes"]]
+    suffixes = rule["suffixes"]
+    suffix_regex = f"(?P<suffix>{'|'.join(suffixes)})"
+
+    # If we move to referring to extensions by keys in the object table:
+    # extensions = [schema.objects.extensions[ext].value for ext in rule["extensions"]]
+    extensions = rule["extensions"]
+    ext_match = "|".join(_sanitize_extension(ext) for ext in extensions)
+    ext_regex = f"(?P<extension>{ext_match})"
+
+    return {
+        "regex": "".join(dir_regex + entity_regex + [suffix_regex, ext_regex]),
+        "mandatory": False,
+    }
 
 
-def load_entities(
-    my_schema,
-    inheritance_regex=r".*?\\\.(tsv|bvec|json)(\$|\||\)).*?",
+def load_filename_rules(
+    rule_group: bst.types.Namespace,
+    schema: bst.types.Namespace,
+    level: int,
 ):
-    """Create full path regexes for entities, as documented by a target BIDS YAML schema version.
+    """Load schema rules into regular expressions
 
     Parameters
     ----------
-    my_schema : dict
+    rule_group : Namespace
+        The set of rules to load from the schema
+    schema : Namespace
         A nested dictionary, as returned by `bidsschematools.schema.load_schema()`.
-    inheritance_regex : str, optional
-        Valid regex string identifying filenames to which inheritance expansion should be applied.
-
-    Notes
-    -----
-
-    * Suggest to BIDS-specification to remove the periods from the extensions, the leading period
-      is not part of the extension, but a delimiter defining the fact that it's an extension.
-      Code sections marked as `Making it period-safe` should be edited when this fix is in,
-      though they will work in any case.
-      https://github.com/bids-standard/bids-specification/issues/990
-    * More issues in comments.
+    level : int
+        The depth in rule_group to look for rules
 
     Returns
     -------
-    regex_schema : list of dict
+    rules : list of dict
         A list of dictionaries, with keys including 'regex' and 'mandatory'.
     """
-
-    # Parsing tabular_metadata as a datatype, might be done automatically if the YAML is moved
-    # to the same subdirectory
-    datatypes = {
-        "tabular_metadata": my_schema.rules.tabular_metadata,
-        **my_schema.rules.datatypes,
-    }
-    entity_order = my_schema["rules"]["entities"]
-    entity_definitions = my_schema["objects"]["entities"]
-    formats = my_schema["objects"]["formats"]
-
-    # # Descriptions are not needed and very large.
-    # for i in entity_definitions.values():
-    #     i.pop("description", None)
-
-    # Needed for non-modality file separation as per:
-    # https://github.com/bids-standard/bids-specification/pull/985#issuecomment-1019573787
-    modalities = my_schema["rules"]["modalities"]
-    modality_datatypes = []
-    for modality_key in modalities.keys():
-        for modality_datatype in modalities[modality_key]["datatypes"]:
-            modality_datatypes.append(modality_datatype)
     regex_schema = []
-    for datatype in datatypes:
-        if datatype == "derivatives":
-            continue
-        for variant in datatypes[datatype].values():
-            regex_entities = ""
-            for entity in entity_order:
-                # Slightly awkward construction to account for new-style file specification.
-                # As in:
-                # https://github.com/bids-standard/bids-specification/pull/987
-                try:
-                    if entity in variant["entities"]:
-                        entity_shorthand = entity_definitions[entity]["name"]
-                        if "enum" in entity_definitions[entity].keys():
-                            # Entity key-value pattern with specific allowed values
-                            # tested, works!
-                            variable_field = "|".join(entity_definitions[entity]["enum"])
-                        else:
-                            format_selection = formats[entity_definitions[entity]["format"]]
-                            variable_field = format_selection["pattern"]
-                        regex_entities = _add_entity(
-                            regex_entities,
-                            entity,
-                            entity_shorthand,
-                            variable_field,
-                            variant["entities"][entity],
-                        )
-                except KeyError:
-                    pass
-
-            regex_string = _add_suffixes(regex_entities, variant)
-            regex_string = _add_extensions(regex_string, variant)
-            regex_string = _add_subdirs(
-                regex_string,
-                variant,
-                datatype,
-                entity_definitions,
-                formats,
-                modality_datatypes,
+    for rule_template in rule_group.values(level=level):
+        # Simple rules, e.g. dataset_description.json, README
+        if "path" in rule_template:
+            regex_schema.append(_path_rule(rule_template))
+        elif "stem" in rule_template:
+            regex_schema.append(_stem_rule(rule_template))
+        else:
+            regex_schema.extend(
+                _entity_rule(rule, schema) for rule in split_inheritance_rules(rule_template)
             )
-            regex_string = f".*?{regex_string}$"
-            regex_entry = {
-                "regex": regex_string,
-                "mandatory": False,
-            }
-            regex_schema.append(regex_entry)
-            if re.match(inheritance_regex, regex_string):
-                expansion_list = _inheritance_expansion(regex_string, datatype)
-                for expansion in expansion_list:
-                    expansion_entry = {
-                        "regex": expansion,
-                        "mandatory": False,
-                    }
-                    regex_schema.append(expansion_entry)
-
     return regex_schema
 
 
@@ -360,20 +306,16 @@ def load_all(
     -------
     all_regex : list of dict
         A list of dictionaries, with keys including 'regex' and 'mandatory'.
-    my_schema : list of dict
+    my_schema : Mapping
        Nested dictionaries representing the full schema.
""" - my_schema = schema.load_schema(schema_dir) - all_regex = load_entities( - my_schema=my_schema, - ) - top_level_regex = load_top_level( - my_schema=my_schema, - ) - all_regex.extend(top_level_regex) + schema = bst.schema.load_schema(schema_dir) + all_regex = [] + for group in (schema.rules.files.common, schema.rules.files.raw): + all_regex.extend(load_filename_rules(group, schema, level=2)) - return all_regex, my_schema + return all_regex, schema def validate_all( @@ -420,7 +362,7 @@ def validate_all( for regex_entry in tracking_schema: target_regex = regex_entry["regex"] lgr.debug("\t* `%s`, with pattern: `%`", target_path, target_regex) - matched = re.match(target_regex, target_path) + matched = re.match(r"(?:.*/)?" + target_regex, target_path) itemwise_result = {} itemwise_result["path"] = target_path itemwise_result["regex"] = target_regex @@ -697,72 +639,6 @@ def log_errors(validation_result): lgr.warning("The `%s` file was not matched by any regex schema entry.", i) -def _inheritance_expansion( - regex_string, - datatype=None, -): - """ - Generate regex strings applying BIDS inheritance expansion to an input string. - - Parameters - ---------- - regex_string : str - String representing the regex to which inheritance expansion should be applied. - datatype : str, optional - Datatype string to remove as part of inheritance expansion. - - Returns - ------- - expanded_regexes : list of str - """ - - # Order is important as the string is eroded. - # Session is eroded *together with* and *after* subject, as it is always optional - # and the erosion is: - # * only required if a dangling leading underscore is present after subject removal. - # * only BIDS-valid after the subject field is eroded from the filename. - expansions = [ - { - "regex": [ - r".*?(?Psub-\(\?P\[0\-9a\-zA\-Z\]\+\)/).*?", - r".*?(?Psub-\(\?P=subject\))", - r".*?/(?P\(\|ses-\(\?P\[0\-9a\-zA\-Z\]\+\)/\)\(\|_ses-\(" - r"\?P=session\)\)_).*?", - ], - "replace": ["", "", ""], - }, - ] - if datatype: - # Inserting at the beginning, since datatype goes first. - expansions.insert( - 0, - { - "regex": [ - f".*?(?P{datatype}/).*?", - ], - "replace": [ - "", - ], - }, - ) - - expanded_regexes = [] - lgr.debug("Applying inheritance expansion to:\n`%s`", regex_string) - for expansion in expansions: - modified = False - for ix, regex in enumerate(expansion["regex"]): - matched = re.match(regex, regex_string) - if matched: - matched = matched.groupdict()["remove"] - regex_string = regex_string.replace(matched, expansion["replace"][ix]) - modified = True - if modified: - expanded_regexes.append(regex_string) - lgr.debug("\t* Generated expansion:\n\t%s", regex_string) - - return expanded_regexes - - def _get_directory_suffixes(my_schema): """Query schema for suffixes which identify directory entities. @@ -876,7 +752,7 @@ def validate_bids( ) # Record schema version. - bids_version = schema._get_bids_version(bids_schema_dir) + bids_version = bst.schema._get_bids_version(bids_schema_dir) validation_result["bids_version"] = bids_version log_errors(validation_result)