diff --git a/src/schema/objects/formats.yaml b/src/schema/objects/formats.yaml new file mode 100644 index 0000000000..dab370d2c7 --- /dev/null +++ b/src/schema/objects/formats.yaml @@ -0,0 +1,115 @@ +--- +# This file defines valid patterns for different formats +# Entity patterns +index: + name: Index + description: | + Non-negative, non-zero integers, optionally prefixed with leading zeros for sortability. + An index may not be all zeros. + pattern: '[0-9]*[1-9]+[0-9]*' +label: + name: Label + description: | + Freeform labels without special characters. + pattern: '[0-9a-zA-Z]+' +# Metadata types +boolean: + name: Boolean + description: | + A boolean. + Must be either "true" or "false". + pattern: '(true|false)' +integer: + name: Integer + description: | + An integer which may be positive or negative. + pattern: '[+-]?\d+' +number: + name: Number + description: | + A number which may be an integer or float, positive or negative. + pattern: '[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?' +string: + name: String + description: | + The basic string type (not a specific format). + This should allow any free-form string. + pattern: '.*' +# String formats +bids_uri: + name: BIDS uniform resource identifier + description: | + A BIDS uniform resource identifier. + + The validation for this format is minimal. + It simply ensures that the value is a string with any characters that may appear in a valid URI, + starting with "bids:". + pattern: 'bids:[0-9a-zA-Z/#:\?\_\-\.]+' +dataset_relative: + name: Path relative to the BIDS dataset folder + description: | + A path to a file, relative to the dataset folder. + + The validation for this format is minimal. + It simply ensures that the value is a string with any characters that may appear in a valid path, + without starting with "/" (an absolute path). + pattern: '(?!/)[0-9a-zA-Z/\_\-\.]+' +date: + name: Date + description: | + A date in the form `"YYYY-MM-DD[Z]"`, + where [Z] is an optional, valid timezone code. 
+ pattern: '[0-9]{4}-[0-9]{2}-[0-9]{2}([A-Z]{2,4})?' +datetime: + name: Datetime + description: | + A datetime in the form `"YYYY-MM-DDThh:mm:ss[.000000][Z]"`, + where [.000000] is an optional subsecond resolution between 1 and 6 decimal places, + and [Z] is an optional, valid timezone code. + pattern: '[0-9]{4}-[0-9]{2}-[0-9]{2}T(?:2[0-3]|[01][0-9]):[0-5][0-9]:[0-5][0-9](\.[0-9]{1,6})?([A-Z]{2,4})?' +participant_relative: + name: Path relative to the participant folder + description: | + A path to a file, relative to the participant's folder in the dataset. + + The validation for this format is minimal. + It simply ensures that the value is a string with any characters that may appear in a valid path, + without starting with "/" (an absolute path) or "sub-" + (a relative path starting with the participant folder, rather than relative to that folder). + pattern: '(?!/)(?!sub-)[0-9a-zA-Z/\_\-\.]+' +rrid: + name: Research resource identifier + description: | + A [research resource identifier](https://scicrunch.org/resources). + pattern: 'RRID:.+_.+' +stimuli_relative: + name: Path relative to the stimuli folder + description: | + A path to a stimulus file, relative to a `/stimuli` folder somewhere. + + The validation for this format is minimal. + It simply ensures that the value is a string with any characters that may appear in a valid path, + without starting with "/" (an absolute path) or "stimuli/" + (a relative path starting with the stimuli folder, rather than relative to that folder). + pattern: '(?!/)(?!stimuli/)[0-9a-zA-Z/\_\-\.]+' +time: + name: Time + description: | + A time in the form `"hh:mm:ss"`; the leading zero of the hour is optional. + pattern: '(?:2[0-3]|[01]?[0-9]):[0-5][0-9]:[0-5][0-9]' +unit: + name: A standardized unit + description: | + A unit. + SI units in CMIXF formatting are RECOMMENDED + (see [Units](/02-common-principles.html#units)). + + Currently this matches any string. + + TODO: Somehow reference the actual unit options in the Units appendix. 
+ pattern: '.*' +uri: + name: Uniform resource identifier + description: | + A uniform resource identifier. + pattern: '(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?' diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml index 6fe9ecfc20..b6cb9d5a95 100644 --- a/src/schema/objects/metadata.yaml +++ b/src/schema/objects/metadata.yaml @@ -1684,7 +1684,7 @@ MolarActivityMeasTime: Time to which molar radioactivity measurement above applies in the default unit `"hh:mm:ss"`. type: string - pattern: ^(?:2[0-3]|[01][0-9]):[0-5][0-9]:[0-5][0-9]$ + format: time MolarActivityUnits: name: MolarActivityUnits description: | @@ -2447,7 +2447,7 @@ SoftwareRRID: Examples: The RRID for Psychtoolbox is 'SCR_002881', and that of PsychoPy is 'SCR_006571'. type: string - pattern: .+_.+ + format: rrid SoftwareVersion: name: SoftwareVersion description: | @@ -2535,7 +2535,7 @@ SpecificRadioactivityMeasTime: Time to which specific radioactivity measurement above applies in the default unit `"hh:mm:ss"`. type: string - pattern: ^(?:2[0-3]|[01][0-9]):[0-5][0-9]:[0-5][0-9]$ + format: time SpecificRadioactivityUnits: name: SpecificRadioactivityUnits description: | @@ -2654,7 +2654,7 @@ TimeZero: in the unit "hh:mm:ss". This should be equal to `InjectionStart` or `ScanStart`. 
type: string - pattern: ^(?:2[0-3]|[01][0-9]):[0-5][0-9]:[0-5][0-9]$ + format: time TissueDeformationScaling: name: TissueDeformationScaling description: | diff --git a/tools/schemacode/schemacode/tests/test_schema.py b/tools/schemacode/schemacode/tests/test_schema.py index 5d1400a778..9e169d0f1c 100644 --- a/tools/schemacode/schemacode/tests/test_schema.py +++ b/tools/schemacode/schemacode/tests/test_schema.py @@ -27,3 +27,111 @@ def test_object_definitions(schema_obj): assert "name" in obj_def.keys(), obj_key assert "description" in obj_def.keys(), obj_key + + +def test_formats(schema_obj): + """Test valid string patterns allowed by the specification.""" + import re + + # Check that valid strings match the search pattern. + GOOD_PATTERNS = { + "label": ["01", "test", "test01", "Test01"], + "index": ["01", "1", "10000", "00001"], + "string": ["any string is valid."], + "integer": ["5", "10", "-5", "-10"], + "number": [ + "5", # integers are allowed + "3.14", # floats too + "-5", # they can be negative + "-3.14", + "1e3", # scientific notation is allowed + "-2.1E+5", + ], + "boolean": ["true", "false"], + "date": ["2022-01-05", "2022-01-05UTC", "2022-50-50"], + "datetime": [ + "2022-01-05T13:16:30", + "2022-01-05T13:16:30.5", # subsecond resolution is allowed + "2022-01-05T13:16:30.000005", # up to 6 decimal points + "2022-01-05T13:16:30UTC", # timezones are allowed + "2022-01-05T13:16:30.05UTC", + ], + "time": [ + "13:16:30", + "09:00:00", + "9:00:00", # leading zeros are not required for hours + ], + "unit": ["any string is valid."], + "stimuli_relative": ["any/arbitrary/path/file.txt"], + "dataset_relative": ["any/arbitrary/path/file.txt"], + "participant_relative": ["any/arbitrary/path/file.txt"], + "rrid": ["RRID:SCR_017398"], + "uri": ["foo://example.com:8042/over/there?name=ferret#nose"], + "bids_uri": [ + "bids::sub-01/fmap/sub-01_dir-AP_epi.nii.gz", + "bids:ds000001:sub-02/anat/sub-02_T1w.nii.gz", + 
"bids:myderivatives:sub-03/func/sub-03_task-rest_space-MNI152_bold.nii.gz", + ], + } + for pattern, test_list in GOOD_PATTERNS.items(): + pattern_format = schema_obj["objects"]["formats"][pattern]["pattern"] + search_pattern = "^" + pattern_format + "$" + search = re.compile(search_pattern) + for test_string in test_list: + assert bool( + search.fullmatch(test_string) + ), f"'{test_string}' is not a valid match for the pattern '{search.pattern}'" + + # Check that invalid strings do not match the search pattern. + BAD_PATTERNS = { + "label": ["test_01", "!", "010101-", "01-01", "-01"], + "index": ["test", "0.1", "0-1", "0_1"], + "string": [], + "integer": ["3.14", "-3.14", "1.", "-1.", "string", "s1", "1%", "one"], + "number": ["string", "1%"], + "boolean": ["True", "False", "T", "F"], + "date": [ + "05-01-2022", # MM-DD-YYYY or DD-MM-YYYY + "05/01/2022", # MM/DD/YYYY or DD/MM/YYYY + ], + "datetime": [ + "2022-01-05T13:16:30.1000005", # too many decimal points + "2022-01-05T13:16:30U", # time zone too short + "2022-01-05T13:16:30UTCUTC", # time zone too long + "2022-01-05T34:10:10", # invalid time + ], + "time": [ + "34:10:10", # invalid time + "24:00:00", # should be 00:00:00 + "00:60:00", # should be 01:00:00 + "00:00:60", # should be 00:01:00 + "01:23", # lacks either hours or seconds + ], + "unit": [], + "stimuli_relative": [ + "/path/with/starting/slash/file.txt", + "stimuli/path/file.txt", + ], + "dataset_relative": [ + "/path/with/starting/slash/file.txt", + ], + "participant_relative": [ + "/path/with/starting/slash/file.txt", + "sub-01/path/file.txt", + ], + "rrid": [ + "RRID:", # empty one + ], + "uri": [ + # "ftp://", # lacks anything but protocol. This should fail, but doesn't ATM. 
+ ], + "bids_uri": [], + } + for pattern, test_list in BAD_PATTERNS.items(): + pattern_format = schema_obj["objects"]["formats"][pattern]["pattern"] + search_pattern = f"^{pattern_format}$" + search = re.compile(search_pattern) + for test_string in test_list: + assert not bool( + search.fullmatch(test_string) + ), f"'{test_string}' should not be a valid match for the pattern '{search.pattern}'"