From f7d2af053cdeec12758f6bc9777b2e1c181deb2c Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Thu, 8 Feb 2024 15:52:48 -0500 Subject: [PATCH 1/6] ENH: Create a pre-receive hook that accepts ignore rules and file listing --- tools/schemacode/bidsschematools/__main__.py | 62 +++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index a250e37cab..61197307d2 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -1,6 +1,8 @@ import logging import os +import re import sys +from itertools import chain import click @@ -9,8 +11,9 @@ else: from importlib.resources import files - +from .rules import regexify_filename_rules from .schema import export_schema, load_schema +from .validator import _bidsignore_check @click.group() @@ -53,5 +56,62 @@ def export_metaschema(ctx, output): fobj.write(metaschema) +@cli.command("pre-receive-hook") +@click.option("--schema", "-s", type=click.Path(), help="Path to the BIDS schema") +@click.option( + "--input", "-i", "input_", default="-", type=click.Path(), help="Input file (default: stdin)" +) +@click.option( + "--output", + "-o", + "output", + default="-", + type=click.Path(), + help="Output file (default: stdout)", +) +def pre_receive_hook(schema, input_, output): + """Validate filenames from a list of files against the BIDS schema + + The input should be a list of ignore patterns followed by a line containing + "0001" and then a list of filenames. The output will be a list of filenames + that do not match the schema. + + This is intended to be used in a git pre-receive hook. 
+ """ + # Slurp inputs for now; we can think about streaming later + if input_ == "-": + lines = sys.stdin.readlines() + else: + with open(input_) as fobj: + lines = fobj.readlines() + + split = lines.index("0001\n") + ignore = [line.rstrip() for line in lines[:split]] + filenames = [line.rstrip() for line in lines[split + 1 :]] + + schema = load_schema(schema) + all_rules = chain.from_iterable( + regexify_filename_rules(group, schema, level=2) + for group in (schema.rules.files.common, schema.rules.files.raw) + ) + regexes = [rule["regex"] for rule in all_rules] + # XXX Hack for phenotype files - this can be removed once we + # have a schema definition for them + regexes.append(r"phenotype/.*\.tsv") + + output = sys.stdout if output == "-" else open(output, "w") + + rc = 0 + with output: + for filename in filenames: + if any(_bidsignore_check(pattern, filename, "") for pattern in ignore): + continue + if not any(re.match(regex, filename) for regex in regexes): + output.write(f"{filename}\n") + rc = 1 + + sys.exit(rc) + + if __name__ == "__main__": cli() From 10467bef2e417162795acd5a4b6c66a0841e72a0 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Thu, 8 Feb 2024 20:00:58 -0500 Subject: [PATCH 2/6] FIX: Use consistent logger, set sensible default level, enable -q --- tools/schemacode/bidsschematools/__main__.py | 6 ++++-- tools/schemacode/bidsschematools/utils.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 61197307d2..7f0416f489 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -18,9 +18,11 @@ @click.group() @click.option("-v", "--verbose", count=True) -def cli(verbose): +@click.option("-q", "--quiet", count=True) +def cli(verbose, quiet): """BIDS Schema Tools""" - logging.getLogger("bidsschematools").setLevel(logging.INFO - verbose * 10) + verbose = verbose - quiet + 
logging.getLogger("bidsschematools").setLevel(logging.WARNING - verbose * 10) @cli.command() diff --git a/tools/schemacode/bidsschematools/utils.py b/tools/schemacode/bidsschematools/utils.py index ec2947fb37..6f9850edb0 100644 --- a/tools/schemacode/bidsschematools/utils.py +++ b/tools/schemacode/bidsschematools/utils.py @@ -29,7 +29,7 @@ def get_logger(name=None): logging.Logger logger object. """ - return logging.getLogger("bids-schema" + (".%s" % name if name else "")) + return logging.getLogger("bidsschematools" + (".%s" % name if name else "")) def set_logger_level(lgr, level): From 26c64d8b71a48cc573242b3be24108485153260b Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Wed, 28 Feb 2024 19:43:28 -0500 Subject: [PATCH 3/6] ENH: Determine dataset type from bundled dataset_description.json --- tools/schemacode/bidsschematools/__main__.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 7f0416f489..ff8e942d0e 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -1,3 +1,4 @@ +import json import logging import os import re @@ -88,14 +89,30 @@ def pre_receive_hook(schema, input_, output): lines = fobj.readlines() split = lines.index("0001\n") - ignore = [line.rstrip() for line in lines[:split]] + preamble = [line.rstrip() for line in lines[:split]] filenames = [line.rstrip() for line in lines[split + 1 :]] + try: + split = preamble.index("0000") + except ValueError: + description = {} + ignore = preamble + else: + description = json.loads("".join(preamble[:split])) + ignore = preamble[split + 1 :] + + dataset_type = description.get("DatasetType", "raw") schema = load_schema(schema) all_rules = chain.from_iterable( regexify_filename_rules(group, schema, level=2) for group in (schema.rules.files.common, schema.rules.files.raw) ) + if dataset_type == "derivative": + 
+ all_rules = chain( + all_rules, + regexify_filename_rules(schema.rules.files.derivatives, schema, level=2), + ) + regexes = [rule["regex"] for rule in all_rules] # XXX Hack for phenotype files - this can be removed once we # have a schema definition for them From 7cc6dd345a72b9e19132288da470fc37c2d6a14f Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Fri, 1 Mar 2024 12:12:42 -0500 Subject: [PATCH 4/6] RF: Stream filename validation, rewrite protocol The 0001 is a particular git-protocol-ism, and 0000 does not behave as I expected. Instead of adding multiple 0001s and attempting to identify the meaning, the new protocol precedes the old protocol with a header line and a single JSON line. --- tools/schemacode/bidsschematools/__main__.py | 63 ++++++++++++++------ 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index ff8e942d0e..75df8dd2c1 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -81,26 +81,42 @@ def pre_receive_hook(schema, input_, output): This is intended to be used in a git pre-receive hook. 
""" + logger = logging.getLogger("bidsschematools") + schema = load_schema(schema) + # Slurp inputs for now; we can think about streaming later if input_ == "-": - lines = sys.stdin.readlines() + stream = sys.stdin else: - with open(input_) as fobj: - lines = fobj.readlines() - - split = lines.index("0001\n") - preamble = [line.rstrip() for line in lines[:split]] - filenames = [line.rstrip() for line in lines[split + 1 :]] - try: - split = preamble.index("0000") - except ValueError: - description = {} - ignore = preamble + stream = open(input_) + + first_line = next(stream) + if first_line == "bids-hook-v2\n": + # V2 format: header line, description JSON, followed by legacy format + description_str = next(stream) + fail = False + try: + description: dict = json.loads(description_str) + except json.JSONDecodeError: + fail = True + if fail or not isinstance(description, dict): + logger.critical("Protocol error: invalid JSON in description") + logger.critical( + "Dataset description must be one JSON object, written to a single line" + ) + logger.critical("Received: %s", description_str) + stream.close() + sys.exit(2) else: - description = json.loads("".join(preamble[:split])) - ignore = preamble[split + 1 :] + # Legacy: ignore patterns, followed by "0001", followed by filenames + stream = chain([first_line], stream) + description = {} dataset_type = description.get("DatasetType", "raw") + logger.info("Dataset type: %s", dataset_type) + + ignore = [line.strip() for line in stream if line != "0001\n"] + logger.info("Ignore patterns found: %d", len(ignore)) schema = load_schema(schema) all_rules = chain.from_iterable( @@ -116,19 +132,32 @@ def pre_receive_hook(schema, input_, output): regexes = [rule["regex"] for rule in all_rules] # XXX Hack for phenotype files - this can be removed once we # have a schema definition for them - regexes.append(r"phenotype/.*\.tsv") + regexes.append(r"phenotype/.*\.(tsv|json)") output = sys.stdout if output == "-" else open(output, "w") 
rc = 0 + any_files = False + valid_files = 0 with output: - for filename in filenames: + for filename in stream: + if not any_files: + logger.debug("Validating files, first file: %s", filename) + any_files = True + filename = filename.strip() if any(_bidsignore_check(pattern, filename, "") for pattern in ignore): continue if not any(re.match(regex, filename) for regex in regexes): - output.write(f"{filename}\n") + print(filename, file=output) rc = 1 + else: + valid_files += 1 + + if valid_files == 0: + logger.error("No files to validate") + rc = 2 + stream.close() sys.exit(rc) From daf2fe837b9846297626a1d620e4ef039f7a9486 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Sat, 23 Mar 2024 14:13:49 -0400 Subject: [PATCH 5/6] DOC: Update docstring --- tools/schemacode/bidsschematools/__main__.py | 27 +++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 75df8dd2c1..89248c35b2 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -75,9 +75,30 @@ def export_metaschema(ctx, output): def pre_receive_hook(schema, input_, output): """Validate filenames from a list of files against the BIDS schema - The input should be a list of ignore patterns followed by a line containing - "0001" and then a list of filenames. The output will be a list of filenames - that do not match the schema. + The expected input takes the following form: + + ``` + bids-hook-v2 + {"Name": "My dataset", "BIDSVersion": "1.9.0", "DatasetType": "raw"} + ignore-pattern1 + ... + ignore-patternN + 0001 + .datalad/config + .gitattributes + CHANGES + README + dataset_description.json + participants.tsv + sub-01/anat/sub-01_T1w.nii.gz + ... + ``` + + The header identifies the protocol version. For protocol ``bids-hook-v2``, + the second line MUST be the dataset_description.json file, with any newlines removed. 
+ The following lines, up to the line containing "0001", are ignore patterns + from the .bidsignore file. The lines following "0001" are the filenames to + be validated. This is intended to be used in a git pre-receive hook. """ From 19bc8ece7d700a083dcb539395fa104b9d0da2a8 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Wed, 13 Nov 2024 15:32:49 -0500 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Nell Hardcastle --- tools/schemacode/bidsschematools/__main__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 89248c35b2..777bea2d87 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -136,10 +136,13 @@ def pre_receive_hook(schema, input_, output): dataset_type = description.get("DatasetType", "raw") logger.info("Dataset type: %s", dataset_type) - ignore = [line.strip() for line in stream if line != "0001\n"] + ignore = [] + for line in stream: + if line == "0001\n": + break + ignore.append(line.strip()) logger.info("Ignore patterns found: %d", len(ignore)) - schema = load_schema(schema) all_rules = chain.from_iterable( regexify_filename_rules(group, schema, level=2) for group in (schema.rules.files.common, schema.rules.files.raw)