Skip to content

Commit

Permalink
feat(cli): Add tool for filename validation for use in pre-receive ho…
Browse files Browse the repository at this point in the history
…oks (#1986)

* ENH: Create a pre-receive hook that accepts ignore rules and file listing

* FIX: Use consistent logger, set sensible default level, enable -q

* ENH: Determine dataset type from bundled dataset_description.json

* RF: Stream filename validation, rewrite protocol

The 0001 is a particular git-protocol-ism, and 0000 does not behave as I
expected. Instead of adding multiple 0001s and attempting to identify
the meaning, the new protocol precedes the old protocol with a header
line and a single JSON line.

* DOC: Update docstring

* Apply suggestions from code review

Co-authored-by: Nell Hardcastle <[email protected]>

---------

Co-authored-by: Nell Hardcastle <[email protected]>
  • Loading branch information
effigies and nellh authored Nov 13, 2024
1 parent 519279b commit bb139e9
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 4 deletions.
138 changes: 135 additions & 3 deletions tools/schemacode/bidsschematools/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import json
import logging
import os
import re
import sys
from itertools import chain

import click

Expand All @@ -9,15 +12,18 @@
else:
from importlib.resources import files


from .rules import regexify_filename_rules
from .schema import export_schema, load_schema
from .validator import _bidsignore_check


# Root command group; subcommands attach themselves via @cli.command().
# NOTE(review): this span is diff residue -- it appears to interleave the
# removed and added sides of the change; confirm against the real file.
@click.group()
@click.option("-v", "--verbose", count=True)
# Presumably the pre-change signature (no --quiet option).
def cli(verbose):
@click.option("-q", "--quiet", count=True)
# Presumably the post-change signature: -v and -q are both counted flags.
def cli(verbose, quiet):
"""BIDS Schema Tools"""
# Presumably the pre-change level: started at INFO, minus 10 per -v.
logging.getLogger("bidsschematools").setLevel(logging.INFO - verbose * 10)
# Each -q cancels one -v; the net count can be negative.
verbose = verbose - quiet
# Starts at WARNING and moves 10 per net flag: -v lowers the threshold
# (more output), -q raises it (less output).
logging.getLogger("bidsschematools").setLevel(logging.WARNING - verbose * 10)


@cli.command()
Expand Down Expand Up @@ -53,5 +59,131 @@ def export_metaschema(ctx, output):
fobj.write(metaschema)


@cli.command("pre-receive-hook")
@click.option("--schema", "-s", type=click.Path(), help="Path to the BIDS schema")
@click.option(
    "--input", "-i", "input_", default="-", type=click.Path(), help="Input file (default: stdin)"
)
@click.option(
    "--output",
    "-o",
    "output",
    default="-",
    type=click.Path(),
    help="Output file (default: stdout)",
)
def pre_receive_hook(schema, input_, output):
    """Validate filenames from a list of files against the BIDS schema

    The expected input takes the following form:

    ```
    bids-hook-v2
    {"Name": "My dataset", "BIDSVersion": "1.9.0", "DatasetType": "raw"}
    ignore-pattern1
    ...
    ignore-patternN
    0001
    .datalad/config
    .gitattributes
    CHANGES
    README
    dataset_description.json
    participants.tsv
    sub-01/anat/sub-01_T1w.nii.gz
    ...
    ```

    The header identifies the protocol version. For protocol ``bids-hook-v2``,
    the second line MUST be the dataset_description.json file, with any newlines removed.
    The following lines, up to the line containing "0001", are ignore patterns
    from the .bidsignore file. The lines following "0001" are the filenames to
    be validated.

    Input that does not start with the ``bids-hook-v2`` header is read as the
    legacy format: ignore patterns, "0001", then filenames. The dataset type
    then defaults to "raw".

    Filenames that match neither an ignore pattern nor a schema rule are
    written to the output, one per line.

    Exit status: 0 if every checked file is valid, 1 if at least one file is
    invalid, 2 on a protocol error or if no valid files were found.

    This is intended to be used in a git pre-receive hook.
    """
    logger = logging.getLogger("bidsschematools")
    schema = load_schema(schema)

    # Keep a handle on the real file object. In legacy mode the line iterator
    # becomes an itertools.chain, which has no close() method.
    source = sys.stdin if input_ == "-" else open(input_)
    try:
        first_line = next(source, None)
        if first_line is None:
            logger.critical("Protocol error: empty input")
            sys.exit(2)

        # Compare markers without the line terminator so CRLF input and a
        # final line lacking a newline are still recognized.
        if first_line.rstrip("\r\n") == "bids-hook-v2":
            # V2 format: header line, description JSON, followed by legacy format.
            # A missing description line yields "" and fails JSON parsing below.
            description_str = next(source, "")
            try:
                description = json.loads(description_str)
            except json.JSONDecodeError:
                description = None
            if not isinstance(description, dict):
                logger.critical("Protocol error: invalid JSON in description")
                logger.critical(
                    "Dataset description must be one JSON object, written to a single line"
                )
                logger.critical("Received: %s", description_str)
                sys.exit(2)
            lines = source
        else:
            # Legacy: ignore patterns, followed by "0001", followed by filenames.
            # Push the already-consumed first line back in front of the rest.
            lines = chain([first_line], source)
            description = {}

        dataset_type = description.get("DatasetType", "raw")
        logger.info("Dataset type: %s", dataset_type)

        ignore = []
        for line in lines:
            if line.rstrip("\r\n") == "0001":
                break
            ignore.append(line.strip())
        logger.info("Ignore patterns found: %d", len(ignore))

        all_rules = chain.from_iterable(
            regexify_filename_rules(group, schema, level=2)
            for group in (schema.rules.files.common, schema.rules.files.raw)
        )
        if dataset_type == "derivative":
            all_rules = chain(
                all_rules,
                regexify_filename_rules(schema.rules.files.derivatives, schema, level=2),
            )

        # Compile once up front; every filename is tested against every rule.
        patterns = [re.compile(rule["regex"]) for rule in all_rules]
        # XXX Hack for phenotype files - this can be removed once we
        # have a schema definition for them
        patterns.append(re.compile(r"phenotype/.*\.(tsv|json)"))

        sink = sys.stdout if output == "-" else open(output, "w")

        rc = 0
        any_files = False
        valid_files = 0
        with sink:
            for filename in lines:
                filename = filename.strip()
                if not any_files:
                    logger.debug("Validating files, first file: %s", filename)
                    any_files = True
                if any(_bidsignore_check(pattern, filename, "") for pattern in ignore):
                    continue
                if not any(pattern.match(filename) for pattern in patterns):
                    print(filename, file=sink)
                    rc = 1
                else:
                    valid_files += 1

        # This also overrides rc == 1 when every listed file was invalid or ignored.
        if valid_files == 0:
            logger.error("No files to validate")
            rc = 2
    finally:
        # Runs on every path, including the sys.exit() calls above.
        source.close()

    sys.exit(rc)


# Invoke the click command group when this module is executed as a script.
if __name__ == "__main__":
cli()
2 changes: 1 addition & 1 deletion tools/schemacode/bidsschematools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_logger(name=None):
logging.Logger
logger object.
"""
return logging.getLogger("bids-schema" + (".%s" % name if name else ""))
return logging.getLogger("bidsschematools" + (".%s" % name if name else ""))


def set_logger_level(lgr, level):
Expand Down

0 comments on commit bb139e9

Please sign in to comment.