diff --git a/.github/workflows/add-to-dashboard.yml b/.github/workflows/add-to-dashboard.yml index 0fc3d2a09..e72d98965 100644 --- a/.github/workflows/add-to-dashboard.yml +++ b/.github/workflows/add-to-dashboard.yml @@ -1,35 +1,19 @@ -name: Add Issue or PR to Dashboard +name: Add Issue or Pull Request to Dashboard on: issues: - types: opened - + types: + - opened pull_request: types: - opened jobs: - issue_opened: - name: Add Issue to Dashboard - runs-on: ubuntu-latest - if: github.event_name == 'issues' - steps: - - name: Add Issue to Dashboard - uses: leonsteinhaeuser/project-beta-automations@v1.2.1 - with: - gh_token: ${{ secrets.MY_GITHUB_TOKEN }} - organization: catalystneuro - project_id: 3 - resource_node_id: ${{ github.event.issue.node_id }} - pr_opened: - name: Add PR to Dashboard + add-to-project: + name: Add issue or pull request to project runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.event.action == 'opened' steps: - - name: Add PR to Dashboard - uses: leonsteinhaeuser/project-beta-automations@v1.2.1 + - uses: actions/add-to-project@v0.5.0 with: - gh_token: ${{ secrets.MY_GITHUB_TOKEN }} - organization: catalystneuro - project_id: 3 - resource_node_id: ${{ github.event.pull_request.node_id }} + project-url: https://github.com/orgs/catalystneuro/projects/3 + github-token: ${{ secrets.PROJECT_TOKEN }} diff --git a/.github/workflows/dev-testing.yml b/.github/workflows/dev-testing.yml index 5b6f02fe9..9ad54911b 100644 --- a/.github/workflows/dev-testing.yml +++ b/.github/workflows/dev-testing.yml @@ -18,7 +18,7 @@ env: jobs: run: - name: Dev Branch Testing with Python 3.9 and ubuntu-latest + name: Ubuntu tests with Python ${{ matrix.python-version }} runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index c65557029..f0415337e 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -4,7 +4,7 @@ on: jobs: run: - name: Doctests on ${{ matrix.os }} with Python ${{ matrix.python-version }} + name: ${{ matrix.os }} Python ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/formatwise-installation-testing.yml b/.github/workflows/formatwise-installation-testing.yml index 953a1af17..54ba21b7c 100644 --- a/.github/workflows/formatwise-installation-testing.yml +++ b/.github/workflows/formatwise-installation-testing.yml @@ -6,7 +6,7 @@ on: jobs: run: - name: Formatwise gallery tests for ${{ format.type }}:${{ format.name }} on ${{ matrix.os }} with Python ${{ matrix.python-version }} + name: ${{ format.type }}:${{ format.name }} on ${{ matrix.os }} with Python ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/live-service-testing.yml b/.github/workflows/live-service-testing.yml index a5764b228..8862e3cf3 100644 --- a/.github/workflows/live-service-testing.yml +++ b/.github/workflows/live-service-testing.yml @@ -12,7 +12,7 @@ env: jobs: run: - name: Live service testing on ${{ matrix.os }} with Python ${{ matrix.python-version }} + name: ${{ matrix.os }} Python ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index a08183723..3a930fa4b 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -14,7 +14,7 @@ on: jobs: run: - name: Minimal and full tests on ${{ matrix.os }} with Python ${{ matrix.python-version 
}} + name: ${{ matrix.os }} Python ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/CHANGELOG.md b/CHANGELOG.md index c17152260..1ef34120b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,21 @@ * Changed the metadata schema for `Fluorescence` and `DfOverF` where the traces metadata can be provided as a dict instead of a list of dicts. The name of the plane segmentation is used to determine which traces to add to the `Fluorescence` and `DfOverF` containers. [PR #632](https://github.com/catalystneuro/neuroconv/pull/632) * Modify the filtering of traces to also filter out traces with empty values. [PR #649](https://github.com/catalystneuro/neuroconv/pull/649) +* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk; and return instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569) * Added `set_probe()` method to `BaseRecordingExtractorInterface`. [PR #639](https://github.com/catalystneuro/neuroconv/pull/639) ### Fixes * Fixed GenericDataChunkIterator (in hdmf.py) in the case where the number of dimensions is 1 and the size in bytes is greater than the threshold of 1 GB. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638) * Changed `np.floor` and `np.prod` usage to `math.floor` and `math.prod` in various files. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638) +* Updated minimal required version of DANDI CLI; updated `run_conversion_from_yaml` API function and tests to be compatible with naming changes. [PR #664](https://github.com/catalystneuro/neuroconv/pull/664) -# v0.4.5 +### Improvements +* Change metadata extraction library from `fparse` to `parse`. [PR #654](https://github.com/catalystneuro/neuroconv/pull/654) +* The `dandi` CLI/API is now an optional dependency; it is still required to use the `tool` function for automated upload as well as the YAML-based NeuroConv CLI. [PR #655](https://github.com/catalystneuro/neuroconv/pull/655) + + + +# v0.4.5 (November 6, 2023) ### Back-compatibility break * The `CEDRecordingInterface` has now been removed; use the `Spike2RecordingInterface` instead. [PR #602](https://github.com/catalystneuro/neuroconv/pull/602) diff --git a/docs/developer_guide/testing_suite.rst b/docs/developer_guide/testing_suite.rst index f9d044040..73631da32 100644 --- a/docs/developer_guide/testing_suite.rst +++ b/docs/developer_guide/testing_suite.rst @@ -20,7 +20,7 @@ Then install all required and optional dependencies in a fresh environment. .. code:: bash - pip install -e . neuroconv[test,full] + pip install -e .[test,full] Then simply run all tests with pytest @@ -29,6 +29,10 @@ Then simply run all tests with pytest pytest +.. note:: + + You will likely observe many failed tests if the test data is not available. See the section 'Testing on Example Data' for instructions on how to download the test data. 
+ Minimal ------- diff --git a/requirements-minimal.txt b/requirements-minimal.txt index 7acc23dc2..a665b2b21 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -6,8 +6,9 @@ h5py>=3.9.0 hdmf>=3.11.0 hdmf_zarr>=0.4.0 pynwb>=2.3.2;python_version>='3.8' +nwbinspector>=0.4.31 +pydantic>=1.10.13,<2.0.0 psutil>=5.8.0 tqdm>=4.60.0 -dandi>=0.57.0 pandas -fparse +parse>=1.20.0 diff --git a/requirements-testing.txt b/requirements-testing.txt index 24db85e3e..6aa116be1 100644 --- a/requirements-testing.txt +++ b/requirements-testing.txt @@ -2,6 +2,5 @@ pytest pytest-cov ndx-events>=0.2.0 # for special tests to ensure load_namespaces is set to allow NWBFile load at all timess parameterized>=0.8.1 -scikit-learn # For SI Waveform tests -numba; python_version <= '3.10' # For SI Waveform tests ndx-miniscope +spikeinterface[qualitymetrics]>=0.99.1 diff --git a/setup.py b/setup.py index 0c9541967..7ff18db27 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,9 @@ testing_suite_dependencies = f.readlines() extras_require = defaultdict(list) +extras_require["dandi"].append("dandi>=0.58.1") +extras_require["full"].extend(extras_require["dandi"]) + extras_require.update(test=testing_suite_dependencies, docs=documentation_dependencies) for modality in ["ophys", "ecephys", "icephys", "behavior", "text"]: modality_path = root / "src" / "neuroconv" / "datainterfaces" / modality @@ -75,7 +78,7 @@ extras_require=extras_require, entry_points={ "console_scripts": [ - "neuroconv = neuroconv.tools.yaml_conversion_specification.yaml_conversion_specification:run_conversion_from_yaml_cli", + "neuroconv = neuroconv.tools.yaml_conversion_specification._yaml_conversion_specification:run_conversion_from_yaml_cli", ], }, license="BSD-3-Clause", diff --git a/src/neuroconv/datainterfaces/ecephys/requirements.txt b/src/neuroconv/datainterfaces/ecephys/requirements.txt index 891ebb813..c88aa47a9 100644 --- a/src/neuroconv/datainterfaces/ecephys/requirements.txt +++ b/src/neuroconv/datainterfaces/ecephys/requirements.txt @@ -1,2 +1,2 @@ -spikeinterface>=0.98.2 +spikeinterface>=0.99.1 packaging<22.0 diff --git a/src/neuroconv/tools/__init__.py b/src/neuroconv/tools/__init__.py index 4a1a637d2..dac0abb9f 100644 --- a/src/neuroconv/tools/__init__.py +++ b/src/neuroconv/tools/__init__.py @@ -1,3 +1,4 @@ +"""Collection of all helper functions that require at least one external dependency (some being optional as well).""" from .importing import get_package from .nwb_helpers import get_module from .path_expansion import LocalPathExpander diff --git a/src/neuroconv/tools/data_transfers/__init__.py b/src/neuroconv/tools/data_transfers/__init__.py new file mode 100644 index 000000000..03c2dc980 --- /dev/null +++ b/src/neuroconv/tools/data_transfers/__init__.py @@ -0,0 +1,5 @@ +"""Collection of helper functions for assessing and performing automated data transfers.""" +from ._aws import estimate_s3_conversion_cost +from ._dandi import automatic_dandi_upload +from ._globus import get_globus_dataset_content_sizes, transfer_globus_content +from ._helpers import estimate_total_conversion_runtime diff --git a/src/neuroconv/tools/data_transfers/_aws.py b/src/neuroconv/tools/data_transfers/_aws.py new file mode 100644 index 000000000..66f213e01 --- /dev/null +++ b/src/neuroconv/tools/data_transfers/_aws.py @@ -0,0 +1,34 @@ +"""Collection of helper functions for assessing and performing automated data transfers related to AWS.""" + + +def estimate_s3_conversion_cost( + total_mb: float, + transfer_rate_mb: float = 20.0, + 
conversion_rate_mb: float = 17.0, + upload_rate_mb: float = 40.0, + compression_ratio: float = 1.7, +): + """ + Estimate potential cost of performing an entire conversion on S3 using full automation. + + Parameters + ---------- + total_mb: float + The total amount of data (in MB) that will be transferred, converted, and uploaded to dandi. + transfer_rate_mb : float, default: 20.0 + Estimate of the transfer rate for the data. + conversion_rate_mb : float, default: 17.0 + Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. + Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. + upload_rate_mb : float, default: 40.0 + Estimate of the upload rate of a single file to the DANDI Archive. + compression_ratio : float, default: 1.7 + Estimate of the final average compression ratio for datasets in the file. Can vary widely. + """ + c = 1 / compression_ratio # compressed_size = total_size * c + total_mb_s = ( + total_mb**2 / 2 * (1 / transfer_rate_mb + (2 * c + 1) / conversion_rate_mb + 2 * c**2 / upload_rate_mb) + ) + cost_gb_m = 0.08 / 1e3 # $0.08 / GB Month + cost_mb_s = cost_gb_m / (1e3 * 2.628e6) # assuming 30 day month; unsure how amazon weights shorter months? + return cost_mb_s * total_mb_s diff --git a/src/neuroconv/tools/data_transfers/_dandi.py b/src/neuroconv/tools/data_transfers/_dandi.py new file mode 100644 index 000000000..e411f36e2 --- /dev/null +++ b/src/neuroconv/tools/data_transfers/_dandi.py @@ -0,0 +1,118 @@ +"""Collection of helper functions for assessing and performing automated data transfers for the DANDI archive.""" +import os +from pathlib import Path +from shutil import rmtree +from tempfile import mkdtemp +from typing import Union +from warnings import warn + +from pynwb import NWBHDF5IO + +from ...utils import FolderPathType, OptionalFolderPathType + + +def automatic_dandi_upload( + dandiset_id: str, + nwb_folder_path: FolderPathType, + dandiset_folder_path: OptionalFolderPathType = None, + version: str = "draft", + staging: bool = False, + cleanup: bool = False, + number_of_jobs: Union[int, None] = None, + number_of_threads: Union[int, None] = None, +): + """ + Fully automated upload of NWBFiles to a DANDISet. + + Requires an API token set as an envrinment variable named DANDI_API_KEY. + + To set this in your bash terminal in Linux or MacOS, run + export DANDI_API_KEY=... + or in Windows + set DANDI_API_KEY=... + + DO NOT STORE THIS IN ANY PUBLICLY SHARED CODE. + + Parameters + ---------- + dandiset_id : str + Six-digit string identifier for the DANDISet the NWBFiles will be uploaded to. + nwb_folder_path : folder path + Folder containing the NWBFiles to be uploaded. + dandiset_folder_path : folder path, optional + A separate folder location within which to download the dandiset. + Used in cases where you do not have write permissions for the parent of the 'nwb_folder_path' directory. + Default behavior downloads the DANDISet to a folder adjacent to the 'nwb_folder_path'. + version : {None, "draft", "version"} + The default is "draft". + staging : bool, default: False + Is the DANDISet hosted on the staging server? This is mostly for testing purposes. + The default is False. + cleanup : bool, default: False + Whether to remove the dandiset folder path and nwb_folder_path. + Defaults to False. + number_of_jobs : int, optional + The number of jobs to use in the DANDI upload process. 
+ number_of_threads : int, optional + The number of threads to use in the DANDI upload process. + """ + from dandi.download import download as dandi_download + from dandi.organize import organize as dandi_organize + from dandi.upload import upload as dandi_upload + + assert os.getenv("DANDI_API_KEY"), ( + "Unable to find environment variable 'DANDI_API_KEY'. " + "Please retrieve your token from DANDI and set this environment variable." + ) + + dandiset_folder_path = ( + Path(mkdtemp(dir=nwb_folder_path.parent)) if dandiset_folder_path is None else dandiset_folder_path + ) + dandiset_path = dandiset_folder_path / dandiset_id + # Odd big of logic upstream: https://github.com/dandi/dandi-cli/blob/master/dandi/cli/cmd_upload.py#L92-L96 + if number_of_threads is not None and number_of_threads > 1 and number_of_jobs is None: + number_of_jobs = -1 + + url_base = "https://gui-staging.dandiarchive.org" if staging else "https://dandiarchive.org" + dandiset_url = f"{url_base}/dandiset/{dandiset_id}/{version}" + dandi_download(urls=dandiset_url, output_dir=str(dandiset_folder_path), get_metadata=True, get_assets=False) + assert dandiset_path.exists(), "DANDI download failed!" + + # TODO: need PR on DANDI to expose number of jobs + dandi_organize( + paths=str(nwb_folder_path), dandiset_path=str(dandiset_path), devel_debug=True if number_of_jobs == 1 else False + ) + organized_nwbfiles = dandiset_path.rglob("*.nwb") + + # DANDI has yet to implement forcing of session_id inclusion in organize step + # This manually enforces it when only a single session per subject is organized + for organized_nwbfile in organized_nwbfiles: + if "ses" not in organized_nwbfile.stem: + with NWBHDF5IO(path=organized_nwbfile, mode="r") as io: + nwbfile = io.read() + session_id = nwbfile.session_id + dandi_stem = organized_nwbfile.stem + dandi_stem_split = dandi_stem.split("_") + dandi_stem_split.insert(1, f"ses-{session_id}") + corrected_name = "_".join(dandi_stem_split) + ".nwb" + organized_nwbfile.rename(organized_nwbfile.parent / corrected_name) + organized_nwbfiles = dandiset_path.rglob("*.nwb") + # The above block can be removed once they add the feature + + assert len(list(dandiset_path.iterdir())) > 1, "DANDI organize failed!" + + dandi_instance = "dandi-staging" if staging else "dandi" # Test + dandi_upload( + paths=[str(x) for x in organized_nwbfiles], + dandi_instance=dandi_instance, + jobs=number_of_jobs, + jobs_per_file=number_of_threads, + ) + + # Cleanup should be confirmed manually; Windows especially can complain + if cleanup: + try: + rmtree(path=dandiset_folder_path) + rmtree(path=nwb_folder_path) + except PermissionError: # pragma: no cover + warn("Unable to clean up source files and dandiset! 
Please manually delete them.") diff --git a/src/neuroconv/tools/data_transfers.py b/src/neuroconv/tools/data_transfers/_globus.py similarity index 51% rename from src/neuroconv/tools/data_transfers.py rename to src/neuroconv/tools/data_transfers/_globus.py index db26ef598..907a89f71 100644 --- a/src/neuroconv/tools/data_transfers.py +++ b/src/neuroconv/tools/data_transfers/_globus.py @@ -1,29 +1,17 @@ """Collection of helper functions for assessing and performing automated data transfers.""" import json -import os import re from pathlib import Path -from shutil import rmtree -from tempfile import mkdtemp from time import sleep, time -from typing import Dict, List, Optional, Tuple, Union -from warnings import warn +from typing import Dict, List, Tuple, Union -from dandi.download import download as dandi_download -from dandi.organize import organize as dandi_organize -from dandi.upload import upload as dandi_upload -from pynwb import NWBHDF5IO +from nwbinspector.utils import is_module_installed +from pydantic import DirectoryPath from tqdm import tqdm -from .processes import deploy_process -from ..utils import FolderPathType, OptionalFolderPathType +from ..processes import deploy_process -try: # pragma: no cover - import globus_cli - - HAVE_GLOBUS = True -except ModuleNotFoundError: - HAVE_GLOBUS = False +HAVE_GLOBUS = is_module_installed(module_name="globus_cli") def get_globus_dataset_content_sizes( @@ -50,7 +38,7 @@ def transfer_globus_content( source_endpoint_id: str, source_files: Union[str, List[List[str]]], destination_endpoint_id: str, - destination_folder: FolderPathType, + destination_folder: DirectoryPath, display_progress: bool = True, progress_update_rate: float = 60.0, progress_update_timeout: float = 600.0, @@ -203,167 +191,3 @@ def _track_transfer( progress_update_timeout=progress_update_timeout, ) return success, list(task_total_sizes) - - -def estimate_total_conversion_runtime( - total_mb: float, - transfer_rate_mb: float = 20.0, - conversion_rate_mb: float = 17.0, - upload_rate_mb: float = 40, - compression_ratio: float = 1.7, -): - """ - Estimate how long the combined process of data transfer, conversion, and upload is expected to take. - - Parameters - ---------- - total_mb: float - The total amount of data (in MB) that will be transferred, converted, and uploaded to dandi. - transfer_rate_mb : float, default: 20.0 - Estimate of the transfer rate for the data. - conversion_rate_mb : float, default: 17.0 - Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. - Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. - upload_rate_mb : float, default: 40.0 - Estimate of the upload rate of a single file to the DANDI archive. - compression_ratio : float, default: 1.7 - Estimate of the final average compression ratio for datasets in the file. Can vary widely. - """ - c = 1 / compression_ratio # compressed_size = total_size * c - return total_mb * (1 / transfer_rate_mb + 1 / conversion_rate_mb + c / upload_rate_mb) - - -def estimate_s3_conversion_cost( - total_mb: float, - transfer_rate_mb: float = 20.0, - conversion_rate_mb: float = 17.0, - upload_rate_mb: float = 40.0, - compression_ratio: float = 1.7, -): - """ - Estimate potential cost of performing an entire conversion on S3 using full automation. - - Parameters - ---------- - total_mb: float - The total amount of data (in MB) that will be transferred, converted, and uploaded to dandi. 
- transfer_rate_mb : float, default: 20.0 - Estimate of the transfer rate for the data. - conversion_rate_mb : float, default: 17.0 - Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. - Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. - upload_rate_mb : float, default: 40.0 - Estimate of the upload rate of a single file to the DANDI Archive. - compression_ratio : float, default: 1.7 - Estimate of the final average compression ratio for datasets in the file. Can vary widely. - """ - c = 1 / compression_ratio # compressed_size = total_size * c - total_mb_s = ( - total_mb**2 / 2 * (1 / transfer_rate_mb + (2 * c + 1) / conversion_rate_mb + 2 * c**2 / upload_rate_mb) - ) - cost_gb_m = 0.08 / 1e3 # $0.08 / GB Month - cost_mb_s = cost_gb_m / (1e3 * 2.628e6) # assuming 30 day month; unsure how amazon weights shorter months? - return cost_mb_s * total_mb_s - - -def automatic_dandi_upload( - dandiset_id: str, - nwb_folder_path: FolderPathType, - dandiset_folder_path: OptionalFolderPathType = None, - version: str = "draft", - staging: bool = False, - cleanup: bool = False, - number_of_jobs: Optional[int] = None, - number_of_threads: Optional[int] = None, -): - """ - Fully automated upload of NWBFiles to a DANDISet. - - Requires an API token set as an envrinment variable named DANDI_API_KEY. - - To set this in your bash terminal in Linux or MacOS, run - export DANDI_API_KEY=... - or in Windows - set DANDI_API_KEY=... - - DO NOT STORE THIS IN ANY PUBLICLY SHARED CODE. - - Parameters - ---------- - dandiset_id : str - Six-digit string identifier for the DANDISet the NWBFiles will be uploaded to. - nwb_folder_path : folder path - Folder containing the NWBFiles to be uploaded. - dandiset_folder_path : folder path, optional - A separate folder location within which to download the dandiset. - Used in cases where you do not have write permissions for the parent of the 'nwb_folder_path' directory. - Default behavior downloads the DANDISet to a folder adjacent to the 'nwb_folder_path'. - version : {None, "draft", "version"} - The default is "draft". - staging : bool, default: False - Is the DANDISet hosted on the staging server? This is mostly for testing purposes. - The default is False. - cleanup : bool, default: False - Whether to remove the dandiset folder path and nwb_folder_path. - Defaults to False. - number_of_jobs : int, optional - The number of jobs to use in the DANDI upload process. - number_of_threads : int, optional - The number of threads to use in the DANDI upload process. - """ - assert os.getenv("DANDI_API_KEY"), ( - "Unable to find environment variable 'DANDI_API_KEY'. " - "Please retrieve your token from DANDI and set this environment variable." - ) - - dandiset_folder_path = ( - Path(mkdtemp(dir=nwb_folder_path.parent)) if dandiset_folder_path is None else dandiset_folder_path - ) - dandiset_path = dandiset_folder_path / dandiset_id - # Odd big of logic upstream: https://github.com/dandi/dandi-cli/blob/master/dandi/cli/cmd_upload.py#L92-L96 - if number_of_threads is not None and number_of_threads > 1 and number_of_jobs is None: - number_of_jobs = -1 - - url_base = "https://gui-staging.dandiarchive.org" if staging else "https://dandiarchive.org" - dandiset_url = f"{url_base}/dandiset/{dandiset_id}/{version}" - dandi_download(urls=dandiset_url, output_dir=str(dandiset_folder_path), get_metadata=True, get_assets=False) - assert dandiset_path.exists(), "DANDI download failed!" 
- - # TODO: need PR on DANDI to expose number of jobs - dandi_organize( - paths=str(nwb_folder_path), dandiset_path=str(dandiset_path), devel_debug=True if number_of_jobs == 1 else False - ) - organized_nwbfiles = dandiset_path.rglob("*.nwb") - - # DANDI has yet to implement forcing of session_id inclusion in organize step - # This manually enforces it when only a single session per subject is organized - for organized_nwbfile in organized_nwbfiles: - if "ses" not in organized_nwbfile.stem: - with NWBHDF5IO(path=organized_nwbfile, mode="r") as io: - nwbfile = io.read() - session_id = nwbfile.session_id - dandi_stem = organized_nwbfile.stem - dandi_stem_split = dandi_stem.split("_") - dandi_stem_split.insert(1, f"ses-{session_id}") - corrected_name = "_".join(dandi_stem_split) + ".nwb" - organized_nwbfile.rename(organized_nwbfile.parent / corrected_name) - organized_nwbfiles = dandiset_path.rglob("*.nwb") - # The above block can be removed once they add the feature - - assert len(list(dandiset_path.iterdir())) > 1, "DANDI organize failed!" - - dandi_instance = "dandi-staging" if staging else "dandi" # Test - dandi_upload( - paths=[str(x) for x in organized_nwbfiles], - dandi_instance=dandi_instance, - jobs=number_of_jobs, - jobs_per_file=number_of_threads, - ) - - # Cleanup should be confirmed manually; Windows especially can complain - if cleanup: - try: - rmtree(path=dandiset_folder_path) - rmtree(path=nwb_folder_path) - except PermissionError: # pragma: no cover - warn("Unable to clean up source files and dandiset! Please manually delete them.") diff --git a/src/neuroconv/tools/data_transfers/_helpers.py b/src/neuroconv/tools/data_transfers/_helpers.py new file mode 100644 index 000000000..7bb3e59a0 --- /dev/null +++ b/src/neuroconv/tools/data_transfers/_helpers.py @@ -0,0 +1,29 @@ +"""Collection of helper functions for assessing automated data transfers.""" + + +def estimate_total_conversion_runtime( + total_mb: float, + transfer_rate_mb: float = 20.0, + conversion_rate_mb: float = 17.0, + upload_rate_mb: float = 40, + compression_ratio: float = 1.7, +): + """ + Estimate how long the combined process of data transfer, conversion, and upload is expected to take. + + Parameters + ---------- + total_mb: float + The total amount of data (in MB) that will be transferred, converted, and uploaded to dandi. + transfer_rate_mb : float, default: 20.0 + Estimate of the transfer rate for the data. + conversion_rate_mb : float, default: 17.0 + Estimate of the conversion rate for the data. Can vary widely depending on conversion options and type of data. + Figure of 17MB/s is based on extensive compression of high-volume, high-resolution ecephys. + upload_rate_mb : float, default: 40.0 + Estimate of the upload rate of a single file to the DANDI archive. + compression_ratio : float, default: 1.7 + Estimate of the final average compression ratio for datasets in the file. Can vary widely. 
+ """ + c = 1 / compression_ratio # compressed_size = total_size * c + return total_mb * (1 / transfer_rate_mb + 1 / conversion_rate_mb + c / upload_rate_mb) diff --git a/src/neuroconv/tools/hdmf.py b/src/neuroconv/tools/hdmf.py index 4be1a5dc6..46f0fd865 100644 --- a/src/neuroconv/tools/hdmf.py +++ b/src/neuroconv/tools/hdmf.py @@ -8,77 +8,113 @@ class GenericDataChunkIterator(HDMFGenericDataChunkIterator): def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: - num_axes = len(self.maxshape) - chunk_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize + return self.estimate_default_buffer_shape( + buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype + ) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_chunk_shape(chunk_mb: float, maxshape: Tuple[int, ...], dtype: np.dtype) -> Tuple[int, ...]: + """ + Select chunk shape with size in MB less than the threshold of chunk_mb. + + Keeps the dimensional ratios of the original data. + """ + assert chunk_mb > 0.0, f"chunk_mb ({chunk_mb}) must be greater than zero!" + # Eventually, Pydantic validation can handle this validation for us + + n_dims = len(maxshape) + itemsize = dtype.itemsize + chunk_bytes = chunk_mb * 1e6 + + min_maxshape = min(maxshape) + v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in maxshape) + prod_v = math.prod(v) + while prod_v * itemsize > chunk_bytes and prod_v != 1: + non_unit_min_v = min(x for x in v if x != 1) + v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v) + prod_v = math.prod(v) + k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)) + return tuple([min(k * x, maxshape[dim]) for dim, x in enumerate(v)]) + + # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own + @staticmethod + def estimate_default_buffer_shape( + buffer_gb: float, chunk_shape: Tuple[int, ...], maxshape: Tuple[int, ...], dtype: np.dtype + ) -> Tuple[int]: + num_axes = len(maxshape) + chunk_bytes = math.prod(chunk_shape) * dtype.itemsize + assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!" assert ( buffer_gb >= chunk_bytes / 1e9 ), f"buffer_gb ({buffer_gb}) must be greater than the chunk size ({chunk_bytes / 1e9})!" - assert all( - np.array(self.chunk_shape) > 0 - ), f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!" + assert all(np.array(chunk_shape) > 0), f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!" 
- maxshape = np.array(self.maxshape) + maxshape = np.array(maxshape) # Early termination condition - if math.prod(maxshape) * self.dtype.itemsize / 1e9 < buffer_gb: - return tuple(self.maxshape) + if math.prod(maxshape) * dtype.itemsize / 1e9 < buffer_gb: + return tuple(maxshape) buffer_bytes = chunk_bytes - axis_sizes_bytes = maxshape * self.dtype.itemsize + axis_sizes_bytes = maxshape * dtype.itemsize target_buffer_bytes = buffer_gb * 1e9 - if num_axes > 1: - smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape) - # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes - sub_square_buffer_shape = np.array(self.chunk_shape) - if min(axis_sizes_bytes) > target_buffer_bytes: - k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5) - for axis in [smallest_chunk_axis, second_smallest_chunk_axis]: - sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis] - return tuple(sub_square_buffer_shape) - elif num_axes == 1: - smallest_chunk_axis = 0 - # Handle the case where the single axis is too large to fit in the buffer - if axis_sizes_bytes[0] > target_buffer_bytes: - k1 = math.floor(target_buffer_bytes / chunk_bytes) - return tuple( - [ - k1 * self.chunk_shape[0], - ] - ) - else: - raise ValueError(f"num_axes ({num_axes}) is less than one!") + + if min(axis_sizes_bytes) > target_buffer_bytes: + if num_axes > 1: + smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape) + # If the smallest full axis does not fit within the buffer size, form a square along the smallest axes + sub_square_buffer_shape = np.array(chunk_shape) + if min(axis_sizes_bytes) > target_buffer_bytes: + k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5) + for axis in [smallest_chunk_axis, second_smallest_chunk_axis]: + sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis] + return tuple(sub_square_buffer_shape) + elif num_axes == 1: + smallest_chunk_axis = 0 + # Handle the case where the single axis is too large to fit in the buffer + if axis_sizes_bytes[0] > target_buffer_bytes: + k1 = math.floor(target_buffer_bytes / chunk_bytes) + return tuple( + [ + k1 * chunk_shape[0], + ] + ) + else: + raise ValueError(f"num_axes ({num_axes}) is less than one!") # Original one-shot estimation has good performance for certain shapes chunk_to_buffer_ratio = buffer_gb * 1e9 / chunk_bytes chunk_scaling_factor = math.floor(chunk_to_buffer_ratio ** (1 / num_axes)) unpadded_buffer_shape = [ - np.clip(a=int(x), a_min=self.chunk_shape[j], a_max=self.maxshape[j]) - for j, x in enumerate(chunk_scaling_factor * np.array(self.chunk_shape)) + np.clip(a=int(x), a_min=chunk_shape[j], a_max=maxshape[j]) + for j, x in enumerate(chunk_scaling_factor * np.array(chunk_shape)) ] - unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * self.dtype.itemsize + unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * dtype.itemsize # Method that starts by filling the smallest axis completely or calculates best partial fill - padded_buffer_shape = np.array(self.chunk_shape) - chunks_per_axis = np.ceil(maxshape / self.chunk_shape) + padded_buffer_shape = np.array(chunk_shape) + chunks_per_axis = np.ceil(maxshape / chunk_shape) small_axis_fill_size = chunk_bytes * min(chunks_per_axis) full_axes_used = np.zeros(shape=num_axes, dtype=bool) if small_axis_fill_size <= target_buffer_bytes: buffer_bytes = small_axis_fill_size - padded_buffer_shape[smallest_chunk_axis] = self.maxshape[smallest_chunk_axis] + 
padded_buffer_shape[smallest_chunk_axis] = maxshape[smallest_chunk_axis] full_axes_used[smallest_chunk_axis] = True for axis, chunks_on_axis in enumerate(chunks_per_axis): if full_axes_used[axis]: # If the smallest axis, skip since already used continue if chunks_on_axis * buffer_bytes <= target_buffer_bytes: # If multiple axes can be used together buffer_bytes *= chunks_on_axis - padded_buffer_shape[axis] = self.maxshape[axis] + padded_buffer_shape[axis] = maxshape[axis] else: # Found an axis that is too large to use with the rest of the buffer; calculate how much can be used k3 = math.floor(target_buffer_bytes / buffer_bytes) padded_buffer_shape[axis] *= k3 break - padded_buffer_bytes = math.prod(padded_buffer_shape) * self.dtype.itemsize + + padded_buffer_bytes = math.prod(padded_buffer_shape) * dtype.itemsize if padded_buffer_bytes >= unpadded_buffer_bytes: return tuple(padded_buffer_shape) @@ -88,7 +124,7 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]: class SliceableDataChunkIterator(GenericDataChunkIterator): """ - Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or an h5py.Dataset + Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or h5py.Dataset object. """ def __init__(self, data, **kwargs): diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 0982439bb..cb78a67a5 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -1,3 +1,4 @@ +from ._dataset_configuration import get_default_dataset_io_configurations from ._metadata_and_file_helpers import ( add_device_from_metadata, get_default_nwbfile_metadata, @@ -5,17 +6,17 @@ make_nwbfile_from_metadata, make_or_load_nwbfile, ) -from ._models._base_models import DatasetConfiguration, DatasetInfo +from ._models._base_models import DatasetInfo from ._models._hdf5_models import ( AVAILABLE_HDF5_COMPRESSION_METHODS, HDF5BackendConfiguration, - HDF5DatasetConfiguration, + HDF5DatasetIOConfiguration, ) from ._models._zarr_models import ( AVAILABLE_ZARR_COMPRESSION_METHODS, ZarrBackendConfiguration, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) -BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration) -BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) +BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) +DATASET_IO_CONFIGURATIONS = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py new file mode 100644 index 000000000..7ddadb7a0 --- /dev/null +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -0,0 +1,149 @@ +"""Collection of helper functions related to configuration of datasets dependent on backend.""" +from typing import Generator, Literal, Union + +import h5py +import numpy as np +import zarr +from hdmf.data_utils import DataChunkIterator, DataIO, GenericDataChunkIterator +from hdmf.utils import get_data_shape +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile, TimeSeries +from pynwb.base import DynamicTable + +from ._models._base_models import DatasetInfo, DatasetIOConfiguration +from ._models._hdf5_models import HDF5BackendConfiguration, HDF5DatasetIOConfiguration +from ._models._zarr_models import 
ZarrBackendConfiguration, ZarrDatasetIOConfiguration +from ..hdmf import SliceableDataChunkIterator + +BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration) +BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) + + +def _get_io_mode(io: Union[NWBHDF5IO, NWBZarrIO]) -> str: + """NWBHDF5IO and NWBZarrIO have different ways of storing the io mode (e.g. "r", "a", "w") they used on a path.""" + if isinstance(io, NWBHDF5IO): + return io.mode + elif isinstance(io, NWBZarrIO): + return io._ZarrIO__mode + + +def _is_dataset_written_to_file( + candidate_dataset: Union[h5py.Dataset, zarr.Array], + backend: Literal["hdf5", "zarr"], + existing_file: Union[h5py.File, zarr.Group, None], +) -> bool: + """ + Determine if the neurodata object is already written to the file on disk. + + This object should then be skipped by the `get_io_datasets` function when working in append mode. + """ + return ( + isinstance(candidate_dataset, h5py.Dataset) # If the source data is an HDF5 Dataset + and backend == "hdf5" + and candidate_dataset.file == existing_file # If the source HDF5 Dataset is the appending NWBFile + ) or ( + isinstance(candidate_dataset, zarr.Array) # If the source data is an Zarr Array + and backend == "zarr" + and candidate_dataset.store == existing_file # If the source Zarr 'file' is the appending NWBFile + ) + + +def get_default_dataset_io_configurations( + nwbfile: NWBFile, + backend: Union[None, Literal["hdf5", "zarr"]] = None, # None for auto-detect from append mode, otherwise required +) -> Generator[DatasetIOConfiguration, None, None]: + """ + Generate DatasetIOConfiguration objects for wrapping NWB file objects with a specific backend. + + This method automatically detects all objects in an NWB file that can be wrapped in a DataIO. It supports auto-detection + of the backend if the NWB file is in append mode, otherwise it requires a backend specification. + + Parameters + ---------- + nwbfile : pynwb.NWBFile + An in-memory NWBFile object, either generated from the base class or read from an existing file of any backend. + backend : "hdf5" or "zarr" + Which backend format type you would like to use in configuring each datasets compression methods and options. + + Yields + ------ + DatasetIOConfiguration + A summary of each detected object that can be wrapped in a DataIO. + """ + DatasetIOConfigurationClass = BACKEND_TO_DATASET_CONFIGURATION[backend] + + if backend is None and nwbfile.read_io is None: + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` was not " + "read from an existing file!" + ) + if backend is None and nwbfile.read_io is not None and nwbfile.read_io.mode not in ("r+", "a"): + raise ValueError( + "Keyword argument `backend` (either 'hdf5' or 'zarr') must be specified if the `nwbfile` is being appended." 
+ ) + + detected_backend = None + existing_file = None + if isinstance(nwbfile.read_io, NWBHDF5IO) and _get_io_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "hdf5" + existing_file = nwbfile.read_io._file + elif isinstance(nwbfile.read_io, NWBZarrIO) and _get_io_mode(io=nwbfile.read_io) in ("r+", "a"): + detected_backend = "zarr" + existing_file = nwbfile.read_io.file.store + backend = backend or detected_backend + + if detected_backend is not None and detected_backend != backend: + raise ValueError( + f"Detected backend '{detected_backend}' for appending file, but specified `backend` " + f"({backend}) does not match! Set `backend=None` or remove the keyword argument to allow it to auto-detect." + ) + + for neurodata_object in nwbfile.objects.values(): + if isinstance(neurodata_object, DynamicTable): + dynamic_table = neurodata_object # for readability + + for column in dynamic_table.columns: + column_name = column.name + candidate_dataset = column.data # VectorData object + if _is_dataset_written_to_file( + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file + ): + continue # skip + + # Skip over columns that are already wrapped in DataIO + if isinstance(candidate_dataset, DataIO): + continue + + dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( + neurodata_object=column, field_name="data" + ) + + yield dataset_io_configuration + else: + # Primarily for TimeSeries, but also any extended class that has 'data' or 'timestamps' + # The most common example of this is ndx-events Events/LabeledEvents types + time_series = neurodata_object # for readability + + for field_name in ("data", "timestamps"): + if field_name not in time_series.fields: # timestamps is optional + continue + + candidate_dataset = getattr(time_series, field_name) + if _is_dataset_written_to_file( + candidate_dataset=candidate_dataset, backend=backend, existing_file=existing_file + ): + continue # skip + + # Skip over datasets that are already wrapped in DataIO + if isinstance(candidate_dataset, DataIO): + continue + + # Edge case of in-memory ImageSeries with external mode; data is in fields and is empty array + if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0: + continue # skip + + dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( + neurodata_object=time_series, field_name=field_name + ) + + yield dataset_io_configuration diff --git a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py index 72b364dea..aef29a62e 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_base_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_base_models.py @@ -6,8 +6,50 @@ import h5py import numcodecs import numpy as np +import zarr +from hdmf import Container from hdmf.container import DataIO +from hdmf.data_utils import DataChunkIterator, DataIO, GenericDataChunkIterator +from hdmf.utils import get_data_shape from pydantic import BaseModel, Field, root_validator +from pynwb import NWBHDF5IO, NWBFile + +from ...hdmf import SliceableDataChunkIterator + + +def _find_location_in_memory_nwbfile(current_location: str, neurodata_object: Container) -> str: + """ + Method for determining the location of a neurodata object within an in-memory NWBFile object. + + Distinct from methods from other packages, such as the NWB Inspector, which rely on such files being read from disk. 
+ """ + parent = neurodata_object.parent + if isinstance(parent, NWBFile): + # Items in defined top-level places like acquisition, intervals, etc. do not act as 'containers' + # in that they do not set the `.parent` attribute; ask if object is in their in-memory dictionaries instead + for parent_field_name, parent_field_value in parent.fields.items(): + if isinstance(parent_field_value, dict) and neurodata_object.name in parent_field_value: + return parent_field_name + "/" + neurodata_object.name + "/" + current_location + return neurodata_object.name + "/" + current_location + return _find_location_in_memory_nwbfile( + current_location=neurodata_object.name + "/" + current_location, neurodata_object=parent + ) + + +def _infer_dtype_using_data_chunk_iterator(candidate_dataset: Union[h5py.Dataset, zarr.Array]): + """ + The DataChunkIterator has one of the best generic dtype inference, though logic is hard to peel out of it. + + It can fail in rare cases but not essential to our default configuration + """ + try: + data_type = DataChunkIterator(candidate_dataset).dtype + return data_type + except Exception as exception: + if str(exception) != "Data type could not be determined. Please specify dtype in DataChunkIterator init.": + raise exception + else: + return np.dtype("object") class DatasetInfo(BaseModel): @@ -61,8 +103,24 @@ def __init__(self, **values): values.update(dataset_name=dataset_name) super().__init__(**values) + @classmethod + def from_neurodata_object(cls, neurodata_object: Container, field_name: str) -> "DatasetInfo": + location = _find_location_in_memory_nwbfile(current_location=field_name, neurodata_object=neurodata_object) + candidate_dataset = getattr(neurodata_object, field_name) + + full_shape = get_data_shape(data=candidate_dataset) + dtype = _infer_dtype_using_data_chunk_iterator(candidate_dataset=candidate_dataset) + + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location=location, + full_shape=full_shape, + dtype=dtype, + ) + -class DatasetConfiguration(BaseModel, ABC): +class DatasetIOConfiguration(BaseModel, ABC): """A data model for configuring options about an object that will become a HDF5 or Zarr Dataset in the file.""" # TODO: When using Pydantic v2, remove @@ -182,13 +240,38 @@ def get_data_io_kwargs(self) -> Dict[str, Any]: """ raise NotImplementedError + @classmethod + def from_neurodata_object(cls, neurodata_object: Container, field_name: str) -> "DatasetIOConfiguration": + candidate_dataset = getattr(neurodata_object, field_name) + + dataset_info = DatasetInfo.from_neurodata_object(neurodata_object=neurodata_object, field_name=field_name) + + dtype = dataset_info.dtype + full_shape = dataset_info.full_shape + + if isinstance(candidate_dataset, GenericDataChunkIterator): + chunk_shape = candidate_dataset.chunk_shape + buffer_shape = candidate_dataset.buffer_shape + elif dtype != "unknown": + # TODO: eventually replace this with staticmethods on hdmf.data_utils.GenericDataChunkIterator + chunk_shape = SliceableDataChunkIterator.estimate_default_chunk_shape( + chunk_mb=10.0, maxshape=full_shape, dtype=np.dtype(dtype) + ) + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) + ) + else: + pass + + return cls(dataset_info=dataset_info, chunk_shape=chunk_shape, buffer_shape=buffer_shape) + class BackendConfiguration(BaseModel): """A model for matching collections of DatasetConfigurations to a specific 
backend.""" backend: Literal["hdf5", "zarr"] = Field(description="The name of the backend used to configure the NWBFile.") data_io_class: Type[DataIO] = Field(description="The DataIO class that is specific to this backend.") - dataset_configurations: Dict[str, DatasetConfiguration] = Field( + dataset_configurations: Dict[str, DatasetIOConfiguration] = Field( description=( "A mapping from object locations (e.g. `acquisition/TestElectricalSeriesAP/data`) " "to their DatasetConfiguration specification that contains all information " diff --git a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py index daf772688..b34671154 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_hdf5_models.py @@ -6,7 +6,7 @@ from pydantic import Field from pynwb import H5DataIO -from ._base_models import BackendConfiguration, DatasetConfiguration +from ._base_models import BackendConfiguration, DatasetIOConfiguration _base_hdf5_filters = set(h5py.filters.decode) _excluded_hdf5_filters = set( @@ -29,7 +29,7 @@ ) -class HDF5DatasetConfiguration(DatasetConfiguration): +class HDF5DatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a HDF5 Dataset in the file.""" # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` @@ -90,7 +90,7 @@ class HDF5BackendConfiguration(BackendConfiguration): data_io_class: Type[H5DataIO] = Field( # TODO: in pydantic v2 use property instead of class attribute default=H5DataIO, description="The DataIO class that is specific to HDF5." ) - dataset_configurations: Dict[str, HDF5DatasetConfiguration] = Field( + dataset_configurations: Dict[str, HDF5DatasetIOConfiguration] = Field( description=( "A mapping from object locations to their HDF5DatasetConfiguration specification that contains all " "information for writing the datasets to disk using the HDF5 backend." diff --git a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py index 760c7c2a9..14214b513 100644 --- a/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py +++ b/src/neuroconv/tools/nwb_helpers/_models/_zarr_models.py @@ -7,7 +7,7 @@ from hdmf_zarr import ZarrDataIO from pydantic import Field, root_validator -from ._base_models import BackendConfiguration, DatasetConfiguration +from ._base_models import BackendConfiguration, DatasetIOConfiguration _base_zarr_codecs = set(zarr.codec_registry.keys()) _lossy_zarr_codecs = set(("astype", "bitround", "quantize")) @@ -43,7 +43,7 @@ } -class ZarrDatasetConfiguration(DatasetConfiguration): +class ZarrDatasetIOConfiguration(DatasetIOConfiguration): """A data model for configuring options about an object that will become a Zarr Dataset in the file.""" # TODO: When using Pydantic v2, replace with `model_config = ConfigDict(...)` @@ -147,7 +147,7 @@ class ZarrBackendConfiguration(BackendConfiguration): data_io_class: Type[ZarrDataIO] = Field( default=ZarrDataIO, description="The DataIO class that is specific to Zarr." ) - dataset_configurations: Dict[str, ZarrDatasetConfiguration] = Field( + dataset_configurations: Dict[str, ZarrDatasetIOConfiguration] = Field( description=( "A mapping from object locations to their ZarrDatasetConfiguration specification that contains all " "information for writing the datasets to disk using the Zarr backend." 
diff --git a/src/neuroconv/tools/path_expansion.py b/src/neuroconv/tools/path_expansion.py index a08e5c717..2ce8bd86b 100644 --- a/src/neuroconv/tools/path_expansion.py +++ b/src/neuroconv/tools/path_expansion.py @@ -1,10 +1,11 @@ -"""Helpful classes for expanding file or folder paths on a system given a f-string rule for matching patterns.""" +"""Helpful classes for expanding file or folder paths on a system given an f-string rule for matching patterns.""" import abc import os +from datetime import date, datetime from pathlib import Path from typing import Dict, Iterable, List -from fparse import parse +from parse import parse from pydantic import DirectoryPath, FilePath from ..utils import DeepDict @@ -81,6 +82,8 @@ def expand_paths(self, source_data_spec: Dict[str, dict]) -> List[DeepDict]: for meta_key, meta_val in metadata.items(): super_key = standard_metadata.get(meta_key, non_standard_super) + if meta_key == "session_start_time" and isinstance(meta_val, date): + meta_val = datetime(meta_val.year, meta_val.month, meta_val.day) out[key]["metadata"][super_key][meta_key] = meta_val return list(dict(out).values()) diff --git a/src/neuroconv/tools/testing/__init__.py b/src/neuroconv/tools/testing/__init__.py index 502634466..2d5b06497 100644 --- a/src/neuroconv/tools/testing/__init__.py +++ b/src/neuroconv/tools/testing/__init__.py @@ -1,9 +1,9 @@ from ._mock._mock_dataset_models import ( mock_DatasetInfo, mock_HDF5BackendConfiguration, - mock_HDF5DatasetConfiguration, + mock_HDF5DatasetIOConfiguration, mock_ZarrBackendConfiguration, - mock_ZarrDatasetConfiguration, + mock_ZarrDatasetIOConfiguration, ) from .mock_files import generate_path_expander_demo_ibl from .mock_interfaces import MockBehaviorEventInterface, MockSpikeGLXNIDQInterface diff --git a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py index 6860f7078..e8ea80826 100644 --- a/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py +++ b/src/neuroconv/tools/testing/_mock/_mock_dataset_models.py @@ -9,9 +9,9 @@ AVAILABLE_ZARR_COMPRESSION_METHODS, DatasetInfo, HDF5BackendConfiguration, - HDF5DatasetConfiguration, + HDF5DatasetIOConfiguration, ZarrBackendConfiguration, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) @@ -30,14 +30,14 @@ def mock_DatasetInfo( ) -def mock_HDF5DatasetConfiguration( +def mock_HDF5DatasetIOConfiguration( compression_method: Union[ Literal[tuple(AVAILABLE_HDF5_COMPRESSION_METHODS.keys())], h5py._hl.filters.FilterRefBase, None ] = "gzip", compression_options: Union[Dict[str, Any], None] = None, -) -> HDF5DatasetConfiguration: - """Mock instance of a HDF5DatasetConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" - return HDF5DatasetConfiguration( +) -> HDF5DatasetIOConfiguration: + """Mock object of a HDF5DatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" + return HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB @@ -46,7 +46,7 @@ def mock_HDF5DatasetConfiguration( ) -def mock_ZarrDatasetConfiguration( +def mock_ZarrDatasetIOConfiguration( compression_method: Union[ Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None ] = "gzip", @@ -55,9 +55,9 @@ def mock_ZarrDatasetConfiguration( Union[Literal[tuple(AVAILABLE_ZARR_COMPRESSION_METHODS.keys())], numcodecs.abc.Codec, None] ] = None, filter_options: Union[Iterable[Dict[str, Any]], None] = 
None, -) -> ZarrDatasetConfiguration: - """Mock instance of a ZarrDatasetConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" - return ZarrDatasetConfiguration( +) -> ZarrDatasetIOConfiguration: + """Mock object of a ZarrDatasetIOConfiguration with NeuroPixel-like values to show chunk/buffer recommendations.""" + return ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB @@ -71,12 +71,12 @@ def mock_ZarrDatasetConfiguration( def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: """Mock instance of a HDF5BackendConfiguration with two NeuroPixel-like datasets.""" dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": HDF5DatasetConfiguration( + "acquisition/TestElectricalSeriesAP/data": HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), chunk_shape=(78_125, 64), # ~10 MB buffer_shape=(1_250_000, 384), # ~1 GB ), - "acquisition/TestElectricalSeriesLF/data": HDF5DatasetConfiguration( + "acquisition/TestElectricalSeriesLF/data": HDF5DatasetIOConfiguration( dataset_info=mock_DatasetInfo( object_id="bc37e164-519f-4b65-a976-206440f1d325", location="acquisition/TestElectricalSeriesLF/data", @@ -93,13 +93,13 @@ def mock_HDF5BackendConfiguration() -> HDF5BackendConfiguration: def mock_ZarrBackendConfiguration() -> ZarrBackendConfiguration: """Mock instance of a HDF5BackendConfiguration with several NeuroPixel-like datasets.""" dataset_configurations = { - "acquisition/TestElectricalSeriesAP/data": ZarrDatasetConfiguration( + "acquisition/TestElectricalSeriesAP/data": ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(location="acquisition/TestElectricalSeriesAP/data"), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), # ~1 GB filter_methods=["delta"], ), - "acquisition/TestElectricalSeriesLF/data": ZarrDatasetConfiguration( + "acquisition/TestElectricalSeriesLF/data": ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo( object_id="bc37e164-519f-4b65-a976-206440f1d325", location="acquisition/TestElectricalSeriesLF/data", diff --git a/src/neuroconv/tools/yaml_conversion_specification/__init__.py b/src/neuroconv/tools/yaml_conversion_specification/__init__.py index 4cad63e60..cd63559dc 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/__init__.py +++ b/src/neuroconv/tools/yaml_conversion_specification/__init__.py @@ -1 +1 @@ -from .yaml_conversion_specification import run_conversion_from_yaml +from ._yaml_conversion_specification import run_conversion_from_yaml diff --git a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py similarity index 84% rename from src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py rename to src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py index b73e1e3ee..2c7e5b25c 100644 --- a/src/neuroconv/tools/yaml_conversion_specification/yaml_conversion_specification.py +++ b/src/neuroconv/tools/yaml_conversion_specification/_yaml_conversion_specification.py @@ -4,8 +4,6 @@ from typing import Optional import click -from dandi.metadata import _get_pynwb_metadata -from dandi.organize import create_unique_filenames_from_metadata from jsonschema import RefResolver, validate from ...nwbconverter import NWBConverter @@ -69,6 +67,8 @@ def run_conversion_from_yaml( If True, replaces 
any existing NWBFile at the nwbfile_path location, if save_to_file is True. If False, appends the existing NWBFile at the nwbfile_path location, if save_to_file is True. """ + from dandi.organize import create_unique_filenames_from_metadata + from dandi.pynwb_utils import _get_pynwb_metadata if data_folder_path is None: data_folder_path = Path(specification_file_path).parent @@ -125,18 +125,25 @@ def run_conversion_from_yaml( ) # To properly mimic a true dandi organization, the full directory must be populated with NWBFiles. all_nwbfile_paths = [nwbfile_path for nwbfile_path in output_folder_path.iterdir() if nwbfile_path.suffix == ".nwb"] - if any(["temp_nwbfile_name_" in nwbfile_path.stem for nwbfile_path in all_nwbfile_paths]): - dandi_metadata_list = [] - for nwbfile_path in all_nwbfile_paths: - dandi_metadata = _get_pynwb_metadata(path=nwbfile_path) - dandi_metadata.update(path=nwbfile_path) + nwbfile_paths_to_set = [ + nwbfile_path for nwbfile_path in all_nwbfile_paths if "temp_nwbfile_name_" in nwbfile_path.stem + ] + if any(nwbfile_paths_to_set): + dandi_metadata_list = list() + for nwbfile_path_to_set in nwbfile_paths_to_set: + dandi_metadata = _get_pynwb_metadata(path=nwbfile_path_to_set) + dandi_metadata.update(path=nwbfile_path_to_set) dandi_metadata_list.append(dandi_metadata) - named_dandi_metadata_list = create_unique_filenames_from_metadata(metadata=dandi_metadata_list) + dandi_metadata_with_set_paths = create_unique_filenames_from_metadata(metadata=dandi_metadata_list) - for named_dandi_metadata in named_dandi_metadata_list: - if "temp_nwbfile_name_" in named_dandi_metadata["path"].stem: - dandi_filename = named_dandi_metadata["dandi_filename"].replace(" ", "_") - assert ( - dandi_filename != ".nwb" - ), f"Not enough metadata available to assign name to {str(named_dandi_metadata['path'])}!" - named_dandi_metadata["path"].rename(str(output_folder_path / dandi_filename)) + for nwbfile_path_to_set, dandi_metadata_with_set_path in zip( + nwbfile_paths_to_set, dandi_metadata_with_set_paths + ): + dandi_filename = dandi_metadata_with_set_path["dandi_filename"] + + assert ( + dandi_filename != ".nwb" + ), f"Not enough metadata available to assign name to {str(nwbfile_path_to_set)}!" 
+ + # Rename file on system + nwbfile_path_to_set.rename(str(output_folder_path / dandi_filename)) diff --git a/tests/imports.py b/tests/imports.py index 7048d76f4..656ddfea9 100644 --- a/tests/imports.py +++ b/tests/imports.py @@ -63,6 +63,7 @@ def test_tools(self): "deploy_process", "LocalPathExpander", "get_module", + "hdmf", ] assert sorted(current_structure) == sorted(expected_structure) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py similarity index 79% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py index 892638a2c..c8a6738b7 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_common_dataset_io_configuration_model.py @@ -4,21 +4,21 @@ import pytest from neuroconv.tools.nwb_helpers import ( - HDF5DatasetConfiguration, - ZarrDatasetConfiguration, + HDF5DatasetIOConfiguration, + ZarrDatasetIOConfiguration, ) from neuroconv.tools.testing import ( mock_DatasetInfo, - mock_HDF5DatasetConfiguration, - mock_ZarrDatasetConfiguration, + mock_HDF5DatasetIOConfiguration, + mock_ZarrDatasetIOConfiguration, ) @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_length_consistency( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -35,10 +35,10 @@ def test_validator_chunk_length_consistency( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_and_buffer_length_consistency( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -55,10 +55,10 @@ def test_validator_chunk_and_buffer_length_consistency( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_nonpositive_elements( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -75,10 +75,10 @@ def test_validator_chunk_shape_nonpositive_elements( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, 
ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_buffer_shape_nonpositive_elements( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -95,10 +95,10 @@ def test_validator_buffer_shape_nonpositive_elements( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_shape_exceeds_buffer_shape( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -115,10 +115,10 @@ def test_validator_chunk_shape_exceeds_buffer_shape( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_buffer_shape_exceeds_full_shape( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -135,10 +135,10 @@ def test_validator_buffer_shape_exceeds_full_shape( @pytest.mark.parametrize( - argnames="dataset_configuration_class", argvalues=[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + argnames="dataset_configuration_class", argvalues=[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ) def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( - dataset_configuration_class: Union[HDF5DatasetConfiguration, ZarrDatasetConfiguration] + dataset_configuration_class: Union[HDF5DatasetIOConfiguration, ZarrDatasetIOConfiguration] ): with pytest.raises(ValueError) as error_info: dataset_configuration_class( @@ -155,10 +155,11 @@ def test_validator_chunk_dimensions_do_not_evenly_divide_buffer( @pytest.mark.parametrize( - argnames="mock_dataset_configuration", argvalues=[mock_HDF5DatasetConfiguration(), mock_ZarrDatasetConfiguration()] + argnames="mock_dataset_configuration", + argvalues=[mock_HDF5DatasetIOConfiguration(), mock_ZarrDatasetIOConfiguration()], ) def test_mutation_validation( - mock_dataset_configuration: Union[mock_HDF5DatasetConfiguration, mock_ZarrDatasetConfiguration] + mock_dataset_configuration: Union[mock_HDF5DatasetIOConfiguration, mock_ZarrDatasetIOConfiguration] ): """ Only testing on one dummy case to verify the root validator is triggered. 
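
[Editor's note: the following usage sketch is not part of the patch. It only illustrates how the renamed IO configuration models exercised by the validator tests above are typically used, relying solely on the mock helper and the expected keyword arguments that appear elsewhere in this diff.]

    # Illustrative only; mirrors mock_HDF5DatasetIOConfiguration and the
    # expected `get_data_io_kwargs` output asserted later in this patch.
    from neuroconv.tools.testing import mock_HDF5DatasetIOConfiguration

    hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration()

    # The mock carries NeuroPixel-like chunk/buffer shapes and default gzip
    # compression, which map onto backend DataIO keyword arguments.
    assert hdf5_dataset_configuration.get_data_io_kwargs() == dict(
        chunks=(78125, 64), compression="gzip", compression_opts=None
    )
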
diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py new file mode 100644 index 000000000..69545adbf --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations.py @@ -0,0 +1,300 @@ +"""Unit tests for `get_default_dataset_io_configurations`.""" +from typing import Literal + +import numpy as np +import pytest +from hdmf.common import VectorData +from hdmf.data_utils import DataChunkIterator +from nwbinspector.utils import is_module_installed +from pynwb.base import DynamicTable +from pynwb.behavior import CompassDirection +from pynwb.image import ImageSeries +from pynwb.misc import Units +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.behavior import mock_SpatialSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.hdmf import SliceableDataChunkIterator +from neuroconv.tools.nwb_helpers import ( + DATASET_IO_CONFIGURATIONS, + get_default_dataset_io_configurations, + get_module, +) + + +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_time_series(iterator: callable, backend: Literal["hdf5", "zarr"]): + array = np.array([[1, 2, 3], [4, 5, 6]]) + data = iterator(array) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=data) + nwbfile.add_acquisition(time_series) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.object_id == time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestTimeSeries/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_external_image_series(backend: Literal["hdf5", "zarr"]): + nwbfile = mock_NWBFile() + image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 0 + + +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_dynamic_table(iterator: callable, backend: Literal["hdf5", "zarr"]): + array = np.array([0.1, 0.2, 0.3]) + data = iterator(array) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=data) + dynamic_table = 
DynamicTable(name="TestDynamicTable", description="", columns=[column], id=list(range(len(array)))) + nwbfile.add_acquisition(dynamic_table) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_ragged_units_table(backend: Literal["hdf5", "zarr"]): + nwbfile = mock_NWBFile() + units = Units(name="units", description="") + + spike_times = np.array([0.0, 1.0, 2.0]) + waveforms = np.array([[[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]], dtype="int32") + units.add_unit(spike_times=spike_times, waveforms=waveforms) + + spike_times = np.array([3.0, 4.0]) + waveforms = np.array([[[4, 5], [4, 5], [4, 5]], [[4, 5], [4, 5], [4, 5]]], dtype="int32") + units.add_unit(spike_times=spike_times, waveforms=waveforms) + + nwbfile.units = units + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 5 + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/spike_times/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (5,) + assert dataset_configuration.dataset_info.dtype == np.dtype("float64") + assert dataset_configuration.chunk_shape == (5,) + assert dataset_configuration.buffer_shape == (5,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/spike_times_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (2,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + 
if dataset_configuration.dataset_info.location == "units/waveforms/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (12, 3) + assert dataset_configuration.dataset_info.dtype == np.dtype("int32") + assert dataset_configuration.chunk_shape == (12, 3) + assert dataset_configuration.buffer_shape == (12, 3) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/waveforms_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (4,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (4,) + assert dataset_configuration.buffer_shape == (4,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.location == "units/waveforms_index_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.full_shape == (2,) + assert dataset_configuration.dataset_info.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("iterator", [lambda x: x, SliceableDataChunkIterator, DataChunkIterator]) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_compass_direction(iterator: callable, backend: Literal["hdf5", "zarr"]): + array = np.array([[1, 2, 3], [4, 5, 6]]) + data = iterator(array) + + nwbfile = mock_NWBFile() + spatial_series = mock_SpatialSeries(name="TestSpatialSeries", data=data) + compass_direction = CompassDirection(name="TestCompassDirection", spatial_series=spatial_series) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(compass_direction) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.dataset_info.object_id == spatial_series.object_id + assert ( + dataset_configuration.dataset_info.location == "processing/behavior/TestCompassDirection/TestSpatialSeries/data" + ) + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert 
dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.skipif( + not is_module_installed(module_name="ndx_events"), reason="The extra testing package 'ndx-events' is not installed!" +) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) +def test_configuration_on_ndx_events(backend: Literal["hdf5", "zarr"]): + from ndx_events import LabeledEvents + + # ndx_events data fields do not support wrapping in DataChunkIterators - data is nearly always small enough + # to fit entirely in memory + data = np.array([1, 2, 3], dtype="uint32") + timestamps = np.array([4.5, 6.7, 8.9]) + + nwbfile = mock_NWBFile() + labeled_events = LabeledEvents( + name="TestLabeledEvents", + description="", + timestamps=timestamps, + data=data, + labels=["response_left", "cue_onset", "cue_offset"], + ) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(labeled_events) + + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + # Note that the labels dataset is not caught since we search only for 'data' and 'timestamps' fields + assert len(dataset_configurations) == 2 + + data_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.dataset_name == "data" + ) + assert isinstance(data_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert data_dataset_configuration.dataset_info.object_id == labeled_events.object_id + assert data_dataset_configuration.dataset_info.location == "processing/behavior/TestLabeledEvents/data" + assert data_dataset_configuration.dataset_info.full_shape == data.shape + assert data_dataset_configuration.dataset_info.dtype == data.dtype + assert data_dataset_configuration.chunk_shape == data.shape + assert data_dataset_configuration.buffer_shape == data.shape + assert data_dataset_configuration.compression_method == "gzip" + assert data_dataset_configuration.compression_options is None + + if backend == "zarr": + assert data_dataset_configuration.filter_methods is None + assert data_dataset_configuration.filter_options is None + + timestamps_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.dataset_info.dataset_name == "timestamps" + ) + assert isinstance(timestamps_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert timestamps_dataset_configuration.dataset_info.object_id == labeled_events.object_id + assert timestamps_dataset_configuration.dataset_info.location == "processing/behavior/TestLabeledEvents/timestamps" + assert timestamps_dataset_configuration.dataset_info.full_shape == timestamps.shape + assert timestamps_dataset_configuration.dataset_info.dtype == timestamps.dtype + assert timestamps_dataset_configuration.chunk_shape == timestamps.shape + assert timestamps_dataset_configuration.buffer_shape == timestamps.shape + assert timestamps_dataset_configuration.compression_method == "gzip" + assert timestamps_dataset_configuration.compression_options is None + + if backend == "zarr": + assert timestamps_dataset_configuration.filter_methods is None + assert timestamps_dataset_configuration.filter_options is None diff --git 
a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py new file mode 100644 index 000000000..3125bfc73 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_dataset_io_configurations_appended_files.py @@ -0,0 +1,146 @@ +""" +Unit tests for `get_default_dataset_io_configurations` operating on already written files open in append mode. +Mostly testing that the right objects are skipped from identification as candidates for configuration. +""" +from pathlib import Path + +import numpy as np +import pytest +from hdmf.common import VectorData +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.base import DynamicTable +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + HDF5DatasetIOConfiguration, + ZarrDatasetIOConfiguration, + get_default_dataset_io_configurations, +) + + +def generate_nwbfile_with_existing_time_series() -> NWBFile: + nwbfile = mock_NWBFile() + array = np.array([[1, 2, 3], [4, 5, 6]]) + time_series = mock_TimeSeries(name="ExistingTimeSeries", data=array) + nwbfile.add_acquisition(time_series) + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_hdf5_nwbfile_.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_dataset_configurations_zarr_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_nwbfile_with_existing_time_series() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_unwrapped_time_series_hdf5(hdf5_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_time_series_zarr(zarr_nwbfile_path): + array = np.array([[1, 2, 3], [4, 5, 6]]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + 
new_time_series = mock_TimeSeries(name="NewTimeSeries", data=array) + nwbfile.add_acquisition(new_time_series) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration) + assert dataset_configuration.dataset_info.object_id == new_time_series.object_id + assert dataset_configuration.dataset_info.location == "acquisition/NewTimeSeries/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +def test_unwrapped_dynamic_table_hdf5(hdf5_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, HDF5DatasetIOConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + + +def test_unwrapped_dynamic_table_zarr(zarr_nwbfile_path): + array = np.array([0.1, 0.2, 0.3]) + + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + column = VectorData(name="TestColumn", description="", data=array.squeeze()) + dynamic_table = DynamicTable(name="TestDynamicTable", description="", columns=[column]) + nwbfile.add_acquisition(dynamic_table) + dataset_configurations = list(get_default_dataset_io_configurations(nwbfile=nwbfile, backend="zarr")) + + assert len(dataset_configurations) == 1 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, ZarrDatasetIOConfiguration) + assert dataset_configuration.dataset_info.object_id == column.object_id + assert dataset_configuration.dataset_info.location == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.dataset_info.full_shape == array.shape + assert dataset_configuration.dataset_info.dtype == array.dtype + assert dataset_configuration.chunk_shape == array.shape + assert dataset_configuration.buffer_shape == array.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert 
dataset_configuration.filter_options is None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_info_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py similarity index 100% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_info_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_info_model.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py similarity index 58% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py index fd9e624a3..33b32d10a 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_dataset_io_configuration_model.py @@ -1,26 +1,26 @@ """Unit tests for the DatasetConfiguration Pydantic model.""" import pytest -from neuroconv.tools.nwb_helpers._models._base_models import DatasetConfiguration +from neuroconv.tools.nwb_helpers._models._base_models import DatasetIOConfiguration from neuroconv.tools.testing import mock_DatasetInfo def test_get_data_io_kwargs_abstract_error(): with pytest.raises(TypeError) as error_info: - DatasetConfiguration(dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384)) - assert "Can't instantiate abstract class DatasetConfiguration with abstract" in str(error_info.value) + DatasetIOConfiguration(dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384)) + assert "Can't instantiate abstract class DatasetIOConfiguration with abstract" in str(error_info.value) def test_get_data_io_kwargs_not_implemented(): - class TestDatasetConfiguration(DatasetConfiguration): + class TestDatasetIOConfiguration(DatasetIOConfiguration): def get_data_io_kwargs(self): super().get_data_io_kwargs() - dataset_configuration = TestDatasetConfiguration( + dataset_io_configuration = TestDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), ) with pytest.raises(NotImplementedError): - dataset_configuration.get_data_io_kwargs() + dataset_io_configuration.get_data_io_kwargs() diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py similarity index 95% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py index 2d6242ad1..7377ff1b8 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_backend_configuration_model.py @@ -6,7 +6,7 @@ def test_hdf5_backend_configuration_print(): - 
"""Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + """Test the printout display of a HDF5BackendConfiguration model looks nice.""" hdf5_backend_configuration = mock_HDF5BackendConfiguration() with patch("sys.stdout", new=StringIO()) as out: diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py similarity index 73% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py index d6de7ab4c..b31387fbf 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_hdf5_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_hdf5_dataset_io_configuration_model.py @@ -1,16 +1,14 @@ -"""Unit tests for the HDF5DatasetConfiguration Pydantic model.""" +"""Unit tests for the HDF5DatasetIOConfiguration Pydantic model.""" from io import StringIO from unittest.mock import patch -import pytest - from neuroconv.tools.nwb_helpers import AVAILABLE_HDF5_COMPRESSION_METHODS -from neuroconv.tools.testing import mock_HDF5DatasetConfiguration +from neuroconv.tools.testing import mock_HDF5DatasetIOConfiguration def test_hdf5_dataset_configuration_print(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -35,8 +33,8 @@ def test_hdf5_dataset_configuration_print(): def test_hdf5_dataset_configuration_print_with_compression_options(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration(compression_options=dict(level=5)) + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration(compression_options=dict(level=5)) with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -62,8 +60,8 @@ def test_hdf5_dataset_configuration_print_with_compression_options(): def test_hdf5_dataset_configuration_print_with_compression_disabled(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration(compression_method=None) + """Test the printout display of a HDF5DatasetIOConfiguration model looks nice.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration(compression_method=None) with patch("sys.stdout", new=StringIO()) as out: print(hdf5_dataset_configuration) @@ -86,12 +84,12 @@ def test_hdf5_dataset_configuration_print_with_compression_disabled(): def test_hdf5_dataset_configuration_repr(): - """Test the programmatic repr of a HDF5DatasetConfiguration model is more dataclass-like.""" - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + """Test the programmatic repr of a HDF5DatasetIOConfiguration model is more dataclass-like.""" + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() 
# Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects expected_repr = ( - "HDF5DatasetConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " + "HDF5DatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " "location='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), " "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), compression_method='gzip', " "compression_options=None)" @@ -108,7 +106,7 @@ def test_default_compression_is_always_available(): def test_get_data_io_kwargs(): - hdf5_dataset_configuration = mock_HDF5DatasetConfiguration() + hdf5_dataset_configuration = mock_HDF5DatasetIOConfiguration() assert hdf5_dataset_configuration.get_data_io_kwargs() == dict( chunks=(78125, 64), compression="gzip", compression_opts=None diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py similarity index 95% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py index e8017c719..da417710c 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_backend_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_backend_configuration_model.py @@ -6,7 +6,7 @@ def test_zarr_backend_configuration_print(): - """Test the printout display of a HDF5DatasetConfiguration model looks nice.""" + """Test the printout display of a ZarrBackendConfiguration model looks nice.""" zarr_backend_configuration = mock_ZarrBackendConfiguration() with patch("sys.stdout", new=StringIO()) as out: diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_dataset_configuration_model.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py similarity index 79% rename from tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_dataset_configuration_model.py rename to tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py index 8ddc5bf7e..e99c1dbca 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_zarr_dataset_configuration_model.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_models/test_zarr_dataset_io_configuration_model.py @@ -1,4 +1,4 @@ -"""Unit tests for the ZarrDatasetConfiguration Pydantic model.""" +"""Unit tests for the ZarrDatasetIOConfiguration Pydantic model.""" from io import StringIO from unittest.mock import patch @@ -7,14 +7,14 @@ from neuroconv.tools.nwb_helpers import ( AVAILABLE_ZARR_COMPRESSION_METHODS, - ZarrDatasetConfiguration, + ZarrDatasetIOConfiguration, ) -from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetConfiguration +from neuroconv.tools.testing import mock_DatasetInfo, mock_ZarrDatasetIOConfiguration -def test_zarr_dataset_configuration_print(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - 
zarr_dataset_configuration = mock_ZarrDatasetConfiguration() +def test_zarr_dataset_io_configuration_print(): + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -39,8 +39,8 @@ def test_zarr_dataset_configuration_print(): def test_zarr_dataset_configuration_print_with_compression_options(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(compression_options=dict(level=5)) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(compression_options=dict(level=5)) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -66,8 +66,8 @@ def test_zarr_dataset_configuration_print_with_compression_options(): def test_zarr_dataset_configuration_print_with_compression_disabled(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(compression_method=None) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(compression_method=None) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -90,8 +90,8 @@ def test_zarr_dataset_configuration_print_with_compression_disabled(): def test_zarr_dataset_configuration_print_with_filter_methods(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration(filter_methods=["delta"]) + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration(filter_methods=["delta"]) with patch("sys.stdout", new=StringIO()) as out: print(zarr_dataset_configuration) @@ -118,8 +118,8 @@ def test_zarr_dataset_configuration_print_with_filter_methods(): def test_zarr_dataset_configuration_print_with_filter_options(): - """Test the printout display of a ZarrDatasetConfiguration model looks nice.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration( + """Test the printout display of a ZarrDatasetIOConfiguration model looks nice.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration( filter_methods=["blosc"], filter_options=[dict(clevel=5)] ) @@ -149,12 +149,12 @@ def test_zarr_dataset_configuration_print_with_filter_options(): def test_zarr_dataset_configuration_repr(): - """Test the programmatic repr of a ZarrDatasetConfiguration model is more dataclass-like.""" - zarr_dataset_configuration = mock_ZarrDatasetConfiguration() + """Test the programmatic repr of a ZarrDatasetIOConfiguration model is more dataclass-like.""" + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() # Important to keep the `repr` unmodified for appearance inside iterables of DatasetInfo objects expected_repr = ( - "ZarrDatasetConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " + "ZarrDatasetIOConfiguration(dataset_info=DatasetInfo(object_id='481a0860-3a0c-40ec-b931-df4a3e9b101f', " "location='acquisition/TestElectricalSeries/data', dataset_name='data', dtype=dtype('int16'), " "full_shape=(1800000, 384)), chunk_shape=(78125, 64), buffer_shape=(1250000, 384), 
compression_method='gzip', " "compression_options=None, filter_methods=None, filter_options=None)" @@ -164,7 +164,7 @@ def test_zarr_dataset_configuration_repr(): def test_validator_filter_options_has_methods(): with pytest.raises(ValueError) as error_info: - ZarrDatasetConfiguration( + ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), @@ -181,7 +181,7 @@ def test_validator_filter_options_has_methods(): def test_validator_filter_methods_length_match_options(): with pytest.raises(ValueError) as error_info: - ZarrDatasetConfiguration( + ZarrDatasetIOConfiguration( dataset_info=mock_DatasetInfo(), chunk_shape=(78_125, 64), buffer_shape=(1_250_000, 384), @@ -205,7 +205,7 @@ def test_default_compression_is_always_available(): def test_get_data_io_kwargs(): - zarr_dataset_configuration = mock_ZarrDatasetConfiguration() + zarr_dataset_configuration = mock_ZarrDatasetIOConfiguration() assert zarr_dataset_configuration.get_data_io_kwargs() == dict( chunks=(78125, 64), compressor=GZip(level=1), filters=None diff --git a/tests/test_on_data/conversion_specifications/GIN_conversion_specification_missing_nwbfile_names.yml b/tests/test_on_data/conversion_specifications/GIN_conversion_specification_missing_nwbfile_names.yml index 7ba295bcc..ff17b0f52 100644 --- a/tests/test_on_data/conversion_specifications/GIN_conversion_specification_missing_nwbfile_names.yml +++ b/tests/test_on_data/conversion_specifications/GIN_conversion_specification_missing_nwbfile_names.yml @@ -28,6 +28,17 @@ experiments: sex: F age: P35D species: Mus musculus + - source_data: + ap: + file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.ap.bin + metadata: + NWBFile: + session_start_time: "2020-11-09T21:19:09+00:00" + Subject: + subject_id: Mouse 1 + sex: F + age: P35D + species: Mus musculus - nwbfile_name: example_defined_name metadata: NWBFile: diff --git a/tests/test_on_data/test_yaml_conversion_specification.py b/tests/test_on_data/test_yaml_conversion_specification.py index 5e44fac7c..f2abf210f 100644 --- a/tests/test_on_data/test_yaml_conversion_specification.py +++ b/tests/test_on_data/test_yaml_conversion_specification.py @@ -2,6 +2,7 @@ import unittest from datetime import datetime from pathlib import Path +from typing import Union import pytest from hdmf.testing import TestCase @@ -51,7 +52,9 @@ def test_run_conversion_from_yaml(): overwrite=True, ) - with NWBHDF5IO(path=OUTPUT_PATH / "example_converter_spec_1.nwb", mode="r") as io: + nwbfile_path_1 = OUTPUT_PATH / "example_converter_spec_1.nwb" + assert nwbfile_path_1.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path_1}'!" + with NWBHDF5IO(path=nwbfile_path_1, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" @@ -60,7 +63,9 @@ def test_run_conversion_from_yaml(): assert nwbfile.subject.subject_id == "1" assert "ElectricalSeriesAP" in nwbfile.acquisition - with NWBHDF5IO(path=OUTPUT_PATH / "example_converter_spec_2.nwb", mode="r") as io: + nwbfile_path_2 = OUTPUT_PATH / "example_converter_spec_2.nwb" + assert nwbfile_path_2.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path_2}'!" + with NWBHDF5IO(path=nwbfile_path_2, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." 
assert nwbfile.lab == "My Lab" @@ -68,7 +73,9 @@ def test_run_conversion_from_yaml(): assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-10T21:19:09+00:00") assert nwbfile.subject.subject_id == "002" - with NWBHDF5IO(path=OUTPUT_PATH / "example_converter_spec_3.nwb", mode="r") as io: + nwbfile_path_3 = OUTPUT_PATH / "example_converter_spec_3.nwb" + assert nwbfile_path_3.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path_3}'!" + with NWBHDF5IO(path=nwbfile_path_3, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Auto-generated by neuroconv" assert nwbfile.lab == "My Lab" @@ -93,7 +100,9 @@ def test_run_conversion_from_yaml_default_nwbfile_name(self): overwrite=True, ) - with NWBHDF5IO(path=self.test_folder / "sub-Mouse_1_ses-20201009T211909.nwb", mode="r") as io: + nwbfile_path = self.test_folder / "sub-Mouse-1_ses-20201009T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" @@ -101,14 +110,31 @@ def test_run_conversion_from_yaml_default_nwbfile_name(self): assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-09T21:19:09+00:00") assert nwbfile.subject.subject_id == "Mouse 1" assert "ElectricalSeriesAP" in nwbfile.acquisition - with NWBHDF5IO(path=self.test_folder / "example_defined_name.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "sub-Mouse-1_ses-20201109T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: + nwbfile = io.read() + assert nwbfile.session_description == "Subject navigating a Y-shaped maze." + assert nwbfile.lab == "My Lab" + assert nwbfile.institution == "My Institution" + assert nwbfile.session_start_time == datetime.fromisoformat("2020-11-09T21:19:09+00:00") + assert nwbfile.subject.subject_id == "Mouse 1" + assert "ElectricalSeriesAP" in nwbfile.acquisition + + nwbfile_path = self.test_folder / "example_defined_name.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" assert nwbfile.institution == "My Institution" assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-10T21:19:09+00:00") assert nwbfile.subject.subject_id == "MyMouse002" - with NWBHDF5IO(path=self.test_folder / "sub-Subject_Name_ses-20201011T211909.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "sub-Subject-Name_ses-20201011T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! 
" + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Auto-generated by neuroconv" assert nwbfile.lab == "My Lab" @@ -136,7 +162,7 @@ def test_run_conversion_from_yaml_no_nwbfile_name_or_other_metadata_assertion(se overwrite=True, ) - def test_run_conversion_from_yaml(self): + def test_run_conversion_from_yaml_on_behavior(self): path_to_test_yml_files = Path(__file__).parent / "conversion_specifications" yaml_file_path = path_to_test_yml_files / "GIN_conversion_specification_videos.yml" run_conversion_from_yaml( diff --git a/tests/test_on_data/test_yaml_conversion_specification_cli.py b/tests/test_on_data/test_yaml_conversion_specification_cli.py index b65fb4c7c..bdadc9072 100644 --- a/tests/test_on_data/test_yaml_conversion_specification_cli.py +++ b/tests/test_on_data/test_yaml_conversion_specification_cli.py @@ -24,7 +24,9 @@ def test_run_conversion_from_yaml_cli(self): ) ) - with NWBHDF5IO(path=self.test_folder / "example_converter_spec_1.nwb", mode="r") as io: + nwbfile_path = self.test_folder / "example_converter_spec_1.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" @@ -32,14 +34,20 @@ def test_run_conversion_from_yaml_cli(self): assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-09T21:19:09+00:00") assert nwbfile.subject.subject_id == "1" assert "ElectricalSeriesAP" in nwbfile.acquisition - with NWBHDF5IO(path=self.test_folder / "example_converter_spec_2.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "example_converter_spec_2.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" assert nwbfile.institution == "My Institution" assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-10T21:19:09+00:00") assert nwbfile.subject.subject_id == "002" - with NWBHDF5IO(path=self.test_folder / "example_converter_spec_3.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "example_converter_spec_3.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Auto-generated by neuroconv" assert nwbfile.lab == "My Lab" @@ -60,7 +68,9 @@ def test_run_conversion_from_yaml_default_nwbfile_name(self): ) ) - with NWBHDF5IO(path=self.test_folder / "sub-Mouse_1_ses-20201009T211909.nwb", mode="r") as io: + nwbfile_path = self.test_folder / "sub-Mouse-1_ses-20201009T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." 
assert nwbfile.lab == "My Lab" @@ -68,14 +78,31 @@ def test_run_conversion_from_yaml_default_nwbfile_name(self): assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-09T21:19:09+00:00") assert nwbfile.subject.subject_id == "Mouse 1" assert "ElectricalSeriesAP" in nwbfile.acquisition - with NWBHDF5IO(path=self.test_folder / "example_defined_name.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "sub-Mouse-1_ses-20201109T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: + nwbfile = io.read() + assert nwbfile.session_description == "Subject navigating a Y-shaped maze." + assert nwbfile.lab == "My Lab" + assert nwbfile.institution == "My Institution" + assert nwbfile.session_start_time == datetime.fromisoformat("2020-11-09T21:19:09+00:00") + assert nwbfile.subject.subject_id == "Mouse 1" + assert "ElectricalSeriesAP" in nwbfile.acquisition + + nwbfile_path = self.test_folder / "example_defined_name.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Subject navigating a Y-shaped maze." assert nwbfile.lab == "My Lab" assert nwbfile.institution == "My Institution" assert nwbfile.session_start_time == datetime.fromisoformat("2020-10-10T21:19:09+00:00") assert nwbfile.subject.subject_id == "MyMouse002" - with NWBHDF5IO(path=self.test_folder / "sub-Subject_Name_ses-20201011T211909.nwb", mode="r") as io: + + nwbfile_path = self.test_folder / "sub-Subject-Name_ses-20201011T211909.nwb" + assert nwbfile_path.exists(), f"`run_conversion_from_yaml` failed to create the file at '{nwbfile_path}'! " + with NWBHDF5IO(path=nwbfile_path, mode="r") as io: nwbfile = io.read() assert nwbfile.session_description == "Auto-generated by neuroconv" assert nwbfile.lab == "My Lab"
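
[Editor's note: as a reading aid for the `_yaml_conversion_specification.py` changes earlier in this diff, here is a condensed, illustrative sketch of the post-conversion renaming step those changes implement. It mirrors the patched code and assumes dandi's `_get_pynwb_metadata` and `create_unique_filenames_from_metadata` behave as used there; it is not part of the patch itself.]

    # Condensed sketch of the renaming flow introduced in
    # _yaml_conversion_specification.py (illustrative; mirrors the patch above).
    from pathlib import Path

    from dandi.organize import create_unique_filenames_from_metadata
    from dandi.pynwb_utils import _get_pynwb_metadata

    def rename_temp_nwbfiles(output_folder_path: Path) -> None:
        # Only files written under an auto-generated temporary name need renaming.
        nwbfile_paths_to_set = [
            path for path in output_folder_path.iterdir()
            if path.suffix == ".nwb" and "temp_nwbfile_name_" in path.stem
        ]
        if not nwbfile_paths_to_set:
            return

        # Extract subject/session metadata from each file and let dandi derive
        # unique, DANDI-style filenames (e.g. sub-Mouse-1_ses-20201009T211909.nwb,
        # as asserted in the tests above).
        dandi_metadata_list = []
        for path in nwbfile_paths_to_set:
            dandi_metadata = _get_pynwb_metadata(path=path)
            dandi_metadata.update(path=path)
            dandi_metadata_list.append(dandi_metadata)
        dandi_metadata_with_set_paths = create_unique_filenames_from_metadata(metadata=dandi_metadata_list)

        # Rename each temporary file to its derived DANDI filename on disk.
        for path, dandi_metadata in zip(nwbfile_paths_to_set, dandi_metadata_with_set_paths):
            dandi_filename = dandi_metadata["dandi_filename"]
            assert dandi_filename != ".nwb", f"Not enough metadata available to assign name to {path}!"
            path.rename(str(output_folder_path / dandi_filename))
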