From 5b0271e12b5422d2fb5b6a613eabb3abcfbb1345 Mon Sep 17 00:00:00 2001 From: h-mayorquin Date: Wed, 6 Dec 2023 08:49:29 +0100 Subject: [PATCH 1/7] failing test --- tests/test_minimal/test_tools/test_expand_paths.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_minimal/test_tools/test_expand_paths.py b/tests/test_minimal/test_tools/test_expand_paths.py index 99d396382..76d4746b2 100644 --- a/tests/test_minimal/test_tools/test_expand_paths.py +++ b/tests/test_minimal/test_tools/test_expand_paths.py @@ -13,11 +13,11 @@ def test_only_folder_match(tmpdir): base_directory = Path(tmpdir) - sub_directory1 = base_directory / "a_simple_pattern_1" - sub_directory2 = base_directory / "a_simple_pattern_2" + sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" + sub_directory2 = base_directory / "subject2" / "a_simple_pattern_2" - sub_directory1.mkdir(exist_ok=True) - sub_directory2.mkdir(exist_ok=True) + sub_directory1.mkdir(exist_ok=True, parents=True) + sub_directory2.mkdir(exist_ok=True, parents=True) # Add files with the same name to both folders file1 = sub_directory1 / "a_simple_pattern_1.bin" @@ -28,8 +28,8 @@ def test_only_folder_match(tmpdir): file2.touch() # Add another sub-nested folder with a folder - sub_directory3 = sub_directory1 / "a_simple_pattern_3" - sub_directory3.mkdir(exist_ok=True) + sub_directory3 = sub_directory1 / "intermediate_nested" / "a_simple_pattern_3" + sub_directory3.mkdir(exist_ok=True, parents=True) file3 = sub_directory3 / "a_simple_pattern_3.bin" file3.touch() @@ -37,7 +37,7 @@ def test_only_folder_match(tmpdir): source_data_spec = { "a_source": { "base_directory": base_directory, - "folder_path": "a_simple_pattern_{session_id}", + "folder_path": "{subject_id}/a_simple_pattern_{session_id}", } } From 5317d865d4ba4d44a9279d7086da16d70e009a32 Mon Sep 17 00:00:00 2001 From: h-mayorquin Date: Wed, 6 Dec 2023 09:14:42 +0100 Subject: [PATCH 2/7] modify test path test --- .../test_tools/test_expand_paths.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/test_minimal/test_tools/test_expand_paths.py b/tests/test_minimal/test_tools/test_expand_paths.py index 76d4746b2..c4683e6b2 100644 --- a/tests/test_minimal/test_tools/test_expand_paths.py +++ b/tests/test_minimal/test_tools/test_expand_paths.py @@ -13,7 +13,7 @@ def test_only_folder_match(tmpdir): base_directory = Path(tmpdir) - sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" + sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" sub_directory2 = base_directory / "subject2" / "a_simple_pattern_2" sub_directory1.mkdir(exist_ok=True, parents=True) @@ -41,13 +41,12 @@ def test_only_folder_match(tmpdir): } } - # Instantiate LocalPathExpander - path_expander = LocalPathExpander() metadata_list = path_expander.expand_paths(source_data_spec) folder_paths = [metadata_match["source_data"]["a_source"]["folder_path"] for metadata_match in metadata_list] - expected = {str(sub_directory1), str(sub_directory2), str(sub_directory3)} + # Note that sub_directory3 is not included because it does not conform to the pattern + expected = {str(sub_directory1), str(sub_directory2)} assert set(folder_paths) == expected @@ -55,11 +54,11 @@ def test_only_folder_match(tmpdir): def test_only_file_match(tmpdir): base_directory = Path(tmpdir) - sub_directory1 = base_directory / "a_simple_pattern_1" - sub_directory2 = base_directory / "a_simple_pattern_2" + sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" + sub_directory2 = base_directory / "subject2" / "a_simple_pattern_2" - sub_directory1.mkdir(exist_ok=True) - sub_directory2.mkdir(exist_ok=True) + sub_directory1.mkdir(exist_ok=True, parents=True) + sub_directory2.mkdir(exist_ok=True, parents=True) # Add files with the same name to both folders file1 = sub_directory1 / "a_simple_pattern_1.bin" @@ -70,8 +69,8 @@ def test_only_file_match(tmpdir): file2.touch() # Add another sub-nested folder with a folder - sub_directory3 = sub_directory1 / "a_simple_pattern_3" - sub_directory3.mkdir(exist_ok=True) + sub_directory3 = sub_directory1 / "intermediate_nested" / "a_simple_pattern_3" + sub_directory3.mkdir(exist_ok=True, parents=True) file3 = sub_directory3 / "a_simple_pattern_3.bin" file3.touch() @@ -79,17 +78,16 @@ def test_only_file_match(tmpdir): source_data_spec = { "a_source": { "base_directory": base_directory, - "file_path": "a_simple_pattern_{session_id}.bin", + "file_path": "{subject_id}/{a_parent_folder}/a_simple_pattern_{session_id}.bin", } } - # Instantiate LocalPathExpander - path_expander = LocalPathExpander() metadata_list = path_expander.expand_paths(source_data_spec) file_paths = [metadata_match["source_data"]["a_source"]["file_path"] for metadata_match in metadata_list] - expected = {str(file1), str(file2), str(file3)} + # Note that file3 is not included because it does not conform to the pattern + expected = {str(file1), str(file2)} assert set(file_paths) == expected From abe5b4ed9e294a0a50ffcadb3874a1105c984639 Mon Sep 17 00:00:00 2001 From: h-mayorquin Date: Wed, 6 Dec 2023 09:14:56 +0100 Subject: [PATCH 3/7] pass test --- src/neuroconv/tools/path_expansion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/path_expansion.py b/src/neuroconv/tools/path_expansion.py index f056b6a0e..29ceb05e3 100644 --- a/src/neuroconv/tools/path_expansion.py +++ b/src/neuroconv/tools/path_expansion.py @@ -19,7 +19,10 @@ def extract_metadata(self, base_directory: DirectoryPath, format_: str): for filepath in self.list_directory(base_directory=Path(base_directory)): result = parse(format_, filepath) if result: - yield filepath, result.named + named_result = result.named + no_field_in_metadata_contains_os_sep = all(os.sep not in str(val) for val in named_result.values()) + if no_field_in_metadata_contains_os_sep: + yield filepath, named_result @abc.abstractmethod def list_directory(self, base_directory: DirectoryPath) -> Iterable[FilePath]: From e5dc64960627177a24e68757edb8e06286587791 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Wed, 6 Dec 2023 09:33:00 +0100 Subject: [PATCH 4/7] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb927f3b2..89ec0ec3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Upcoming ### Bug fixes -* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #675](https://github.com/catalystneuro/neuroconv/pull/675) +* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #679](https://github.com/catalystneuro/neuroconv/pull/675) and [PR #675](https://github.com/catalystneuro/neuroconv/pull/679 ### Features From 8efd9e5f078e824b59d8074e7ab26a49f1e4a023 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Thu, 7 Dec 2023 08:18:42 +0100 Subject: [PATCH 5/7] test and docstring --- src/neuroconv/tools/path_expansion.py | 25 ++++++++++++ .../test_tools/test_expand_paths.py | 39 ++++++++++++++++--- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/path_expansion.py b/src/neuroconv/tools/path_expansion.py index 29ceb05e3..e4c16cdf9 100644 --- a/src/neuroconv/tools/path_expansion.py +++ b/src/neuroconv/tools/path_expansion.py @@ -13,6 +13,31 @@ class AbstractPathExpander(abc.ABC): def extract_metadata(self, base_directory: DirectoryPath, format_: str): + """ + Uses the parse library to extract metadata from file paths in the base_directory. + + This method iterates over files in `base_directory`, parsing each file path according to `format_`. + The format string is adjusted to the current operating system's path separator. The method yields + each file path and its corresponding parsed metadata. To constrain metadata matches to only the + name of the file or folder/directory, the method checks that the metadata does not contain the + OS path separator (e.g., '/' or '\\'). + + Parameters + ---------- + base_directory : DirectoryPath + The base directory from which to list files for metadata extraction. It should be a path-like + object that is convertible to a `pathlib.Path`. + format_ : str + The format string used for parsing the file paths. This string can represent a path in any + OS format, and is adjusted internally to match the current OS's path separator. + + Yields + ------ + Tuple[Path, Dict[str, Any]] + A tuple containing the file path as a `Path` object and a dictionary of the named metadata + extracted from the file path. + """ + format_ = format_.replace("\\", os.sep) # Actual character is a single back-slash; first is an escape for that format_ = format_.replace("/", os.sep) # our f-string uses '/' to communicate os-independent separators diff --git a/tests/test_minimal/test_tools/test_expand_paths.py b/tests/test_minimal/test_tools/test_expand_paths.py index c4683e6b2..085e5df06 100644 --- a/tests/test_minimal/test_tools/test_expand_paths.py +++ b/tests/test_minimal/test_tools/test_expand_paths.py @@ -42,14 +42,24 @@ def test_only_folder_match(tmpdir): } path_expander = LocalPathExpander() - metadata_list = path_expander.expand_paths(source_data_spec) - folder_paths = [metadata_match["source_data"]["a_source"]["folder_path"] for metadata_match in metadata_list] + matches_list = path_expander.expand_paths(source_data_spec) + folder_paths = [match["source_data"]["a_source"]["folder_path"] for match in matches_list] # Note that sub_directory3 is not included because it does not conform to the pattern expected = {str(sub_directory1), str(sub_directory2)} - assert set(folder_paths) == expected + metadata_list = [match["metadata"].to_dict() for match in matches_list] + expected_metadata = [ + {"Subject": {"subject_id": "subject1"}, "NWBFile": {"session_id": "1"}}, + {"Subject": {"subject_id": "subject2"}, "NWBFile": {"session_id": "2"}}, + ] + + # Sort both lists by subject id to ensure order is the same + metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"]) + expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"]) + assert metadata_list == expected_metadata + def test_only_file_match(tmpdir): base_directory = Path(tmpdir) @@ -83,13 +93,32 @@ def test_only_file_match(tmpdir): } path_expander = LocalPathExpander() - metadata_list = path_expander.expand_paths(source_data_spec) - file_paths = [metadata_match["source_data"]["a_source"]["file_path"] for metadata_match in metadata_list] + matches_list = path_expander.expand_paths(source_data_spec) + file_paths = [match["source_data"]["a_source"]["file_path"] for match in matches_list] # Note that file3 is not included because it does not conform to the pattern expected = {str(file1), str(file2)} assert set(file_paths) == expected + metadata_list = [match["metadata"].to_dict() for match in matches_list] + expected_metadata = [ + { + "Subject": {"subject_id": "subject1"}, + "NWBFile": {"session_id": "1"}, + "extras": {"a_parent_folder": "a_simple_pattern_1"}, + }, + { + "Subject": {"subject_id": "subject2"}, + "NWBFile": {"session_id": "2"}, + "extras": {"a_parent_folder": "a_simple_pattern_2"}, + }, + ] + + # Sort both lists by subject id to ensure order is the same + metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"]) + expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"]) + assert metadata_list == expected_metadata + def test_expand_paths(tmpdir): expander = LocalPathExpander() From cd18734d5df41da9fdf08bab7873748c2468443e Mon Sep 17 00:00:00 2001 From: bendichter Date: Fri, 8 Dec 2023 12:55:55 -0500 Subject: [PATCH 6/7] factor out file creation and add another test case --- .../test_tools/test_expand_paths.py | 96 +++++++++++-------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/tests/test_minimal/test_tools/test_expand_paths.py b/tests/test_minimal/test_tools/test_expand_paths.py index 085e5df06..e8c4a4659 100644 --- a/tests/test_minimal/test_tools/test_expand_paths.py +++ b/tests/test_minimal/test_tools/test_expand_paths.py @@ -2,36 +2,51 @@ import unittest from datetime import datetime from pathlib import Path - -import pytest +from typing import List, Tuple from neuroconv.tools import LocalPathExpander from neuroconv.tools.testing import generate_path_expander_demo_ibl from neuroconv.utils import NWBMetaDataEncoder -def test_only_folder_match(tmpdir): - base_directory = Path(tmpdir) - - sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" - sub_directory2 = base_directory / "subject2" / "a_simple_pattern_2" +def create_test_directories_and_files( + base_directory: Path, directories_and_files: List[Tuple[List[str], List[str]]] +) -> None: + """ + Create test directories and files in a way that is compatible across different + operating systems. + + Parameters + ---------- + base_directory : Path + The base directory under which all subdirectories and files will be created. + directories_and_files : List[Tuple[List[str], List[str]]] + A list where each element is a tuple. The first element of the tuple is a list + of directory components, and the second element is a list of file names to be + created in that directory. + """ + for directory_components, files in directories_and_files: + # Create directory using Path for OS compatibility + full_directory_path = base_directory.joinpath(*directory_components) + full_directory_path.mkdir(parents=True, exist_ok=True) + + # Create files in the directory + for file in files: + (full_directory_path / file).touch() - sub_directory1.mkdir(exist_ok=True, parents=True) - sub_directory2.mkdir(exist_ok=True, parents=True) - # Add files with the same name to both folders - file1 = sub_directory1 / "a_simple_pattern_1.bin" - file2 = sub_directory2 / "a_simple_pattern_2.bin" +def test_only_folder_match(tmpdir): + base_directory = Path(tmpdir) - # Create files - file1.touch() - file2.touch() + # Define the directories and files to be created + directories_and_files = [ + (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]), # matches + (["subject1"], ["a_simple_pattern_file.bin"]), # matches query but is a file + (["subject2", "a_simple_pattern_2", "nested_directory"], []), # match should not contain nested folder + ] - # Add another sub-nested folder with a folder - sub_directory3 = sub_directory1 / "intermediate_nested" / "a_simple_pattern_3" - sub_directory3.mkdir(exist_ok=True, parents=True) - file3 = sub_directory3 / "a_simple_pattern_3.bin" - file3.touch() + # Create test directories and files + create_test_directories_and_files(base_directory, directories_and_files) # Specify source data (note this assumes the files are arranged in the same way as in the example data) source_data_spec = { @@ -46,7 +61,10 @@ def test_only_folder_match(tmpdir): folder_paths = [match["source_data"]["a_source"]["folder_path"] for match in matches_list] # Note that sub_directory3 is not included because it does not conform to the pattern - expected = {str(sub_directory1), str(sub_directory2)} + expected = { + str(base_directory.joinpath("subject1", "a_simple_pattern_1")), + str(base_directory.joinpath("subject2", "a_simple_pattern_2")), + } assert set(folder_paths) == expected metadata_list = [match["metadata"].to_dict() for match in matches_list] @@ -64,25 +82,18 @@ def test_only_folder_match(tmpdir): def test_only_file_match(tmpdir): base_directory = Path(tmpdir) - sub_directory1 = base_directory / "subject1" / "a_simple_pattern_1" - sub_directory2 = base_directory / "subject2" / "a_simple_pattern_2" - - sub_directory1.mkdir(exist_ok=True, parents=True) - sub_directory2.mkdir(exist_ok=True, parents=True) - - # Add files with the same name to both folders - file1 = sub_directory1 / "a_simple_pattern_1.bin" - file2 = sub_directory2 / "a_simple_pattern_2.bin" - - # Create files - file1.touch() - file2.touch() + # Define the directories and files to be created + directories_and_files = [ + (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]), # matches + (["subject2", "a_simple_pattern_2"], ["a_simple_pattern_2.bin"]), # matches + ( # intermediate nested folder breaks match + ["subject1", "intermediate_nested", "a_simple_pattern_3"], + ["a_simple_pattern_3.bin"], + ), + ] - # Add another sub-nested folder with a folder - sub_directory3 = sub_directory1 / "intermediate_nested" / "a_simple_pattern_3" - sub_directory3.mkdir(exist_ok=True, parents=True) - file3 = sub_directory3 / "a_simple_pattern_3.bin" - file3.touch() + # Create test directories and files + create_test_directories_and_files(base_directory, directories_and_files) # Specify source data (note this assumes the files are arranged in the same way as in the example data) source_data_spec = { @@ -94,10 +105,13 @@ def test_only_file_match(tmpdir): path_expander = LocalPathExpander() matches_list = path_expander.expand_paths(source_data_spec) - file_paths = [match["source_data"]["a_source"]["file_path"] for match in matches_list] + file_paths = set(match["source_data"]["a_source"]["file_path"] for match in matches_list) # Note that file3 is not included because it does not conform to the pattern - expected = {str(file1), str(file2)} + expected = { + str(base_directory / "subject1" / "a_simple_pattern_1" / "a_simple_pattern_1.bin"), + str(base_directory / "subject2" / "a_simple_pattern_2" / "a_simple_pattern_2.bin"), + } assert set(file_paths) == expected metadata_list = [match["metadata"].to_dict() for match in matches_list] From 24a669e2e3c6d478b655baa21aa573b681ffc701 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Sun, 10 Dec 2023 17:23:34 -0500 Subject: [PATCH 7/7] Update tests/test_minimal/test_tools/test_expand_paths.py --- tests/test_minimal/test_tools/test_expand_paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_minimal/test_tools/test_expand_paths.py b/tests/test_minimal/test_tools/test_expand_paths.py index e8c4a4659..69842dba9 100644 --- a/tests/test_minimal/test_tools/test_expand_paths.py +++ b/tests/test_minimal/test_tools/test_expand_paths.py @@ -112,7 +112,7 @@ def test_only_file_match(tmpdir): str(base_directory / "subject1" / "a_simple_pattern_1" / "a_simple_pattern_1.bin"), str(base_directory / "subject2" / "a_simple_pattern_2" / "a_simple_pattern_2.bin"), } - assert set(file_paths) == expected + assert file_paths == expected metadata_list = [match["metadata"].to_dict() for match in matches_list] expected_metadata = [