From 5555fba93af52627defa88e8c479ff4e2d909936 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:42:32 +0100 Subject: [PATCH] fix output_path with timestamp error. added function to generate configs on desired cache dirs. --- .../cellfinder/cellfinder_main.py | 108 ++++++------ .../test_cellfinder_workflow.py | 164 +++++------------- 2 files changed, 102 insertions(+), 170 deletions(-) diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py index bfabec75..2c4bf30e 100644 --- a/brainglobe_workflows/cellfinder/cellfinder_main.py +++ b/brainglobe_workflows/cellfinder/cellfinder_main.py @@ -29,47 +29,47 @@ logger = logging.getLogger(__name__) # Default config -DATA_URL = "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip" -DATA_HASH = "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" -CELLFINDER_CACHE_DIR = Path.home() / ".cellfinder_benchmarks" - -DEFAULT_CONFIG_DICT = { - "install_path": CELLFINDER_CACHE_DIR, - "data_url": DATA_URL, - "data_hash": DATA_HASH, - "local_path": CELLFINDER_CACHE_DIR / "cellfinder_test_data", - "signal_parent_dir": str( - CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "signal" - ), - "background_parent_dir": str( - CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "background" - ), - "output_path": CELLFINDER_CACHE_DIR / "cellfinder_output", - "detected_cells_filepath": ( - CELLFINDER_CACHE_DIR / "cellfinder_output" / "detected_cells.xml" - ), - "voxel_sizes": [5, 2, 2], # microns - "start_plane": 0, - "end_plane": -1, - "trained_model": None, # if None, it will use a default model - "model_weights": None, - "model": "resnet50_tv", - "batch_size": 32, - "n_free_cpus": 2, - "network_voxel_sizes": [5, 1, 1], - "soma_diameter": 16, - "ball_xy_size": 6, - "ball_z_size": 15, - "ball_overlap_fraction": 0.6, - "log_sigma_size": 0.2, - "n_sds_above_mean_thresh": 10, - "soma_spread_factor": 1.4, - "max_cluster_size": 100000, - "cube_width": 50, - "cube_height": 50, - "cube_depth": 20, - "network_depth": "50", -} +CELLFINDER_CACHE_DIR = Path.home() / ".cellfinder_workflows" + + +def default_config_dict(CELLFINDER_CACHE_DIR): + return { + "install_path": CELLFINDER_CACHE_DIR, + "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", + "data_hash": ( + "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" + ), + "local_path": CELLFINDER_CACHE_DIR / "cellfinder_test_data", + "signal_parent_dir": str( + CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "signal" + ), + "background_parent_dir": str( + CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "background" + ), + "output_path_basename": CELLFINDER_CACHE_DIR / "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [5, 2, 2], # microns + "start_plane": 0, + "end_plane": -1, + "trained_model": None, # if None, it will use a default model + "model_weights": None, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [5, 1, 1], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50", + } @dataclass @@ -90,8 +90,8 @@ class CellfinderConfig: local_path: Pathlike signal_parent_dir: str background_parent_dir: str - output_path: Pathlike - detected_cells_filepath: Pathlike + output_path_basename: Pathlike + detected_cells_filename: Pathlike # preprocessing parameters voxel_sizes: Tuple[float, float, float] @@ -120,6 +120,7 @@ class CellfinderConfig: list_signal_files: Optional[list] = None list_background_files: Optional[list] = None + output_path: Optional[Pathlike] = None def example_cellfinder_script(): @@ -159,21 +160,23 @@ def run_workflow_from_cellfinder_run(cfg): ) # Save results to xml file - save_cells(detected_cells, cfg.detected_cells_filepath) + save_cells(detected_cells, cfg.output_path / cfg.detected_cells_filename) -def setup_workflow(): +def setup_workflow(cellfinder_cache_dir=CELLFINDER_CACHE_DIR): """Prepare configuration to run workflow This includes - instantiating the config dictionary, - checking if the input data exists locally, and fetching from GIN repository otherwise, - - creating the directory for the output of the workflow if it doesn't exist + - creating a timestamped directory for the output of the workflow if + it doesn't exist and adding it to the config To instantiate the config dictionary, we first check if an environment variable "CELLFINDER_CONFIG_PATH" pointing to a config json file exists. If not, the default config (DEFAULT_CONFIG_DICT) is used. + Returns ------- config : CellfinderConfig @@ -182,7 +185,7 @@ def setup_workflow(): """ # Define config - # if environment variable defined + # if environment variable defined, that prevails if "CELLFINDER_CONFIG_PATH" in os.environ.keys(): input_config_path = Path(os.environ["CELLFINDER_CONFIG_PATH"]) assert input_config_path.exists() @@ -198,9 +201,9 @@ def setup_workflow(): "Configuration retrieved from " f'{os.environ["CELLFINDER_CONFIG_PATH"]}' ) - # else use the default config + # else use the default config, with the cellfinder cache directory provided else: - config = CellfinderConfig(**DEFAULT_CONFIG_DICT) + config = CellfinderConfig(**default_config_dict(cellfinder_cache_dir)) logger.info("Using default configuration") # Retrieve and add lists of input data to config if neither are defined @@ -210,9 +213,12 @@ def setup_workflow(): # Create output directory if it doesn't exist, timestamped timestamp = datetime.datetime.now() timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S") - (Path(str(config.output_path) + "_" + timestamp_formatted)).mkdir( - parents=True, exist_ok=True + output_path_timestamped = Path( + str(config.output_path) + timestamp_formatted ) + output_path_timestamped.mkdir(parents=True, exist_ok=True) + # add to config + config.output_path = output_path_timestamped return config diff --git a/tests/test_integration/test_cellfinder_workflow.py b/tests/test_integration/test_cellfinder_workflow.py index dd51eb31..765939ca 100644 --- a/tests/test_integration/test_cellfinder_workflow.py +++ b/tests/test_integration/test_cellfinder_workflow.py @@ -6,6 +6,7 @@ import pytest from brainglobe_workflows.cellfinder.cellfinder_main import ( + default_config_dict, run_workflow_from_cellfinder_run, setup_workflow, ) @@ -26,58 +27,7 @@ def cellfinder_cache_dir(tmp_path): @pytest.fixture() -def config_from_dict(cellfinder_cache_dir): - DATA_URL = "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip" - DATA_HASH = ( - "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" - ) - CELLFINDER_CACHE_DIR = cellfinder_cache_dir - - workflow_config = { - "install_path": CELLFINDER_CACHE_DIR, - "data_url": DATA_URL, - "data_hash": DATA_HASH, - "local_path": CELLFINDER_CACHE_DIR / "cellfinder_test_data", - "signal_parent_dir": str( - CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "signal" - ), - "background_parent_dir": str( - CELLFINDER_CACHE_DIR / "cellfinder_test_data" / "background" - ), - "output_path": CELLFINDER_CACHE_DIR / "cellfinder_output", - "detected_cells_filepath": ( - CELLFINDER_CACHE_DIR / "cellfinder_output" / "detected_cells.xml" - ), - "voxel_sizes": [5, 2, 2], # microns - "start_plane": 0, - "end_plane": -1, - "trained_model": None, # if None, it will use a default model - "model_weights": None, - "model": "resnet50_tv", - "batch_size": 32, - "n_free_cpus": 2, - "network_voxel_sizes": [5, 1, 1], - "soma_diameter": 16, - "ball_xy_size": 6, - "ball_z_size": 15, - "ball_overlap_fraction": 0.6, - "log_sigma_size": 0.2, - "n_sds_above_mean_thresh": 10, - "soma_spread_factor": 1.4, - "max_cluster_size": 100000, - "cube_width": 50, - "cube_height": 50, - "cube_depth": 20, - "network_depth": "50", - } - return workflow_config - - -@pytest.fixture() -def config_from_env_var(tmp_path, config_from_dict): - # create a temp json file to dump config data - input_config_path = tmp_path / "input_config.json" - +def config_from_env_var(tmp_path, cellfinder_cache_dir): # dump config from fixture into a json file # ensure all Paths are JSON serializable def prep_json(obj): @@ -86,8 +36,17 @@ def prep_json(obj): else: return json.JSONEncoder.default(obj) + # create a temp json file to dump config data + input_config_path = tmp_path / "input_config.json" + + # alter config if required by the test + # - missing signal directory + # - missing background directory + config_dict = default_config_dict(cellfinder_cache_dir) + + # dump config into json with open(input_config_path, "w") as js: - json.dump(config_from_dict, js, default=prep_json) + json.dump(config_dict, js, default=prep_json) # define environment variable pointing to this json file os.environ["CELLFINDER_CONFIG_PATH"] = str(input_config_path) @@ -97,23 +56,7 @@ def prep_json(obj): del os.environ["CELLFINDER_CONFIG_PATH"] -def test_run_with_predefined_default_config( - config_from_dict, caplog, logger_str -): - # run setup and workflow - with caplog.at_level( - logging.DEBUG, logger=logger_str - ): # temporarily sets the log level for the given logger - cfg = setup_workflow(config_from_dict) - run_workflow_from_cellfinder_run(cfg) - - # check log - assert "Using default configuration" in caplog.messages - - -def test_run_with_env_var_defined_config( - config_from_env_var, caplog, logger_str -): +def test_run_with_env_var_config(config_from_env_var, caplog, logger_str): # check environment variable exists assert "CELLFINDER_CONFIG_PATH" in os.environ.keys() @@ -127,64 +70,47 @@ def test_run_with_env_var_defined_config( "Configuration retrieved from " f'{os.environ["CELLFINDER_CONFIG_PATH"]}' in caplog.messages ) - - -def test_setup_with_missing_signal_data(config_from_dict, caplog, logger_str): - # check neither signal or background data exist locally, - assert not Path(config_from_dict["signal_parent_dir"]).exists() - assert not Path(config_from_dict["background_parent_dir"]).exists() - - # create a directory for the background only - Path(config_from_dict["background_parent_dir"]).mkdir( - parents=True, exist_ok=True - ) - - # run setup - # context manager temporarily sets the log level for the given logger - with caplog.at_level(logging.DEBUG, logger=logger_str): - cfg = setup_workflow(config_from_dict) - - # check log-- when run as a suite, both directories exist already? assert ( - f"The directory {cfg.signal_parent_dir} " - "does not exist" in caplog.messages - ) - - -def test_setup_with_missing_background_data( - config_from_dict, caplog, logger_str -): - # check neither signal or background data exist locally, - assert not Path(config_from_dict["signal_parent_dir"]).exists() - assert not Path(config_from_dict["background_parent_dir"]).exists() - - # create a directory for the signal, but not for the background - Path(config_from_dict["signal_parent_dir"]).mkdir( - parents=True, exist_ok=True - ) - - # run setup - with caplog.at_level(logging.DEBUG, logger=logger_str): - cfg = setup_workflow(config_from_dict) - - # check log - assert ( - f"The directory {cfg.background_parent_dir} " - "does not exist" in caplog.messages + "Fetching input data from the " + "provided GIN repository" in caplog.messages ) -def test_setup_fetching_from_GIN(config_from_dict, caplog, logger_str): - # check neither signal or background data exist locally before setup - assert not Path(config_from_dict["signal_parent_dir"]).exists() - assert not Path(config_from_dict["background_parent_dir"]).exists() +def test_run_with_default_config(cellfinder_cache_dir, caplog, logger_str): + # check environment var is not defined + assert "CELLFINDER_CONFIG_PATH" not in os.environ.keys() - # run setup + # run setup and workflow with caplog.at_level(logging.DEBUG, logger=logger_str): - setup_workflow(config_from_dict) + cfg = setup_workflow(cellfinder_cache_dir) + run_workflow_from_cellfinder_run(cfg) # check log + assert "Using default configuration" in caplog.messages assert ( "Fetching input data from the " "provided GIN repository" in caplog.messages ) + + +# test running on local data? +# def test_run_with_default_config_local( +# caplog, logger_str +# ): +# # check environment var is not defined +# assert "CELLFINDER_CONFIG_PATH" not in os.environ.keys() + +# # run setup and workflow +# # do not pass a cellfinder_cache_dir location to use default +# with caplog.at_level( +# logging.DEBUG, logger=logger_str +# ): +# cfg = setup_workflow() +# run_workflow_from_cellfinder_run(cfg) + +# # check log +# assert "Using default configuration" in caplog.messages +# assert ( +# "Fetching input data from the " +# "local directories" in caplog.messages +# )