From 960b32818dfe96d928ec8f3cdaafd44a39377dc0 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:24:53 +0100
Subject: [PATCH 01/24] update gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index aedc8d7c..4ef8a0a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,3 +81,6 @@ venv/
 
 # written by setuptools_scm
 **/_version.py
+
+brainglobe_workflows/cellfinder/notebooks/
+**/**/.cellfinder_workflows

From 995a7838b90943600e5c1a6e74af4ed7efa8a614 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 21 Sep 2023 12:02:16 +0100
Subject: [PATCH 02/24] add asv benchmarks structure

---
 .gitignore               |   4 +-
 MANIFEST.in              |   5 +
 asv.conf.json            | 194 +++++++++++++++++++++++++++++++++++++++
 benchmarks/__init__.py   |   0
 benchmarks/benchmarks.py |  27 ++++++
 5 files changed, 227 insertions(+), 3 deletions(-)
 create mode 100644 asv.conf.json
 create mode 100644 benchmarks/__init__.py
 create mode 100644 benchmarks/benchmarks.py

diff --git a/.gitignore b/.gitignore
index 4ef8a0a4..ac4340c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,6 +81,4 @@ venv/
 
 # written by setuptools_scm
 **/_version.py
-
-brainglobe_workflows/cellfinder/notebooks/
-**/**/.cellfinder_workflows
+benchmarks/results/*
diff --git a/MANIFEST.in b/MANIFEST.in
index 34cf45e6..27538bc8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -9,3 +9,8 @@ recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 recursive-exclude docs *
 recursive-exclude tests *
+
+include *.json
+recursive-include benchmarks *.json
+recursive-include benchmarks *.py
+recursive-exclude benchmarks/results *
diff --git a/asv.conf.json b/asv.conf.json
new file mode 100644
index 00000000..3417dec3
--- /dev/null
+++ b/asv.conf.json
@@ -0,0 +1,194 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "brainglobe_workflows",
+
+    // The project's homepage
+    "project_url": "https://github.com/brainglobe/brainglobe-workflows",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": ".",
+
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // Customizable commands for building the project.
+    // See asv.conf.json documentation.
+    // To build the package using pyproject.toml (PEP518), uncomment the following lines
+    "build_command": [
+        "python -m pip install build",
+        "python -m build",
+        "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    ],
+    // To build the package using setuptools and a setup.py file, uncomment the following lines
+    // "build_command": [
+    //     "python setup.py build",
+    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    // ],
+
+    // Customizable commands for installing and uninstalling the project.
+    // See asv.conf.json documentation.
+    "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
+    "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    "branches": ["main"], // for git
+    // "branches": ["default"],    // for mercurial
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    // "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv", "mamba" (above 3.8)
+    // or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "https://github.com/brainglobe/brainglobe-workflows/commit/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    "pythons": ["3.8"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    "conda_channels": ["conda-forge", "defaults"],
+
+    // A conda environment file that is used for environment creation.
+    // "conda_environment_file": "environment.yml",
+
+    // The matrix of dependencies to test.  Each key of the "req"
+    // requirements dictionary is the name of a package (in PyPI) and
+    // the values are version numbers.  An empty list or empty string
+    // indicates to just test against the default (latest)
+    // version. null indicates that the package is to not be
+    // installed. If the package to be tested is only available from
+    // PyPi, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed
+    // via pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
+    // environment variables to pass to build and benchmark commands.
+    // An environment will be created for every combination of the
+    // cartesian product of the "@env" variables in this matrix.
+    // Variables in "@env_nobuild" will be passed to every environment
+    // during the benchmark phase, but will not trigger creation of
+    // new environments.  A value of ``null`` means that the variable
+    // will not be set for the current combination.
+    //
+    // "matrix": {
+    //     "req": {
+    //         "numpy": ["1.6", "1.7"],
+    //         "six": ["", null],  // test with and without six installed
+    //         "pip+emcee": [""]   // emcee is only available for install with pip.
+    //     },
+    //     "env": {"ENV_VAR_1": ["val1", "val2"]},
+    //     "env_nobuild": {"ENV_VAR_2": ["val3", null]},
+    // },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    // - req
+    //     Required packages
+    // - env
+    //     Environment variables
+    // - env_nobuild
+    //     Non-build environment variables
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda
+    //     {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": "benchmarks/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": "benchmarks/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // },
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // },
+}
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
new file mode 100644
index 00000000..9fe21ab6
--- /dev/null
+++ b/benchmarks/benchmarks.py
@@ -0,0 +1,27 @@
+# Write the benchmarking functions here.
+# See "Writing benchmarks" in the asv docs for more information.
+
+
+class TimeSuite:
+    """
+    An example benchmark that times the performance of various kinds
+    of iterating over dictionaries in Python.
+    """
+
+    def setup(self):
+        self.d = {}
+        for x in range(500):
+            self.d[x] = None
+
+    def time_keys(self):
+        for key in self.d.keys():
+            pass
+
+    def time_values(self):
+        for value in self.d.values():
+            pass
+
+    def time_range(self):
+        d = self.d
+        for key in range(500):
+            d[key]

From c342dbdc4554adb6e5302d24386cf08c5fc407af Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 21 Sep 2023 15:33:01 +0100
Subject: [PATCH 03/24] edit asv: add force reinstall to pip command and use
 package from branch

---
 asv.conf.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv.conf.json b/asv.conf.json
index 3417dec3..67ce0a69 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -34,12 +34,12 @@
 
     // Customizable commands for installing and uninstalling the project.
     // See asv.conf.json documentation.
-    "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
+    "install_command": ["in-dir={env_dir} python -mpip install --force-reinstall {wheel_file}"],
     "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["main"], // for git
+    "branches": ["smg/cellfinder-workflow"], // for git
     // "branches": ["default"],    // for mercurial
 
     // The DVCS being used.  If not set, it will be automatically
@@ -65,7 +65,7 @@
 
     // The Pythons you'd like to test against.  If not provided, defaults
     // to the current version of Python used to run `asv`.
-    "pythons": ["3.8"],
+    "pythons": ["3.10"],
 
     // The list of conda channel names to be searched for benchmark
     // dependency packages in the specified order

From 19ffcc49503d3e23275b5c81a7cc2eff8a4f30a6 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 21 Sep 2023 15:45:09 +0100
Subject: [PATCH 04/24] add cellfinder core dependency


From 962e129d60585307145e4ce749c525a31392aae0 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 22 Sep 2023 12:13:39 +0100
Subject: [PATCH 05/24] add first draft of workflow benchmarks

---
 benchmarks/benchmarks.py           |  27 -------
 benchmarks/cellfinder/__init__.py  |   0
 benchmarks/cellfinder/workflows.py | 114 +++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 27 deletions(-)
 delete mode 100644 benchmarks/benchmarks.py
 create mode 100644 benchmarks/cellfinder/__init__.py
 create mode 100644 benchmarks/cellfinder/workflows.py

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
deleted file mode 100644
index 9fe21ab6..00000000
--- a/benchmarks/benchmarks.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Write the benchmarking functions here.
-# See "Writing benchmarks" in the asv docs for more information.
-
-
-class TimeSuite:
-    """
-    An example benchmark that times the performance of various kinds
-    of iterating over dictionaries in Python.
-    """
-
-    def setup(self):
-        self.d = {}
-        for x in range(500):
-            self.d[x] = None
-
-    def time_keys(self):
-        for key in self.d.keys():
-            pass
-
-    def time_values(self):
-        for value in self.d.values():
-            pass
-
-    def time_range(self):
-        d = self.d
-        for key in range(500):
-            d[key]
diff --git a/benchmarks/cellfinder/__init__.py b/benchmarks/cellfinder/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
new file mode 100644
index 00000000..3a43e1c5
--- /dev/null
+++ b/benchmarks/cellfinder/workflows.py
@@ -0,0 +1,114 @@
+# Write the benchmarking functions here.
+# See "Writing benchmarks" in the asv docs for more information.
+
+from brainglobe_utils.IO.cells import save_cells
+from cellfinder_core.main import main as cellfinder_run
+from cellfinder_core.tools.IO import read_with_dask
+
+from brainglobe_workflows.cellfinder.cellfinder_main import (
+    Workflow,
+    workflow_from_cellfinder_run,
+)
+
+
+class TimeBenchmark:
+    """
+    Base class with sensible options
+    See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
+
+    The sample_time, number, repeat, and timer attributes can be adjusted in
+    the setup() routine, which can be useful for parameterized benchmarks
+
+    Other attributes for time benchmarks not specified in this class:
+    - number: the number of iterations in each sample. If number is specified,
+    sample_time is ignored. Note that setup and teardown are not run between
+    iterations: setup runs first, then the timed benchmark routine is called
+    number times, and after that teardown runs.
+    - timer: timeit.default_timer by default
+
+    Notes about some of the default attributes for time benchmarks:
+      - warmup_time: asv will spend this time (in seconds) in calling the
+        benchmarked function repeatedly, before starting to run the
+        actual benchmark
+
+      - repeat: when not provided (repeat set to 0):
+        - if rounds==1 the default is
+            (min_repeat, max_repeat, max_time) = (1, 10, 20.0),
+        - if rounds != 1 the default is
+            (min_repeat, max_repeat, max_time) = (1, 5, 10.0)
+
+      - sample: `number` is determined so that each sample takes
+        approx sample_time=10ms
+    """
+
+    timeout = 60  # default: 60
+    version = None  # default: None (i.e. hash of source code)
+
+    # time benchmarks
+    warmup_time = 0.1  # default:0.1;
+    rounds = 2  # default:2
+    repeat = 0  # default: 0 samples to collect per round.
+    sample_time = 10  # default 10 ms; `
+    min_run_count = 2  # default:2
+
+
+# I dont know how to have a common part for the setup fn for all
+# without doing cahce
+def setup_cache():
+    cfg = Workflow()
+    cfg.setup_parameters()
+    cfg.setup_input_data()
+    return cfg
+
+
+class TimeFullWorkflow(TimeBenchmark):
+    def time_workflow_from_cellfinder_run(self, cfg):
+        workflow_from_cellfinder_run(cfg)
+
+    #  def teardown(self, model_name): -- after each benchmark or after all?
+    #     # remove .cellfinder-benchmarks dir after benchmarks
+    #     shutil.rmtree(self.install_path)
+
+
+class TimeReadInputDask(TimeBenchmark):
+    def time_read_signal_w_dask(self, cfg):
+        read_with_dask(cfg.signal_parent_dir)
+
+    def time_read_background_w_dask(self, cfg):
+        read_with_dask(cfg.background_parent_dir)
+
+    #  def teardown(self, model_name): -- after each benchmark or after all?
+    #     # remove .cellfinder-benchmarks dir after benchmarks
+    #     shutil.rmtree(self.install_path)
+
+
+class TimeCellfinderRun(TimeBenchmark):
+    def setup(self, cfg):
+        self.signal_array = read_with_dask(cfg.signal_parent_dir)
+        self.background_array = read_with_dask(cfg.background_parent_dir)
+
+    def time_cellfinder_run(self, cfg):
+        cellfinder_run(
+            self.signal_array, self.background_array, cfg.voxel_sizes
+        )
+
+    #  def teardown(self, model_name): -- after each benchmark or after all?
+    #     # remove .cellfinder-benchmarks dir after benchmarks
+    #     shutil.rmtree(self.install_path)
+
+
+class TimeSaveCells(TimeBenchmark):
+    def setup(self, cfg):
+        signal_array = read_with_dask(cfg.signal_parent_dir)
+        background_array = read_with_dask(cfg.background_parent_dir)
+
+        self.detected_cells = cellfinder_run(
+            signal_array, background_array, cfg.voxel_sizes
+        )
+
+    def time_save_cells(self, cfg):
+        save_cells(self.detected_cells, cfg.detected_cells_filepath)
+
+    #  def teardown(self, model_name): -- after each benchmark or after all?
+    #     # remove .cellfinder-benchmarks dir after benchmarks
+    #     shutil.rmtree(self.install_path)

From 8f231a13e6c7b488e50088c79efd0ae993bd5448 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 22 Sep 2023 14:53:26 +0100
Subject: [PATCH 06/24] replace setup_cache with setup classmethod approach. add
 teardown function.

---
 benchmarks/cellfinder/workflows.py | 72 ++++++++++++------------------
 1 file changed, 28 insertions(+), 44 deletions(-)

diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
index 3a43e1c5..b00189cb 100644
--- a/benchmarks/cellfinder/workflows.py
+++ b/benchmarks/cellfinder/workflows.py
@@ -1,5 +1,4 @@
-# Write the benchmarking functions here.
-# See "Writing benchmarks" in the asv docs for more information.
+import shutil
 
 from brainglobe_utils.IO.cells import save_cells
 from cellfinder_core.main import main as cellfinder_run
@@ -43,72 +42,57 @@ class TimeBenchmark:
 
     timeout = 60  # default: 60
     version = None  # default: None (i.e. hash of source code)
-
-    # time benchmarks
     warmup_time = 0.1  # default:0.1;
     rounds = 2  # default:2
     repeat = 0  # default: 0 samples to collect per round.
     sample_time = 10  # default 10 ms; `
     min_run_count = 2  # default:2
 
+    @classmethod
+    def setup(self):
+        cfg = Workflow()
+        cfg.setup_parameters()
+        cfg.setup_input_data()
+        self.cfg = cfg
 
-# I dont know how to have a common part for the setup fn for all
-# without doing cahce
-def setup_cache():
-    cfg = Workflow()
-    cfg.setup_parameters()
-    cfg.setup_input_data()
-    return cfg
+    def teardown(self):
+        shutil.rmtree(self.cfg.install_path)
 
 
 class TimeFullWorkflow(TimeBenchmark):
-    def time_workflow_from_cellfinder_run(self, cfg):
-        workflow_from_cellfinder_run(cfg)
-
-    #  def teardown(self, model_name): -- after each benchmark or after all?
-    #     # remove .cellfinder-benchmarks dir after benchmarks
-    #     shutil.rmtree(self.install_path)
+    def time_workflow_from_cellfinder_run(self):
+        workflow_from_cellfinder_run(self.cfg)
 
 
 class TimeReadInputDask(TimeBenchmark):
-    def time_read_signal_w_dask(self, cfg):
-        read_with_dask(cfg.signal_parent_dir)
-
-    def time_read_background_w_dask(self, cfg):
-        read_with_dask(cfg.background_parent_dir)
+    def time_read_signal_w_dask(self):
+        read_with_dask(self.cfg.signal_parent_dir)
 
-    #  def teardown(self, model_name): -- after each benchmark or after all?
-    #     # remove .cellfinder-benchmarks dir after benchmarks
-    #     shutil.rmtree(self.install_path)
+    def time_read_background_w_dask(self):
+        read_with_dask(self.cfg.background_parent_dir)
 
 
 class TimeCellfinderRun(TimeBenchmark):
-    def setup(self, cfg):
-        self.signal_array = read_with_dask(cfg.signal_parent_dir)
-        self.background_array = read_with_dask(cfg.background_parent_dir)
+    def setup(self):
+        TimeBenchmark.setup()
+        self.signal_array = read_with_dask(self.cfg.signal_parent_dir)
+        self.background_array = read_with_dask(self.cfg.background_parent_dir)
 
-    def time_cellfinder_run(self, cfg):
+    def time_cellfinder_run(self):
         cellfinder_run(
-            self.signal_array, self.background_array, cfg.voxel_sizes
+            self.signal_array, self.background_array, self.cfg.voxel_sizes
         )
 
-    #  def teardown(self, model_name): -- after each benchmark or after all?
-    #     # remove .cellfinder-benchmarks dir after benchmarks
-    #     shutil.rmtree(self.install_path)
-
 
 class TimeSaveCells(TimeBenchmark):
-    def setup(self, cfg):
-        signal_array = read_with_dask(cfg.signal_parent_dir)
-        background_array = read_with_dask(cfg.background_parent_dir)
+    def setup(self):
+        TimeBenchmark.setup()
+        signal_array = read_with_dask(self.cfg.signal_parent_dir)
+        background_array = read_with_dask(self.cfg.background_parent_dir)
 
         self.detected_cells = cellfinder_run(
-            signal_array, background_array, cfg.voxel_sizes
+            signal_array, background_array, self.cfg.voxel_sizes
         )
 
-    def time_save_cells(self, cfg):
-        save_cells(self.detected_cells, cfg.detected_cells_filepath)
-
-    #  def teardown(self, model_name): -- after each benchmark or after all?
-    #     # remove .cellfinder-benchmarks dir after benchmarks
-    #     shutil.rmtree(self.install_path)
+    def time_save_cells(self):
+        save_cells(self.detected_cells, self.cfg.detected_cells_filepath)

From 2c503d50b83984aeab8d049f4e8d9af7d77999eb Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 22 Sep 2023 14:54:55 +0100
Subject: [PATCH 07/24] small edits to comments on default values

---
 benchmarks/cellfinder/workflows.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
index b00189cb..94dc876a 100644
--- a/benchmarks/cellfinder/workflows.py
+++ b/benchmarks/cellfinder/workflows.py
@@ -40,12 +40,12 @@ class TimeBenchmark:
         approx sample_time=10ms
     """
 
-    timeout = 60  # default: 60
+    timeout = 60  # default: 60 s
     version = None  # default: None (i.e. hash of source code)
     warmup_time = 0.1  # default:0.1;
     rounds = 2  # default:2
-    repeat = 0  # default: 0 samples to collect per round.
-    sample_time = 10  # default 10 ms; `
+    repeat = 0  # default: 0
+    sample_time = 10  # default: 10 ms; `
     min_run_count = 2  # default:2
 
     @classmethod

From 675de5cb568b1b4c8c43d1cf8e17a815c8aefe00 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 22 Sep 2023 15:16:35 +0100
Subject: [PATCH 08/24] express sample_time in seconds

---
 benchmarks/cellfinder/workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
index 94dc876a..309e91f1 100644
--- a/benchmarks/cellfinder/workflows.py
+++ b/benchmarks/cellfinder/workflows.py
@@ -45,7 +45,7 @@ class TimeBenchmark:
     warmup_time = 0.1  # default:0.1;
     rounds = 2  # default:2
     repeat = 0  # default: 0
-    sample_time = 10  # default: 10 ms; `
+    sample_time = 0.01  # default: 10 ms = 0.01 s;
     min_run_count = 2  # default:2
 
     @classmethod

From 5bee73c952b106e63db9271a677ea65c7bf14d1f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:02:57 +0100
Subject: [PATCH 09/24] edit docstring

---
 benchmarks/cellfinder/workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
index 309e91f1..9e7b6ea5 100644
--- a/benchmarks/cellfinder/workflows.py
+++ b/benchmarks/cellfinder/workflows.py
@@ -36,7 +36,7 @@ class TimeBenchmark:
         - if rounds != 1 the default is
             (min_repeat, max_repeat, max_time) = (1, 5, 10.0)
 
-      - sample: `number` is determined so that each sample takes
+      - sample_time: `number` is determined so that each sample takes
         approx sample_time=10ms
     """
 

From 34c0ef05065d5bef3fc6d4bce64c1239e918db85 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:11:31 +0100
Subject: [PATCH 10/24] dependency injection for passing custom CLI arguments
 during benchmarking

---
 brainglobe_workflows/cellfinder/cellfinder_main.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py
index 04ec7663..dff60dff 100644
--- a/brainglobe_workflows/cellfinder/cellfinder_main.py
+++ b/brainglobe_workflows/cellfinder/cellfinder_main.py
@@ -334,7 +334,7 @@ def retrieve_input_data(config: CellfinderConfig) -> CellfinderConfig:
     return config
 
 
-def parse_cli_arguments() -> argparse.Namespace:
+def parse_cli_arguments(argv=None) -> argparse.Namespace:
     """Define argument parser for cellfinder
     workflow script.
 
@@ -347,6 +347,11 @@ def parse_cli_arguments() -> argparse.Namespace:
     args : argparse.Namespace
         command line input arguments parsed
     """
+
+    # command line input arguments: sys.argv in most cases except for testing
+    # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching
+    argv = argv or sys.argv
+
     # initialise argument parser
     parser = argparse.ArgumentParser(
         description=(
@@ -368,7 +373,7 @@ def parse_cli_arguments() -> argparse.Namespace:
     )
 
     # build parser object
-    args = parser.parse_args()
+    args = parser.parse_args(argv[1:])
 
     # print error if required arguments not provided
     if not args.config:

From 1b9c806a5e841ce5c7bc26d72cd2c6166adef6d7 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:45:49 +0100
Subject: [PATCH 11/24] change how sys.argv is passed

---
 brainglobe_workflows/cellfinder/cellfinder_main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py
index dff60dff..b76500b7 100644
--- a/brainglobe_workflows/cellfinder/cellfinder_main.py
+++ b/brainglobe_workflows/cellfinder/cellfinder_main.py
@@ -350,7 +350,7 @@ def parse_cli_arguments(argv=None) -> argparse.Namespace:
 
     # command line input arguments: sys.argv in most cases except for testing
     # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching
-    argv = argv or sys.argv
+    argv = argv or sys.argv[1:]
 
     # initialise argument parser
     parser = argparse.ArgumentParser(
@@ -373,7 +373,7 @@ def parse_cli_arguments(argv=None) -> argparse.Namespace:
     )
 
     # build parser object
-    args = parser.parse_args(argv[1:])
+    args = parser.parse_args(argv)
 
     # print error if required arguments not provided
     if not args.config:

From 666df24422a104103b4e31bc3d2a7d5fd23dc95c Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:58:52 +0100
Subject: [PATCH 12/24] define logger in a scope common to all setup steps

---
 .../cellfinder/cellfinder_main.py             | 533 +++++++++---------
 1 file changed, 271 insertions(+), 262 deletions(-)

diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py
index b76500b7..fd19db34 100644
--- a/brainglobe_workflows/cellfinder/cellfinder_main.py
+++ b/brainglobe_workflows/cellfinder/cellfinder_main.py
@@ -96,33 +96,272 @@ class CellfinderConfig:
     detected_cells_path: Pathlike = ""
 
 
-def setup_logger() -> logging.Logger:
-    """Setup a logger for this script
+def setup(argv=None) -> CellfinderConfig:
+    def parse_cli_arguments(argv_) -> argparse.Namespace:
+        """Define argument parser for cellfinder
+        workflow script.
+
+        It expects a path to a json file with the
+        parameters required to run the workflow.
+        If none is provided, the default config file is used.
+
+        Returns
+        -------
+        args : argparse.Namespace
+            command line input arguments parsed
+        """
+        # initialise argument parser
+        parser = argparse.ArgumentParser(
+            description=(
+                "To launch the workflow with "
+                "a specific set of input parameters, run: "
+                "`python cellfinder_main.py --config path/to/config.json`"
+                "where path/to/input/config.json is the json file "
+                "containing the workflow parameters."
+            )
+        )
+        # add arguments
+        parser.add_argument(
+            "-c",
+            "--config",
+            default=str(DEFAULT_JSON_CONFIG_PATH),
+            type=str,
+            metavar="CONFIG",  # a name for usage messages
+            help="",
+        )
 
-    The logger's level is set to DEBUG, and it
-    is linked to a handler that writes to the
-    console and whose level is
+        # build parser object
+        args = parser.parse_args(argv_)
 
-    Returns
-    -------
-    logging.Logger
-        a logger object
-    """
-    # define handler that writes to stdout
-    console_handler = logging.StreamHandler(sys.stdout)
-    console_format = logging.Formatter("%(name)s %(levelname)s: %(message)s")
-    console_handler.setFormatter(console_format)
+        # print error if required arguments not provided
+        if not args.config:
+            logger.error("Paths to input config not provided.")
+            parser.print_help()
+
+        return args
+
+    def setup_logger() -> logging.Logger:
+        """Setup a logger for this script
+
+        The logger's level is set to DEBUG, and it is linked to a
+        handler that writes to the console (stdout). A new handler
+        is added to the logger on every call.
+
+        Returns
+        -------
+        logging.Logger
+            a logger object
+        """
+        # define handler that writes to stdout
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_format = logging.Formatter(
+            "%(name)s %(levelname)s: %(message)s"
+        )
+        console_handler.setFormatter(console_format)
+
+        # define logger and link to handler
+        logger = logging.getLogger(
+            __name__
+        )  # if imported as a module, the logger is named after the module
+        logger.setLevel(logging.DEBUG)
+        logger.addHandler(console_handler)
+        return logger
+
+    def setup_workflow(input_config_path: Path) -> CellfinderConfig:
+        """Run setup steps prior to executing the workflow
+
+        These setup steps include:
+        - instantiating a CellfinderConfig object with the required parameters,
+        - checking if the input data exists locally, and fetching from
+          GIN repository otherwise,
+        - adding the path to the input data files to the config, and
+        - creating a timestamped directory for the output of the workflow if
+          it doesn't exist and adding its path to the config
+
+        Parameters
+        ----------
+        input_config_path : Path
+            path to the input config file
+
+        Returns
+        -------
+        config : CellfinderConfig
+            a dataclass whose attributes are the parameters
+            for running cellfinder.
+        """
+
+        # Check config file exists
+        assert input_config_path.exists()
+
+        # Instantiate a CellfinderConfig from the input json file
+        # (assumes config is json serializable)
+        with open(input_config_path) as cfg:
+            config_dict = json.load(cfg)
+        config = CellfinderConfig(**config_dict)
+
+        # Print info logs for status
+        logger.info(f"Input config read from {input_config_path}")
+        if input_config_path == DEFAULT_JSON_CONFIG_PATH:
+            logger.info("Using default config file")
+
+        # Retrieve and add lists of input data to the config if either
+        # the signal or the background list is not defined yet
+        if not (config.list_signal_files and config.list_background_files):
+            # build fullpaths to inputs
+            config.signal_dir_path = str(
+                Path(config.install_path)
+                / config.extract_dir_relative
+                / config.signal_subdir
+            )
+            config.background_dir_path = str(
+                Path(config.install_path)
+                / config.extract_dir_relative
+                / config.background_subdir
+            )
+            # retrieve data
+            config = retrieve_input_data(config)
+
+        # Create timestamped output directory if it doesn't exist
+        timestamp = datetime.datetime.now()
+        timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S")
+        output_path_timestamped = Path(config.install_path) / (
+            str(config.output_path_basename_relative) + timestamp_formatted
+        )
+        output_path_timestamped.mkdir(parents=True, exist_ok=True)
+
+        # Add output path and output file path to config
+        config.output_path = output_path_timestamped
+        config.detected_cells_path = (
+            config.output_path / config.detected_cells_filename
+        )
+
+        return config
+
+    def retrieve_input_data(config: CellfinderConfig) -> CellfinderConfig:
+        """
+        Adds the lists of input data files (signal and background)
+        to the config.
+
+        It first checks if the input data exists locally.
+        - If both directories (signal and background) exist, the lists of
+        signal and background files are added to the config.
+        - If exactly one of the input data directories is missing, an error
+        message is logged.
+        - If neither of them exist, the data is retrieved from the provided GIN
+        repository. If no URL or hash to GIN is provided, an error is shown.
+
+        Parameters
+        ----------
+        config : CellfinderConfig
+            a dataclass whose attributes are the parameters
+            for running cellfinder.
+
+        Returns
+        -------
+        config : CellfinderConfig
+            a dataclass whose attributes are the parameters
+            for running cellfinder.
+        """
+        # Check if input data (signal and background) exist locally.
+        # If both directories exist, get list of signal and background files
+        if (
+            Path(config.signal_dir_path).exists()
+            and Path(config.background_dir_path).exists()
+        ):
+            logger.info("Fetching input data from the local directories")
+
+            config.list_signal_files = [
+                f
+                for f in Path(config.signal_dir_path).resolve().iterdir()
+                if f.is_file()
+            ]
+            config.list_background_files = [
+                f
+                for f in Path(config.background_dir_path).resolve().iterdir()
+                if f.is_file()
+            ]
 
-    # define logger and link to handler
-    logger = logging.getLogger(
-        __name__
-    )  # if imported as a module, the logger is named after the module
-    logger.setLevel(logging.DEBUG)
-    logger.addHandler(console_handler)
-    return logger
+        # If exactly one of the input data directories is missing, print error
+        elif (
+            Path(config.signal_dir_path).resolve().exists()
+            or Path(config.background_dir_path).resolve().exists()
+        ):
+            if not Path(config.signal_dir_path).resolve().exists():
+                logger.error(
+                    f"The directory {config.signal_dir_path} does not exist"
+                )
+            else:
+                logger.error(
+                    f"The directory {config.background_dir_path} "
+                    "does not exist"
+                )
+
+        # If neither of them exist, retrieve data from GIN repository
+        else:
+            # check if GIN URL and hash are defined (log error otherwise)
+            if (not config.data_url) or (not config.data_hash):
+                logger.error(
+                    "Input data not found locally, and URL/hash to "
+                    "GIN repository not provided"
+                )
+
+            else:
+                # get list of files in GIN archive with pooch.retrieve
+                list_files_archive = pooch.retrieve(
+                    url=config.data_url,
+                    known_hash=config.data_hash,
+                    path=config.install_path,  # zip will be downloaded here
+                    progressbar=True,
+                    processor=pooch.Unzip(
+                        extract_dir=config.extract_dir_relative
+                        # path to unzipped dir,
+                        # *relative* to the path set in 'path'
+                    ),
+                )
+                logger.info(
+                    "Fetching input data from the provided GIN repository"
+                )
+
+                # Check signal and background parent directories exist now
+                assert Path(config.signal_dir_path).resolve().exists()
+                assert Path(config.background_dir_path).resolve().exists()
+
+                # Add signal files to config
+                config.list_signal_files = [
+                    f
+                    for f in list_files_archive
+                    if f.startswith(
+                        str(Path(config.signal_dir_path).resolve())
+                    )  # if str(config.signal_dir_path) in f
+                ]
+
+                # Add background files to config
+                config.list_background_files = [
+                    f
+                    for f in list_files_archive
+                    if f.startswith(
+                        str(Path(config.background_dir_path).resolve())
+                    )  # if str(config.background_dir_path) in f
+                ]
+
+        return config
+
+    # setup logger first: parse_cli_arguments logs errors through it
+    logger = setup_logger()
+
+    # parse command line input arguments:
+    # sys.argv in most cases except for testing
+    # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching
+    argv = argv or sys.argv[1:]
+    args = parse_cli_arguments(argv)
 
+    # run setup steps and return config
+    cfg = setup_workflow(Path(args.config))
 
-def run_workflow_from_cellfinder_run(config: CellfinderConfig):
+    return cfg
+
+
+def run_workflow_from_cellfinder_run(cfg: CellfinderConfig):
     """
     Run workflow based on the cellfinder_core.main.main()
     function.
@@ -131,265 +370,35 @@ def run_workflow_from_cellfinder_run(config: CellfinderConfig):
     1. Read the input signal and background data as two separate
        Dask arrays.
     2. Run the main cellfinder pipeline on the input Dask arrays,
-       with the parameters defined in the input configuration (config).
+       with the parameters defined in the input configuration (cfg).
     3. Save the detected cells as an xml file to the location specified in
-       the input configuration (config).
+       the input configuration (cfg).
 
     Parameters
     ----------
-    config : CellfinderConfig
+    cfg : CellfinderConfig
         a class with the required setup methods and parameters for
         the cellfinder workflow
     """
     # Read input data as Dask arrays
-    signal_array = read_with_dask(config.signal_dir_path)
-    background_array = read_with_dask(config.background_dir_path)
+    signal_array = read_with_dask(cfg.signal_dir_path)
+    background_array = read_with_dask(cfg.background_dir_path)
 
     # Run main analysis using `cellfinder_run`
     detected_cells = cellfinder_run(
-        signal_array, background_array, config.voxel_sizes
+        signal_array, background_array, cfg.voxel_sizes
     )
 
     # Save results to xml file
     save_cells(
         detected_cells,
-        config.detected_cells_path,
-    )
-
-
-def setup_workflow(input_config_path: Path) -> CellfinderConfig:
-    """Run setup steps prior to executing the workflow
-
-    These setup steps include:
-    - instantiating a CellfinderConfig object with the required parameters,
-    - checking if the input data exists locally, and fetching from
-      GIN repository otherwise,
-    - adding the path to the input data files to the config, and
-    - creating a timestamped directory for the output of the workflow if
-      it doesn't exist and adding its path to the config
-
-    Parameters
-    ----------
-    input_config_path : Path
-        path to the input config file
-
-    Returns
-    -------
-    config : CellfinderConfig
-        a dataclass whose attributes are the parameters
-        for running cellfinder.
-    """
-
-    # Check config file exists
-    assert input_config_path.exists()
-
-    # Instantiate a CellfinderConfig from the input json file
-    # (assumes config is json serializable)
-    with open(input_config_path) as c:
-        config_dict = json.load(c)
-    config = CellfinderConfig(**config_dict)
-
-    # Print info logs for status
-    logger.info(f"Input config read from {input_config_path}")
-    if input_config_path == DEFAULT_JSON_CONFIG_PATH:
-        logger.info("Using default config file")
-
-    # Retrieve and add lists of input data to the config,
-    # if these are defined yet
-    if not (config.list_signal_files and config.list_background_files):
-        # build fullpaths to inputs
-        config.signal_dir_path = str(
-            Path(config.install_path)
-            / config.extract_dir_relative
-            / config.signal_subdir
-        )
-        config.background_dir_path = str(
-            Path(config.install_path)
-            / config.extract_dir_relative
-            / config.background_subdir
-        )
-        # retrieve data
-        config = retrieve_input_data(config)
-
-    # Create timestamped output directory if it doesn't exist
-    timestamp = datetime.datetime.now()
-    timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S")
-    output_path_timestamped = Path(config.install_path) / (
-        str(config.output_path_basename_relative) + timestamp_formatted
-    )
-    output_path_timestamped.mkdir(parents=True, exist_ok=True)
-
-    # Add output path and output file path to config
-    config.output_path = output_path_timestamped
-    config.detected_cells_path = (
-        config.output_path / config.detected_cells_filename
-    )
-
-    return config
-
-
-def retrieve_input_data(config: CellfinderConfig) -> CellfinderConfig:
-    """
-    Adds the lists of input data files (signal and background) to the config.
-
-    It first checks if the input data exists locally.
-    - If both directories (signal and background) exist, the lists of signal
-      and background files are added to the config.
-    - If exactly one of the input data directories is missing, an error
-      message is logged.
-    - If neither of them exist, the data is retrieved from the provided GIN
-      repository. If no URL or hash to GIN is provided, an error is shown.
-
-    Parameters
-    ----------
-    config : CellfinderConfig
-        a dataclass whose attributes are the parameters
-        for running cellfinder.
-
-    Returns
-    -------
-    config : CellfinderConfig
-        a dataclass whose attributes are the parameters
-        for running cellfinder.
-    """
-    # Check if input data (signal and background) exist locally.
-    # If both directories exist, get list of signal and background files
-    if (
-        Path(config.signal_dir_path).exists()
-        and Path(config.background_dir_path).exists()
-    ):
-        logger.info("Fetching input data from the local directories")
-
-        config.list_signal_files = [
-            f
-            for f in Path(config.signal_dir_path).resolve().iterdir()
-            if f.is_file()
-        ]
-        config.list_background_files = [
-            f
-            for f in Path(config.background_dir_path).resolve().iterdir()
-            if f.is_file()
-        ]
-
-    # If exactly one of the input data directories is missing, print error
-    elif (
-        Path(config.signal_dir_path).resolve().exists()
-        or Path(config.background_dir_path).resolve().exists()
-    ):
-        if not Path(config.signal_dir_path).resolve().exists():
-            logger.error(
-                f"The directory {config.signal_dir_path} does not exist"
-            )
-        else:
-            logger.error(
-                f"The directory {config.background_dir_path} does not exist"
-            )
-
-    # If neither of them exist, retrieve data from GIN repository
-    else:
-        # check if GIN URL and hash are defined (log error otherwise)
-        if (not config.data_url) or (not config.data_hash):
-            logger.error(
-                "Input data not found locally, and URL/hash to "
-                "GIN repository not provided"
-            )
-
-        else:
-            # get list of files in GIN archive with pooch.retrieve
-            list_files_archive = pooch.retrieve(
-                url=config.data_url,
-                known_hash=config.data_hash,
-                path=config.install_path,  # zip will be downloaded here
-                progressbar=True,
-                processor=pooch.Unzip(
-                    extract_dir=config.extract_dir_relative
-                    # path to unzipped dir,
-                    # *relative* to the path set in 'path'
-                ),
-            )
-            logger.info("Fetching input data from the provided GIN repository")
-
-            # Check signal and background parent directories exist now
-            assert Path(config.signal_dir_path).resolve().exists()
-            assert Path(config.background_dir_path).resolve().exists()
-
-            # Add signal files to config
-            config.list_signal_files = [
-                f
-                for f in list_files_archive
-                if f.startswith(
-                    str(Path(config.signal_dir_path).resolve())
-                )  # if str(config.signal_dir_path) in f
-            ]
-
-            # Add background files to config
-            config.list_background_files = [
-                f
-                for f in list_files_archive
-                if f.startswith(
-                    str(Path(config.background_dir_path).resolve())
-                )  # if str(config.background_dir_path) in f
-            ]
-
-    return config
-
-
-def parse_cli_arguments(argv=None) -> argparse.Namespace:
-    """Define argument parser for cellfinder
-    workflow script.
-
-    It expects a path to a json file with the
-    parameters required to run the workflow.
-    If none is provided, the default
-
-    Returns
-    -------
-    args : argparse.Namespace
-        command line input arguments parsed
-    """
-
-    # command line input arguments: sys.argv in most cases except for testing
-    # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching
-    argv = argv or sys.argv[1:]
-
-    # initialise argument parser
-    parser = argparse.ArgumentParser(
-        description=(
-            "To launch the workflow with "
-            "a desired set of input parameters, run:"
-            " `python cellfinder_main.py --config path/to/input/config.json` "
-            "where path/to/input/config.json is the json file "
-            "containing the workflow parameters."
-        )
-    )
-    # add arguments
-    parser.add_argument(
-        "-c",
-        "--config",
-        default=str(DEFAULT_JSON_CONFIG_PATH),
-        type=str,
-        metavar="CONFIG",  # a name for usage messages
-        help="",
+        cfg.detected_cells_path,
     )
 
-    # build parser object
-    args = parser.parse_args(argv)
-
-    # print error if required arguments not provided
-    if not args.config:
-        logger.error("Paths to input config not provided.")
-        parser.print_help()
-
-    return args
-
 
 if __name__ == "__main__":
-    # setup logger
-    logger = setup_logger()
-
-    # parse command line arguments
-    args = parse_cli_arguments()
+    # run setup
+    cfg = setup()
 
     # run workflow
-    config = setup_workflow(Path(args.config))
-    run_workflow_from_cellfinder_run(config)  # only this will be benchmarked
+    run_workflow_from_cellfinder_run(cfg)  # only this will be benchmarked

From 94c9972ee80c6ea83461864ef68f8c33c6c33e87 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 19:41:34 +0100
Subject: [PATCH 13/24] skeleton for benchmarking workflow (WIP)

---
 benchmarks/cellfinder.py           | 106 +++++++++++++++++++++++++++++
 benchmarks/cellfinder/__init__.py  |   0
 benchmarks/cellfinder/workflows.py |  98 --------------------------
 benchmarks/example.py              |  23 +++++++
 4 files changed, 129 insertions(+), 98 deletions(-)
 create mode 100644 benchmarks/cellfinder.py
 delete mode 100644 benchmarks/cellfinder/__init__.py
 delete mode 100644 benchmarks/cellfinder/workflows.py
 create mode 100644 benchmarks/example.py

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
new file mode 100644
index 00000000..c18b0e44
--- /dev/null
+++ b/benchmarks/cellfinder.py
@@ -0,0 +1,106 @@
+import shutil
+
+from brainglobe_workflows.cellfinder.cellfinder_main import (
+    run_workflow_from_cellfinder_run,
+)
+from brainglobe_workflows.cellfinder.cellfinder_main import (
+    setup as setup_workflow,
+)
+
+
+class TimeBenchmark:
+    """
+    Base class with sensible options
+    See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
+
+    The sample_time, number, repeat, and timer attributes can be adjusted in
+    the setup() routine, which can be useful for parameterized benchmarks
+
+    Other attributes for time benchmarks not specified in this class:
+    - number: the number of iterations in each sample. If number is specified,
+    sample_time is ignored. Note that setup and teardown are not run between
+    iterations: setup runs first, then the timed benchmark routine is called
+    number times, and after that teardown runs.
+    - timer: timeit.default_timer by default
+
+    Notes about some of the default attributes for time benchmarks:
+      - warmup_time: asv will spend this time (in seconds) in calling the
+        benchmarked function repeatedly, before starting to run the
+        actual benchmark
+
+      - repeat: when not provided (repeat set to 0):
+        - if rounds==1 the default is
+            (min_repeat, max_repeat, max_time) = (1, 10, 20.0),
+        - if rounds != 1 the default is
+            (min_repeat, max_repeat, max_time) = (1, 5, 10.0)
+
+      - sample_time: `number` is determined so that each sample takes
+        approx sample_time=10ms
+    """
+
+    timeout = 600  # default: 60 s
+    version = None  # default: None (i.e. hash of source code)
+    warmup_time = 0.1  # default:0.1;
+    rounds = 2  # default:2
+    repeat = 0  # default: 0
+    sample_time = 0.01  # default: 10 ms = 0.01 s;
+    min_run_count = 2  # default:2
+
+    # @classmethod ---> this was to reuse this setup fn for other benchmarls
+    def setup_cache(
+        self,
+    ):  # ---> cache so that we dont download data several times?
+        # monkeypatch command line arguments
+        # run setup
+        cfg = setup_workflow(
+            [
+                "--config",
+                "/Users/sofia/Documents_local/project_BrainGlobe_workflows/"
+                "brainglobe-workflows/brainglobe_workflows/cellfinder/default_config.json",
+            ]
+        )
+        self.cfg = cfg
+
+    def teardown(self):
+        shutil.rmtree(self.cfg.install_path)
+
+
+class TimeFullWorkflow(TimeBenchmark):
+    def time_workflow_from_cellfinder_run(self):
+        run_workflow_from_cellfinder_run(self.cfg)
+
+
+# class TimeReadInputDask(TimeBenchmark):
+#     def time_read_signal_w_dask(self):
+#         read_with_dask(self.cfg.signal_parent_dir)
+
+#     def time_read_background_w_dask(self):
+#         read_with_dask(self.cfg.background_parent_dir)
+
+
+# class TimeCellfinderRun(TimeBenchmark):
+#     def setup(self):
+#         TimeBenchmark.setup()
+#         self.signal_array = read_with_dask(self.cfg.signal_parent_dir)
+#         self.background_array = read_with_dask(
+#           self.cfg.background_parent_dir
+#         )
+
+#     def time_cellfinder_run(self):
+#         cellfinder_run(
+#             self.signal_array, self.background_array, self.cfg.voxel_sizes
+#         )
+
+
+# class TimeSaveCells(TimeBenchmark):
+#     def setup(self):
+#         TimeBenchmark.setup()
+#         signal_array = read_with_dask(self.cfg.signal_parent_dir)
+#         background_array = read_with_dask(self.cfg.background_parent_dir)
+
+#         self.detected_cells = cellfinder_run(
+#             signal_array, background_array, self.cfg.voxel_sizes
+#         )
+
+#     def time_save_cells(self):
+#         save_cells(self.detected_cells, self.cfg.detected_cells_filepath)
diff --git a/benchmarks/cellfinder/__init__.py b/benchmarks/cellfinder/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmarks/cellfinder/workflows.py b/benchmarks/cellfinder/workflows.py
deleted file mode 100644
index 9e7b6ea5..00000000
--- a/benchmarks/cellfinder/workflows.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import shutil
-
-from brainglobe_utils.IO.cells import save_cells
-from cellfinder_core.main import main as cellfinder_run
-from cellfinder_core.tools.IO import read_with_dask
-
-from brainglobe_workflows.cellfinder.cellfinder_main import (
-    Workflow,
-    workflow_from_cellfinder_run,
-)
-
-
-class TimeBenchmark:
-    """
-    Base class with sensible options
-    See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
-
-    The sample_time, number, repeat, and timer attributes can be adjusted in
-    the setup() routine, which can be useful for parameterized benchmarks
-
-    Other attributes for time benchmarks not specified in this class:
-    - number: the number of iterations in each sample. If number is specified,
-    sample_time is ignored. Note that setup and teardown are not run between
-    iterations: setup runs first, then the timed benchmark routine is called
-    number times, and after that teardown runs.
-    - timer: timeit.default_timer by default
-
-    Notes about some of the default attributes for time benchmarks:
-      - warmup_time: asv will spend this time (in seconds) in calling the
-        benchmarked function repeatedly, before starting to run the
-        actual benchmark
-
-      - repeat: when not provided (repeat set to 0):
-        - if rounds==1 the default is
-            (min_repeat, max_repeat, max_time) = (1, 10, 20.0),
-        - if rounds != 1 the default is
-            (min_repeat, max_repeat, max_time) = (1, 5, 10.0)
-
-      - sample_time: `number` is determined so that each sample takes
-        approx sample_time=10ms
-    """
-
-    timeout = 60  # default: 60 s
-    version = None  # default: None (i.e. hash of source code)
-    warmup_time = 0.1  # default:0.1;
-    rounds = 2  # default:2
-    repeat = 0  # default: 0
-    sample_time = 0.01  # default: 10 ms = 0.01 s;
-    min_run_count = 2  # default:2
-
-    @classmethod
-    def setup(self):
-        cfg = Workflow()
-        cfg.setup_parameters()
-        cfg.setup_input_data()
-        self.cfg = cfg
-
-    def teardown(self):
-        shutil.rmtree(self.cfg.install_path)
-
-
-class TimeFullWorkflow(TimeBenchmark):
-    def time_workflow_from_cellfinder_run(self):
-        workflow_from_cellfinder_run(self.cfg)
-
-
-class TimeReadInputDask(TimeBenchmark):
-    def time_read_signal_w_dask(self):
-        read_with_dask(self.cfg.signal_parent_dir)
-
-    def time_read_background_w_dask(self):
-        read_with_dask(self.cfg.background_parent_dir)
-
-
-class TimeCellfinderRun(TimeBenchmark):
-    def setup(self):
-        TimeBenchmark.setup()
-        self.signal_array = read_with_dask(self.cfg.signal_parent_dir)
-        self.background_array = read_with_dask(self.cfg.background_parent_dir)
-
-    def time_cellfinder_run(self):
-        cellfinder_run(
-            self.signal_array, self.background_array, self.cfg.voxel_sizes
-        )
-
-
-class TimeSaveCells(TimeBenchmark):
-    def setup(self):
-        TimeBenchmark.setup()
-        signal_array = read_with_dask(self.cfg.signal_parent_dir)
-        background_array = read_with_dask(self.cfg.background_parent_dir)
-
-        self.detected_cells = cellfinder_run(
-            signal_array, background_array, self.cfg.voxel_sizes
-        )
-
-    def time_save_cells(self):
-        save_cells(self.detected_cells, self.cfg.detected_cells_filepath)
diff --git a/benchmarks/example.py b/benchmarks/example.py
new file mode 100644
index 00000000..3556e449
--- /dev/null
+++ b/benchmarks/example.py
@@ -0,0 +1,23 @@
+class TimeSuite:
+    """
+    An example benchmark that times the performance of various kinds
+    of iterating over dictionaries in Python.
+    """
+
+    def setup(self):
+        self.d = {}
+        for x in range(500):
+            self.d[x] = None
+
+    def time_keys(self):
+        for key in self.d.keys():
+            pass
+
+    def time_values(self):
+        for value in self.d.values():
+            pass
+
+    def time_range(self):
+        d = self.d
+        for key in range(500):
+            d[key]

From e455546b085130571d6ae30a080413c38b9a6a70 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 9 Oct 2023 19:53:07 +0100
Subject: [PATCH 14/24] replace setup_cache for cache (WIP)

---
 benchmarks/cellfinder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index c18b0e44..a23b8c83 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -46,8 +46,8 @@ class TimeBenchmark:
     sample_time = 0.01  # default: 10 ms = 0.01 s;
     min_run_count = 2  # default:2
 
-    # @classmethod ---> this was to reuse this setup fn for other benchmarls
-    def setup_cache(
+    @classmethod  # ---> this was to reuse this setup fn for other benchmarks
+    def setup(
         self,
     ):  # ---> cache so that we dont download data several times?
         # monkeypatch command line arguments

From f1f0c9c29057fc55982a7a29739395d6ef8c2329 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Oct 2023 14:30:50 +0100
Subject: [PATCH 15/24] define setup_cache to download the data

---
 benchmarks/cellfinder.py | 73 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 9 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index a23b8c83..a1007768 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -1,15 +1,22 @@
+import json
 import shutil
+from pathlib import Path
+
+import pooch
 
 from brainglobe_workflows.cellfinder.cellfinder_main import (
+    CellfinderConfig,
     run_workflow_from_cellfinder_run,
 )
 from brainglobe_workflows.cellfinder.cellfinder_main import (
-    setup as setup_workflow,
+    setup as setup_cellfinder_workflow,
 )
 
 
-class TimeBenchmark:
+class TimeBenchmarkPrepGIN:
     """
+    Setup_cache function downloads the data from GIN
+
     Base class with sensible options
     See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
 
@@ -46,26 +53,74 @@ class TimeBenchmark:
     sample_time = 0.01  # default: 10 ms = 0.01 s;
     min_run_count = 2  # default:2
 
-    @classmethod  # ---> this was to reuse this setup fn for other benchmarks
-    def setup(
+    input_config_path = (
+        "/Users/sofia/Documents_local/project_BrainGlobe_workflows/"
+        "brainglobe-workflows/brainglobe_workflows/cellfinder/default_config.json"
+    )
+
+    def setup_cache(
         self,
     ):  # ---> cache so that we dont download data several times?
+        """
+        We force a download of the data here
+
+        setup_cache method only performs the setup calculation once and
+        then caches the result to disk.
+
+        It is run only once also for repeated benchmarks and profiling.
+        """
+        print("RUN SETUP CACHE")
+        # download the data here?
+        # Check config file exists
+        assert Path(self.input_config_path).exists()
+
+        # Instantiate a CellfinderConfig from the input json file
+        # (assumes config is json serializable)
+        with open(self.input_config_path) as cfg:
+            config_dict = json.load(cfg)
+        config = CellfinderConfig(**config_dict)
+
+        # download data
+        # get list of files in GIN archive with pooch.retrieve
+        _ = pooch.retrieve(
+            url=config.data_url,
+            known_hash=config.data_hash,
+            path=config.install_path,
+            progressbar=True,
+            processor=pooch.Unzip(extract_dir=config.extract_dir_relative),
+        )
+
+        # paths to input data should now exist in config
+        assert Path(config.signal_dir_path).exists()
+        assert Path(config.background_dir_path).exists()
+
+        return
+
+    def setup(self):
+        """ """
         # monkeypatch command line arguments
         # run setup
-        cfg = setup_workflow(
+        print("RUN SETUP")
+        cfg = setup_cellfinder_workflow(
             [
                 "--config",
-                "/Users/sofia/Documents_local/project_BrainGlobe_workflows/"
-                "brainglobe-workflows/brainglobe_workflows/cellfinder/default_config.json",
+                self.input_config_path,  # ----should work without path too!
             ]
         )
         self.cfg = cfg
 
     def teardown(self):
-        shutil.rmtree(self.cfg.install_path)
+        """
+        Remove the cellfinder benchmarks cache directory
+        (typically .cellfinder_benchmarks)
+        """
+        print("RUN TEARDOWN")
+        shutil.rmtree(
+            Path(self.cfg.output_path).resolve()
+        )  # ---- remove all but input data? i.e., remove output only
 
 
-class TimeFullWorkflow(TimeBenchmark):
+class TimeFullWorkflow(TimeBenchmarkPrepGIN):
     def time_workflow_from_cellfinder_run(self):
         run_workflow_from_cellfinder_run(self.cfg)
 

From 1dc352fb6e81e8906b694e88291f2ec07af21066 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:25:09 +0100
Subject: [PATCH 16/24] add docstrings and comments

---
 benchmarks/cellfinder.py | 130 ++++++++++++++++++++++++---------------
 1 file changed, 79 insertions(+), 51 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index a1007768..6b818ec0 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -15,62 +15,89 @@
 
 class TimeBenchmarkPrepGIN:
     """
-    Setup_cache function downloads the data from GIN
 
-    Base class with sensible options
-    See https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
-
-    The sample_time, number, repeat, and timer attributes can be adjusted in
-    the setup() routine, which can be useful for parameterized benchmarks
-
-    Other attributes for time benchmarks not specified in this class:
-    - number: the number of iterations in each sample. If number is specified,
-    sample_time is ignored. Note that setup and teardown are not run between
-    iterations: setup runs first, then the timed benchmark routine is called
-    number times, and after that teardown runs.
-    - timer: timeit.default_timer by default
-
-    Notes about some of the default attributes for time benchmarks:
-      - warmup_time: asv will spend this time (in seconds) in calling the
-        benchmarked function repeatedly, before starting to run the
-        actual benchmark
-
-      - repeat: when not provided (repeat set to 0):
+    A base class with sensible options for timing the cellfinder workflow.
+
+    It includes:
+     - a setup_cache function that downloads the GIN data specified in the
+       default_config.json to a local directory (created by asv). This function
+       runs only once before all repeats of the benchmark.
+    - a setup function that runs the setup steps for the workflow.
+    - a teardown function that removes the output directory.
+
+    Notes
+    -----
+    The class includes some predefined attributes for timing benchmarks. For
+    the full list see
+    https://asv.readthedocs.io/en/stable/benchmarks.html#benchmark-attributes
+
+    Some asv benchmarking nomenclature:
+    - repeat: a benchmark repeat is made up of the following steps:
+      1- the `setup` is run,
+      2- then the timed benchmark routine is called for `n` iterations, and
+      3- finally the `teardown` function is run.
+      Each repeat generates a sample, which is the average time that the
+      routine took across all iterations. A new process is started for each
+      repeat of each benchmark. A calibration phase before running the repeat
+      computes the number of iterations that will be executed. Each benchmark
+      is run for a number of repeats. The setup_cache function is run only once
+      for all repeats of a benchmark (but it is discarded before the next
+      benchmark). By default `repeat` is set to 0, which means:
         - if rounds==1 the default is
             (min_repeat, max_repeat, max_time) = (1, 10, 20.0),
         - if rounds != 1 the default is
             (min_repeat, max_repeat, max_time) = (1, 5, 10.0)
 
-      - sample_time: `number` is determined so that each sample takes
-        approx sample_time=10ms
+    - iterations (`number`): the number of iterations in each sample. Note that
+      `setup` and `teardown` are not run between iterations. asv will
+      automatically select the number of iterations so that each sample takes
+      approximately `sample_time` seconds.
+
+    - round: at each round, each benchmark is run for the specified number of
+      repeats. The idea is that we sample each benchmark over longer periods of
+      background performance variations.
+
+    - warmup time: asv will spend this time (in seconds) in calling the
+      benchmarked function repeatedly, before starting to run the actual
+      benchmark. If not specified, warmup_time defaults to 0.1 seconds
+
     """
 
+    # Timing attributes
     timeout = 600  # default: 60 s
-    version = None  # default: None (i.e. hash of source code)
-    warmup_time = 0.1  # default:0.1;
-    rounds = 2  # default:2
-    repeat = 0  # default: 0
+    version = (
+        None  # benchmark version. Default:None (i.e. hash of source code)
+    )
+    warmup_time = 0.1  # seconds
+    rounds = 2
+    repeat = 0
     sample_time = 0.01  # default: 10 ms = 0.01 s;
     min_run_count = 2  # default:2
 
+    # Custom attributes
     input_config_path = (
-        "/Users/sofia/Documents_local/project_BrainGlobe_workflows/"
-        "brainglobe-workflows/brainglobe_workflows/cellfinder/default_config.json"
+        Path(__file__).parents[1]
+        / "brainglobe_workflows/cellfinder/default_config.json"
     )
 
     def setup_cache(
         self,
-    ):  # ---> cache so that we dont download data several times?
+    ):
         """
-        We force a download of the data here
+        Download the input data from the GIN repository to the local
+        directory specified in the default_config.json
+
+        Notes
+        -----
+        The `setup_cache` method only performs the computations once
+        per benchmark round and then caches the result to disk [1]_. It cannot
+        be parametrised [2]_.
 
-        setup_cache method only performs the setup calculation once and
-        then caches the result to disk.
 
-        It is run only once also for repeated benchmarks and profiling.
+        [1] https://asv.readthedocs.io/en/latest/writing_benchmarks.html#setup-and-teardown-functions
+        [2] https://asv.readthedocs.io/en/latest/writing_benchmarks.html#parameterized-benchmarks
         """
-        print("RUN SETUP CACHE")
-        # download the data here?
+
         # Check config file exists
         assert Path(self.input_config_path).exists()
 
@@ -80,8 +107,7 @@ def setup_cache(
             config_dict = json.load(cfg)
         config = CellfinderConfig(**config_dict)
 
-        # download data
-        # get list of files in GIN archive with pooch.retrieve
+        # Download data with pooch
         _ = pooch.retrieve(
             url=config.data_url,
             known_hash=config.data_hash,
@@ -90,34 +116,36 @@ def setup_cache(
             processor=pooch.Unzip(extract_dir=config.extract_dir_relative),
         )
 
-        # paths to input data should now exist in config
+        # Check that the input data paths defined in the config now exist
         assert Path(config.signal_dir_path).exists()
         assert Path(config.background_dir_path).exists()
 
-        return
-
     def setup(self):
-        """ """
-        # monkeypatch command line arguments
-        # run setup
-        print("RUN SETUP")
+        """
+        Run the cellfinder workflow setup steps.
+
+        The command line input arguments are injected as dependencies.
+        """
+
+        # Run setup
         cfg = setup_cellfinder_workflow(
             [
                 "--config",
-                self.input_config_path,  # ----should work without path too!
+                self.input_config_path,
             ]
         )
+
+        # Save configuration as attribute
         self.cfg = cfg
 
     def teardown(self):
         """
-        Remove the cellfinder benchmarks cache directory
-        (typically .cellfinder_benchmarks)
+        Remove the cellfinder output directory.
+
+        The input data is kept for all repeats of the same benchmark,
+        to avoid repeated downloads from GIN.
         """
-        print("RUN TEARDOWN")
-        shutil.rmtree(
-            Path(self.cfg.output_path).resolve()
-        )  # ---- remove all but input data? i.e., remove output only
+        shutil.rmtree(Path(self.cfg.output_path).resolve())
 
 
 class TimeFullWorkflow(TimeBenchmarkPrepGIN):

From b7ddd731f778e7f16eabcee1723c9f766b8d60fa Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:45:36 +0100
Subject: [PATCH 17/24] make input path string

---
 benchmarks/cellfinder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index 6b818ec0..59ff9a51 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -75,7 +75,7 @@ class TimeBenchmarkPrepGIN:
     min_run_count = 2  # default:2
 
     # Custom attributes
-    input_config_path = (
+    input_config_path = str(
         Path(__file__).parents[1]
         / "brainglobe_workflows/cellfinder/default_config.json"
     )

From db2ac7614a5d1fdebc999f4af0f65638798184e1 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Oct 2023 15:46:03 +0100
Subject: [PATCH 18/24] add benchmarks for reading with dask

---
 benchmarks/cellfinder.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index 59ff9a51..9572ee0d 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 import pooch
+from cellfinder_core.tools.IO import read_with_dask
 
 from brainglobe_workflows.cellfinder.cellfinder_main import (
     CellfinderConfig,
@@ -16,7 +17,7 @@
 class TimeBenchmarkPrepGIN:
     """
 
-    A base class with sensible options for timing the cellfinder workflow.
+    A base class for timing benchmarks for the cellfinder workflow.
 
     It includes:
      - a setup_cache function that downloads the GIN data specified in the
@@ -149,16 +150,27 @@ def teardown(self):
 
 
 class TimeFullWorkflow(TimeBenchmarkPrepGIN):
+    """Time the full cellfinder workflow.
+
+    It includes reading the signal and background arrays with dask,
+    detecting the cells and saving the results to an XML file
+
+    Parameters
+    ----------
+    TimeBenchmarkPrepGIN : _type_
+        A base class for timing benchmarks for the cellfinder workflow.
+    """
+
     def time_workflow_from_cellfinder_run(self):
         run_workflow_from_cellfinder_run(self.cfg)
 
 
-# class TimeReadInputDask(TimeBenchmark):
-#     def time_read_signal_w_dask(self):
-#         read_with_dask(self.cfg.signal_parent_dir)
+class TimeReadInputDask(TimeBenchmarkPrepGIN):
+    def time_read_signal_w_dask(self):
+        read_with_dask(self.cfg.signal_dir_path)
 
-#     def time_read_background_w_dask(self):
-#         read_with_dask(self.cfg.background_parent_dir)
+    def time_read_background_w_dask(self):
+        read_with_dask(self.cfg.background_dir_path)
 
 
 # class TimeCellfinderRun(TimeBenchmark):

From 774d8cc9e0bc27cda4c82320bf399b0800d43b7c Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 11 Oct 2023 16:45:41 +0100
Subject: [PATCH 19/24] add benchmarks for detecting and saving cells

---
 benchmarks/cellfinder.py | 77 +++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index 9572ee0d..e94355ac 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -3,6 +3,8 @@
 from pathlib import Path
 
 import pooch
+from brainglobe_utils.IO.cells import save_cells
+from cellfinder_core.main import main as cellfinder_run
 from cellfinder_core.tools.IO import read_with_dask
 
 from brainglobe_workflows.cellfinder.cellfinder_main import (
@@ -121,6 +123,7 @@ def setup_cache(
         assert Path(config.signal_dir_path).exists()
         assert Path(config.background_dir_path).exists()
 
+    # @classmethod
     def setup(self):
         """
         Run the cellfinder workflow setup steps.
@@ -150,7 +153,8 @@ def teardown(self):
 
 
 class TimeFullWorkflow(TimeBenchmarkPrepGIN):
-    """Time the full cellfinder workflow.
+    """
+    Time the full cellfinder workflow.
 
     It includes reading the signal and background arrays with dask,
     detecting the cells and saving the results to an XML file
@@ -166,36 +170,61 @@ def time_workflow_from_cellfinder_run(self):
 
 
 class TimeReadInputDask(TimeBenchmarkPrepGIN):
-    def time_read_signal_w_dask(self):
+    """
+    Time the reading input data operations with dask
+
+    Parameters
+    ----------
+    TimeBenchmarkPrepGIN : _type_
+        A base class for timing benchmarks for the cellfinder workflow.
+    """
+
+    def time_read_signal_with_dask(self):
         read_with_dask(self.cfg.signal_dir_path)
 
-    def time_read_background_w_dask(self):
+    def time_read_background_with_dask(self):
         read_with_dask(self.cfg.background_dir_path)
 
 
-# class TimeCellfinderRun(TimeBenchmark):
-#     def setup(self):
-#         TimeBenchmark.setup()
-#         self.signal_array = read_with_dask(self.cfg.signal_parent_dir)
-#         self.background_array = read_with_dask(
-#           self.cfg.background_parent_dir
-#         )
+class TimeDetectCells(TimeBenchmarkPrepGIN):
+    """
+    Time the cell detection main pipeline (`cellfinder_run`)
 
-#     def time_cellfinder_run(self):
-#         cellfinder_run(
-#             self.signal_array, self.background_array, self.cfg.voxel_sizes
-#         )
+    Parameters
+    ----------
+    TimeBenchmarkPrepGIN : _type_
+        A base class for timing benchmarks for the cellfinder workflow.
+    """
 
+    # extend basic setup function
+    def setup(self):
+        # basic setup
+        TimeBenchmarkPrepGIN.setup(self)
+
+        # read input data with dask and store the arrays as attributes
+        self.signal_array = read_with_dask(self.cfg.signal_dir_path)
+        self.background_array = read_with_dask(self.cfg.background_dir_path)
 
-# class TimeSaveCells(TimeBenchmark):
-#     def setup(self):
-#         TimeBenchmark.setup()
-#         signal_array = read_with_dask(self.cfg.signal_parent_dir)
-#         background_array = read_with_dask(self.cfg.background_parent_dir)
+    def time_cellfinder_run(self):
+        cellfinder_run(
+            self.signal_array, self.background_array, self.cfg.voxel_sizes
+        )
 
-#         self.detected_cells = cellfinder_run(
-#             signal_array, background_array, self.cfg.voxel_sizes
-#         )
 
-#     def time_save_cells(self):
-#         save_cells(self.detected_cells, self.cfg.detected_cells_filepath)
+class TimeSaveCells(TimeBenchmarkPrepGIN):
+    # extend basic setup function
+    def setup(self):
+        # basic setup
+        TimeBenchmarkPrepGIN.setup(self)
+
+        # read input data with dask and store the arrays as attributes
+        self.signal_array = read_with_dask(self.cfg.signal_dir_path)
+        self.background_array = read_with_dask(self.cfg.background_dir_path)
+
+        # detect cells
+        self.detected_cells = cellfinder_run(
+            self.signal_array, self.background_array, self.cfg.voxel_sizes
+        )
+
+    def time_save_cells(self):
+        save_cells(self.detected_cells, self.cfg.detected_cells_path)

From 5b46de9d5c39ee6585f7828c3ed61e774ff0593b Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Oct 2023 12:28:00 +0100
Subject: [PATCH 20/24] import default input json

---
 benchmarks/cellfinder.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/benchmarks/cellfinder.py b/benchmarks/cellfinder.py
index e94355ac..76d364bc 100644
--- a/benchmarks/cellfinder.py
+++ b/benchmarks/cellfinder.py
@@ -8,6 +8,7 @@
 from cellfinder_core.tools.IO import read_with_dask
 
 from brainglobe_workflows.cellfinder.cellfinder_main import (
+    DEFAULT_JSON_CONFIG_PATH,
     CellfinderConfig,
     run_workflow_from_cellfinder_run,
 )
@@ -78,10 +79,7 @@ class TimeBenchmarkPrepGIN:
     min_run_count = 2  # default:2
 
     # Custom attributes
-    input_config_path = str(
-        Path(__file__).parents[1]
-        / "brainglobe_workflows/cellfinder/default_config.json"
-    )
+    input_config_path = str(DEFAULT_JSON_CONFIG_PATH)
 
     def setup_cache(
         self,
@@ -123,7 +121,6 @@ def setup_cache(
         assert Path(config.signal_dir_path).exists()
         assert Path(config.background_dir_path).exists()
 
-    # @classmethod
     def setup(self):
         """
         Run the cellfinder workflow setup steps.

From 76352ac031154257269505d3db3b9a45a90b8e5a Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 12 Oct 2023 12:28:08 +0100
Subject: [PATCH 21/24] remove example

---
 benchmarks/example.py | 23 -----------------------
 1 file changed, 23 deletions(-)
 delete mode 100644 benchmarks/example.py

diff --git a/benchmarks/example.py b/benchmarks/example.py
deleted file mode 100644
index 3556e449..00000000
--- a/benchmarks/example.py
+++ /dev/null
@@ -1,23 +0,0 @@
-class TimeSuite:
-    """
-    An example benchmark that times the performance of various kinds
-    of iterating over dictionaries in Python.
-    """
-
-    def setup(self):
-        self.d = {}
-        for x in range(500):
-            self.d[x] = None
-
-    def time_keys(self):
-        for key in self.d.keys():
-            pass
-
-    def time_values(self):
-        for value in self.d.values():
-            pass
-
-    def time_range(self):
-        d = self.d
-        for key in range(500):
-            d[key]

From c5b02d7634b7a42a6ef69bbd56b64feac3a3c818 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 13 Oct 2023 17:37:56 +0100
Subject: [PATCH 22/24] point to main in asv config

---
 asv.conf.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv.conf.json b/asv.conf.json
index 67ce0a69..d620a545 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -39,7 +39,7 @@
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["smg/cellfinder-workflow"], // for git
+    "branches": ["main"], // for git
     // "branches": ["default"],    // for mercurial
 
     // The DVCS being used.  If not set, it will be automatically

From 682c51aa8ddbed98a1594eae81945a26f8d94529 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 13 Oct 2023 17:50:48 +0100
Subject: [PATCH 23/24] point to correct branch in asv

---
 asv.conf.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv.conf.json b/asv.conf.json
index d620a545..22bb5885 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -39,7 +39,7 @@
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["main"], // for git
+    "branches": ["smg/cellfinder-workflow-cli-w-tests"], // for git
     // "branches": ["default"],    // for mercurial
 
     // The DVCS being used.  If not set, it will be automatically

From d83f97ba87ce4298084b16e45745cc33834b6a5e Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 13 Oct 2023 17:54:16 +0100
Subject: [PATCH 24/24] point to the actual branch in asv (Friday brain)

---
 asv.conf.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv.conf.json b/asv.conf.json
index 22bb5885..a053875b 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -39,7 +39,7 @@
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["smg/cellfinder-workflow-cli-w-tests"], // for git
+    "branches": ["smg/cellfinder-cli-benchmark"], // for git
     // "branches": ["default"],    // for mercurial
 
     // The DVCS being used.  If not set, it will be automatically