From 8d0ddaa678df593af865d68a5ad06930ea8143c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Silva-S=C3=A1nchez?= <31875788+DSilva27@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:11:01 -0400 Subject: [PATCH 1/9] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d1bb35..e3c79b8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,6 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.3.4 From bf28c54392de920d1c76dfb19848662450df735f Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 13:34:35 -0400 Subject: [PATCH 2/9] update .gitignore so it does not include testing data --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d7aaad0..90c4962 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +test/data + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 401cea8f9e5cc3d9261663149e1f41e175ff81c8 Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 13:35:24 -0400 Subject: [PATCH 3/9] update .gitignore so it does not include testing data --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 90c4962..e9c1531 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -test/data +tests/data # Byte-compiled / optimized / DLL files __pycache__/ From 761f25abd1bcea81217c0db6e78a7e48fe69449b Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 13:35:57 -0400 Subject: [PATCH 4/9] update .gitignore so it does not include testing data --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e9c1531..ad8eb9b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ tests/data +tests/results # Byte-compiled / 
optimized / DLL files __pycache__/ From 7ee6824ebadb8dd43f168adc559b20f35100d32b Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 13:38:42 -0400 Subject: [PATCH 5/9] update .gitignore so it does not include testing data --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ad8eb9b..c29950d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ -tests/data +tests/data/dataset_2_submissions +tests/data/Ground_truth tests/results +tests/data/unprocessed_dataset_2_submissions/submission_x/*.mrc +tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt # Byte-compiled / optimized / DLL files __pycache__/ From 5b00f48faca6bc2101858ed463ca019d2874b182 Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 13:39:56 -0400 Subject: [PATCH 6/9] remove hard coded path to populations.txt --- .../_preprocessing/dataloader.py | 16 ++-- .../submission_x/submission_config.json | 5 +- tutorials/1_tutorial_preprocessing.ipynb | 79 +++++++++++++------ 3 files changed, 70 insertions(+), 30 deletions(-) diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py index 2593c2a..afc54ae 100644 --- a/src/cryo_challenge/_preprocessing/dataloader.py +++ b/src/cryo_challenge/_preprocessing/dataloader.py @@ -25,7 +25,9 @@ class SubmissionPreprocessingDataLoader(Dataset): def __init__(self, submission_config): self.submission_config = submission_config - self.submission_paths, self.gt_path = self.extract_submission_paths() + self.submission_paths, self.population_files, self.gt_path = ( + self.extract_submission_paths() + ) self.subs_index = [int(idx) for idx in list(self.submission_config.keys())[1:]] path_to_gt_ref = os.path.join( self.gt_path, self.submission_config["gt"]["ref_align_fname"] @@ -65,6 +67,8 @@ def validate_submission_config(self): raise ValueError(f"Pixel size not found for submission {key}") if "align" not in value.keys(): 
raise ValueError(f"Align not found for submission {key}") + if "populations_file" not in value.keys(): + raise ValueError(f"Population file not found for submission {key}") if not os.path.exists(value["path"]): raise ValueError(f"Path {value['path']} does not exist") @@ -135,13 +139,16 @@ def help(cls): def extract_submission_paths(self): submission_paths = [] + population_files = [] for key, value in self.submission_config.items(): if key == "gt": gt_path = value["path"] else: submission_paths.append(value["path"]) - return submission_paths, gt_path + population_files.append(value["populations_file"]) + + return submission_paths, population_files, gt_path def __len__(self): return len(self.submission_paths) @@ -154,10 +161,7 @@ def __getitem__(self, idx): assert len(vol_paths) > 0, "No volumes found in submission directory" - populations = np.loadtxt( - os.path.join(self.submission_paths[idx], "populations.txt") - ) - populations = torch.from_numpy(populations) + populations = torch.from_numpy(np.loadtxt(self.population_files[idx])) vol0 = mrcfile.open(vol_paths[0], mode="r") volumes = torch.zeros( diff --git a/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json b/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json index 87184aa..a77e68f 100644 --- a/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json +++ b/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json @@ -11,6 +11,7 @@ "align": 1, "box_size": 244, "pixel_size": 2.146, - "path": "tests/data/unprocessed_dataset_2_submissions/submission_x" + "path": "tests/data/unprocessed_dataset_2_submissions/submission_x", + "populations_file": "tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt" } -} \ No newline at end of file +} diff --git a/tutorials/1_tutorial_preprocessing.ipynb b/tutorials/1_tutorial_preprocessing.ipynb index 0c718e4..f8f0cc3 100644 --- 
a/tutorials/1_tutorial_preprocessing.ipynb +++ b/tutorials/1_tutorial_preprocessing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-17T15:40:12.854854Z", @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-17T15:40:20.557563Z", @@ -30,7 +30,6 @@ "import os\n", "import torch\n", "import matplotlib.pyplot as plt\n", - "import numpy as np\n", "import yaml\n", "from ipyfilechooser import FileChooser" ] @@ -80,6 +79,17 @@ "display(submission1_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select path to populations (submission 1)\n", + "submission1_pop_path = FileChooser(path_to_sub_set.selected_path)\n", + "display(submission1_pop_path)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -97,6 +107,26 @@ "display(submission2_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select path to populations (submission 2)\n", + "submission2_pop_path = FileChooser(path_to_sub_set.selected_path)\n", + "display(submission2_pop_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "submission2_pop_path.selected" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:40:59.387306Z", @@ -139,6 +169,7 @@ " \"box_size\": 144,\n", " \"pixel_size\": 1.073 * 2,\n", " \"path\": submission1_path.selected_path,\n", + " \"populations_file\": submission1_pop_path.selected,\n", " },\n", " 1: {\n", " \"name\": \"submission2\",\n", @@ -146,13 +177,14 @@ " \"box_size\": 288,\n", " \"pixel_size\": 1.073,\n", " \"path\": 
submission2_path.selected_path,\n", + " \"populations_file\": submission2_pop_path.selected,\n", " },\n", "}" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:41:01.194466Z", @@ -174,17 +206,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After you create your submission_config, simply grab a copy of the file \"config_preproc.yaml\" from the provided config_files, and change the path for the \"submission_config_file\" to the file we created in the previous cell. Also change the path for the output. The rest of the parameters you can leave untouched. Please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. Then simply run\n", + "Lastly, to run the preprocessing pipeline, follow these steps:\n", "\n", - "```bash\n", - "cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml\n", - "```\n", + "0. Make sure to activate your environment and have the package installed!\n", "\n", - "Note: make sure to activate your environment and have the package installed!\n", + "1. Grab a copy of the file `config_preproc.yaml` from our config file templates.\n", "\n", - "You can run the following cell to visualize your volumes (more precisely, a projection of them)\n", + "2. In the copied config file, update the value of `submission_config_file` to match the path to the file we created in the last cell.\n", "\n", - "IMPORTANT: The execution of the previous program relies on the existence of file to be saved at {{ submission1_path.selected_path }} with a specific formatting. The file must be named \"populations.txt\", and should be formatted as a single row/column CSV file containing the populations computed from your results. If the previous file is not included, the execution of the program will result in a runtime error." + "3. Optionally, change the other parameters. 
\n", + " * Most of the parameters (BOT_* and thresh_percentile) are for the alignment. For details on how they work, please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\".\n", + "\n", + " * The other parameters are self-explanatory. \"seed_flavor_assignment\" changes which submission gets assigned which ice cream flavor; keep this value if you want to be able to reverse the anonymization.\n", + "\n", + "4. Run the command: `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`\n", + "\n", + "You can run the following cell to visualize your volumes (more precisely, a projection of them).\n" ] }, { @@ -201,13 +238,13 @@ "# Select path to Config file\n", "# An example of this file is available in the path ../config_files/config_preproc.yaml\n", "config_preproc_path = FileChooser(os.path.expanduser(\"~\"))\n", - "config_preproc_path.filter_pattern = '*.yaml'\n", + "config_preproc_path.filter_pattern = \"*.yaml\"\n", "display(config_preproc_path)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:43:16.259106Z", @@ -224,7 +261,7 @@ "if os.path.isabs(output_path):\n", " full_output_path = output_path\n", "else:\n", - " full_output_path = os.path.join(os.getcwd(), '..', output_path)" + " full_output_path = os.path.join(os.getcwd(), \"..\", output_path)" ] }, { @@ -240,12 +277,10 @@ "source": [ "n_submissions = 2 # change this to however many submissions you preprocessed\n", "\n", - "fig, ax = plt.subplots(2, 6, figsize=(20, 8)) # change values here too\n", + "fig, ax = plt.subplots(1, 2, figsize=(10, 4)) # change values here too\n", "\n", "for i in range(n_submissions):\n", - " idx = np.random.randint(\n", - " 0, 20\n", - " ) # doing random volumes to check that everything went fine\n", + " idx = 0\n", "\n", " submission = torch.load(os.path.join(full_output_path, f\"submission_{i}.pt\"))\n", 
" print(submission[\"volumes\"].shape, submission[\"id\"])\n", @@ -256,9 +291,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "cryo-challenge-kernel", "language": "python", - "name": "python3" + "name": "cryo-challenge-kernel" }, "language_info": { "codemirror_mode": { @@ -270,7 +305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.10.10" } }, "nbformat": 4, From 4d57abf3f197bef1c4f671b94f0d97ec1808dd87 Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 14:06:24 -0400 Subject: [PATCH 7/9] turned on validator in the dataloader class --- .../_preprocessing/dataloader.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py index afc54ae..e294518 100644 --- a/src/cryo_challenge/_preprocessing/dataloader.py +++ b/src/cryo_challenge/_preprocessing/dataloader.py @@ -25,6 +25,8 @@ class SubmissionPreprocessingDataLoader(Dataset): def __init__(self, submission_config): self.submission_config = submission_config + self.validate_submission_config() + self.submission_paths, self.population_files, self.gt_path = ( self.extract_submission_paths() ) @@ -55,12 +57,16 @@ def validate_submission_config(self): raise ValueError("Box size not found for ground truth") if "pixel_size" not in value.keys(): raise ValueError("Pixel size not found for ground truth") + if "ref_align_fname" not in value.keys(): + raise ValueError( + "Reference align file name not found for ground truth" + ) continue else: if "path" not in value.keys(): raise ValueError(f"Path not found for submission {key}") - if "id" not in value.keys(): - raise ValueError(f"ID not found for submission {key}") + if "name" not in value.keys(): + raise ValueError(f"Name not found for submission {key}") if "box_size" not in value.keys(): raise ValueError(f"Box size 
not found for submission {key}") if "pixel_size" not in value.keys(): @@ -76,11 +82,10 @@ def validate_submission_config(self): if not os.path.isdir(value["path"]): raise ValueError(f"Path {value['path']} is not a directory") - ids = list(self.submission_config.keys())[1:] - if ids != list(range(len(ids))): - raise ValueError( - "Submission IDs should be integers starting from 0 and increasing by 1" - ) + if not os.path.exists(value["populations_file"]): + raise ValueError( + f"Population file {value['populations_file']} does not exist" + ) return From 705445d27c1d894142652632348bc1990a9eac33 Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 14:17:08 -0400 Subject: [PATCH 8/9] turned on validator in the dataloader class --- .github/workflows/testing.yml | 9 ++++----- src/cryo_challenge/_preprocessing/dataloader.py | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 829a237..1f7a934 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -47,16 +47,15 @@ jobs: python -m pip install --upgrade pip pip install . 
pip install pytest omegaconf - + - name: Get test data from OSF if: ${{ steps.cache_test_data.outputs.cache-hit != 'true' }} run: | sh tests/scripts/fetch_test_data.sh - + - name: Test with pytest run: | - pytest tests/test_preprocessing.py + pytest tests/test_preprocessing.py -rP pytest tests/test_svd.py pytest tests/test_map_to_map.py pytest tests/test_distribution_to_distribution.py - diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py index e294518..ae9561e 100644 --- a/src/cryo_challenge/_preprocessing/dataloader.py +++ b/src/cryo_challenge/_preprocessing/dataloader.py @@ -25,6 +25,7 @@ class SubmissionPreprocessingDataLoader(Dataset): def __init__(self, submission_config): self.submission_config = submission_config + print(self.submission_config) self.validate_submission_config() self.submission_paths, self.population_files, self.gt_path = ( From 5b073772cc56e7a39d903cf56ee95c4511002398 Mon Sep 17 00:00:00 2001 From: DSilva27 Date: Thu, 11 Jul 2024 14:38:36 -0400 Subject: [PATCH 9/9] remove prints for debugging --- .github/workflows/testing.yml | 2 +- src/cryo_challenge/_preprocessing/dataloader.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 1f7a934..d68e647 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -55,7 +55,7 @@ jobs: - name: Test with pytest run: | - pytest tests/test_preprocessing.py -rP + pytest tests/test_preprocessing.py pytest tests/test_svd.py pytest tests/test_map_to_map.py pytest tests/test_distribution_to_distribution.py diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py index ae9561e..e294518 100644 --- a/src/cryo_challenge/_preprocessing/dataloader.py +++ b/src/cryo_challenge/_preprocessing/dataloader.py @@ -25,7 +25,6 @@ class SubmissionPreprocessingDataLoader(Dataset): def __init__(self, 
submission_config): self.submission_config = submission_config - print(self.submission_config) self.validate_submission_config() self.submission_paths, self.population_files, self.gt_path = (