diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py index 4cc75d6..2de0f0c 100644 --- a/src/cryo_challenge/_preprocessing/dataloader.py +++ b/src/cryo_challenge/_preprocessing/dataloader.py @@ -25,7 +25,11 @@ class SubmissionPreprocessingDataLoader(Dataset): def __init__(self, submission_config): self.submission_config = submission_config - self.submission_paths, self.gt_path = self.extract_submission_paths() + self.validate_submission_config() + + self.submission_paths, self.population_files, self.gt_path = ( + self.extract_submission_paths() + ) self.subs_index = [int(idx) for idx in list(self.submission_config.keys())[1:]] path_to_gt_ref = os.path.join( self.gt_path, self.submission_config["gt"]["ref_align_fname"] @@ -53,18 +57,24 @@ def validate_submission_config(self): raise ValueError("Box size not found for ground truth") if "pixel_size" not in value.keys(): raise ValueError("Pixel size not found for ground truth") + if "ref_align_fname" not in value.keys(): + raise ValueError( + "Reference align file name not found for ground truth" + ) continue else: if "path" not in value.keys(): raise ValueError(f"Path not found for submission {key}") - if "id" not in value.keys(): - raise ValueError(f"ID not found for submission {key}") + if "name" not in value.keys(): + raise ValueError(f"Name not found for submission {key}") if "box_size" not in value.keys(): raise ValueError(f"Box size not found for submission {key}") if "pixel_size" not in value.keys(): raise ValueError(f"Pixel size not found for submission {key}") if "align" not in value.keys(): raise ValueError(f"Align not found for submission {key}") + if "populations_file" not in value.keys(): + raise ValueError(f"Population file not found for submission {key}") if "flip" not in value.keys(): raise ValueError(f"Flip not found for submission {key}") @@ -74,11 +84,10 @@ def validate_submission_config(self): if not os.path.isdir(value["path"]): raise 
ValueError(f"Path {value['path']} is not a directory") - ids = list(self.submission_config.keys())[1:] - if ids != list(range(len(ids))): - raise ValueError( - "Submission IDs should be integers starting from 0 and increasing by 1" - ) + if not os.path.exists(value["populations_file"]): + raise ValueError( + f"Population file {value['populations_file']} does not exist" + ) return @@ -137,13 +146,16 @@ def help(cls): def extract_submission_paths(self): submission_paths = [] + population_files = [] for key, value in self.submission_config.items(): if key == "gt": gt_path = value["path"] else: submission_paths.append(value["path"]) - return submission_paths, gt_path + population_files.append(value["populations_file"]) + + return submission_paths, population_files, gt_path def __len__(self): return len(self.submission_paths) @@ -156,10 +168,7 @@ def __getitem__(self, idx): assert len(vol_paths) > 0, "No volumes found in submission directory" - populations = np.loadtxt( - os.path.join(self.submission_paths[idx], "populations.txt") - ) - populations = torch.from_numpy(populations) + populations = torch.from_numpy(np.loadtxt(self.population_files[idx])) vol0 = mrcfile.open(vol_paths[0], mode="r") volumes = torch.zeros( diff --git a/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json b/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json index 354060e..9e6d56a 100644 --- a/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json +++ b/tests/data/unprocessed_dataset_2_submissions/submission_x/submission_config.json @@ -12,6 +12,7 @@ "box_size": 244, "pixel_size": 2.146, "path": "tests/data/unprocessed_dataset_2_submissions/submission_x", + "populations_file": "tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt", "flip": 1 } } diff --git a/tutorials/1_tutorial_preprocessing.ipynb b/tutorials/1_tutorial_preprocessing.ipynb index 84db8c9..bac0593 100644 --- 
a/tutorials/1_tutorial_preprocessing.ipynb +++ b/tutorials/1_tutorial_preprocessing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-17T15:40:12.854854Z", @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-17T15:40:20.557563Z", @@ -30,7 +30,6 @@ "import os\n", "import torch\n", "import matplotlib.pyplot as plt\n", - "import numpy as np\n", "import yaml\n", "from ipyfilechooser import FileChooser" ] @@ -80,6 +79,17 @@ "display(submission1_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select path to populations (submission 1)\n", + "submission1_pop_path = FileChooser(path_to_sub_set.selected_path)\n", + "display(submission1_pop_path)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -97,6 +107,26 @@ "display(submission2_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select path to populations (submission 2)\n", + "submission2_pop_path = FileChooser(path_to_sub_set.selected_path)\n", + "display(submission2_pop_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "submission2_pop_path.selected" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:40:59.387306Z", @@ -140,6 +170,7 @@ " \"box_size\": 144,\n", " \"pixel_size\": 1.073 * 2,\n", " \"path\": submission1_path.selected_path,\n", + " \"populations_file\": submission1_pop_path.selected,\n", " },\n", " 1: {\n", " \"name\": \"submission2\",\n", @@ -148,13 +179,14 @@ " \"box_size\": 288,\n", " \"pixel_size\": 1.073,\n", " \"path\": 
submission2_path.selected_path,\n", + " \"populations_file\": submission2_pop_path.selected,\n", " },\n", "}" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:41:01.194466Z", @@ -176,17 +208,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After you create your submission_config, simply grab a copy of the file \"config_preproc.yaml\" from the provided config_files, and change the path for the \"submission_config_file\" to the file we created in the previous cell. Also change the path for the output. The rest of the parameters you can leave untouched. Please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. Then simply run\n", + "Lastly, to run the preprocessing pipeline follow these steps:\n", "\n", - "```bash\n", - "cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml\n", - "```\n", + "0. Make sure to activate your environment and have the package installed!\n", "\n", - "Note: make sure to activate your environment and have the package installed!\n", + "1. Grab a copy of the file `config_preproc.yaml` from our config file templates.\n", "\n", - "You can run the following cell to visualize your volumes (more precisely, a projection of them)\n", + "2. In the copied config file, update the value of `submission_config_file` to match the path to the file we created in the last cell.\n", "\n", - "IMPORTANT: The execution of the previous program relies on the existence of file to be saved at {{ submission1_path.selected_path }} with a specific formatting. The file must be named \"populations.txt\", and should be formatted as a single row/column CSV file containing the populations computed from your results. If the previous file is not included, the execution of the program will result in a runtime error." + "3. Optionally, change the other parameters. 
\n", + " * Most of the parameters (BOT_* and thresh_percentile) are for the alignment. For details on how they work, please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\". \n", + "\n", + " * The other parameters are self-explanatory; \"seed_flavor_assignment\" changes which submission gets assigned which ice cream flavor. Keep this value if you want to revert anonymity.\n", + "\n", + "4. Run the command: `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`\n", + "\n", + "You can run the following cell to visualize your volumes (more precisely, a projection of them)\n" ] }, { @@ -209,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-06-13T07:43:16.259106Z", @@ -242,12 +279,10 @@ "source": [ "n_submissions = 2 # change this to however many submissions you preprocessed\n", "\n", - "fig, ax = plt.subplots(2, 6, figsize=(20, 8)) # change values here too\n", + "fig, ax = plt.subplots(1, 2, figsize=(10, 4)) # change values here too\n", "\n", "for i in range(n_submissions):\n", - " idx = np.random.randint(\n", - " 0, 20\n", - " ) # doing random volumes to check that everything went fine\n", + " idx = 0\n", "\n", " submission = torch.load(os.path.join(full_output_path, f\"submission_{i}.pt\"))\n", " print(submission[\"volumes\"].shape, submission[\"id\"])\n", @@ -258,9 +293,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "cryo-challenge-kernel", "language": "python", - "name": "python3" + "name": "cryo-challenge-kernel" }, "language_info": { "codemirror_mode": { @@ -272,7 +307,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.17" + "version": "3.10.10" } }, "nbformat": 4,