Merge pull request #40 from flatironinstitute/39-fix-small-bug-in-preprocessing-pipeline

preprocessing update and small path #39
DSilva27 authored Aug 5, 2024
2 parents e7b3304 + bb5c2f1 commit 26151d3
Showing 3 changed files with 78 additions and 33 deletions.
35 changes: 22 additions & 13 deletions src/cryo_challenge/_preprocessing/dataloader.py
@@ -25,7 +25,11 @@ class SubmissionPreprocessingDataLoader(Dataset):

def __init__(self, submission_config):
self.submission_config = submission_config
self.submission_paths, self.gt_path = self.extract_submission_paths()
self.validate_submission_config()

self.submission_paths, self.population_files, self.gt_path = (
self.extract_submission_paths()
)
self.subs_index = [int(idx) for idx in list(self.submission_config.keys())[1:]]
path_to_gt_ref = os.path.join(
self.gt_path, self.submission_config["gt"]["ref_align_fname"]
@@ -53,18 +57,24 @@ def validate_submission_config(self):
raise ValueError("Box size not found for ground truth")
if "pixel_size" not in value.keys():
raise ValueError("Pixel size not found for ground truth")
if "ref_align_fname" not in value.keys():
raise ValueError(
"Reference align file name not found for ground truth"
)
continue
else:
if "path" not in value.keys():
raise ValueError(f"Path not found for submission {key}")
if "id" not in value.keys():
raise ValueError(f"ID not found for submission {key}")
if "name" not in value.keys():
raise ValueError(f"Name not found for submission {key}")
if "box_size" not in value.keys():
raise ValueError(f"Box size not found for submission {key}")
if "pixel_size" not in value.keys():
raise ValueError(f"Pixel size not found for submission {key}")
if "align" not in value.keys():
raise ValueError(f"Align not found for submission {key}")
if "populations_file" not in value.keys():
raise ValueError(f"Population file not found for submission {key}")
if "flip" not in value.keys():
raise ValueError(f"Flip not found for submission {key}")

@@ -74,11 +84,10 @@ def validate_submission_config(self):
if not os.path.isdir(value["path"]):
raise ValueError(f"Path {value['path']} is not a directory")

ids = list(self.submission_config.keys())[1:]
if ids != list(range(len(ids))):
raise ValueError(
"Submission IDs should be integers starting from 0 and increasing by 1"
)
if not os.path.exists(value["populations_file"]):
raise ValueError(
f"Population file {value['populations_file']} does not exist"
)

return

@@ -137,13 +146,16 @@ def help(cls):

def extract_submission_paths(self):
submission_paths = []
population_files = []
for key, value in self.submission_config.items():
if key == "gt":
gt_path = value["path"]

else:
submission_paths.append(value["path"])
return submission_paths, gt_path
population_files.append(value["populations_file"])

return submission_paths, population_files, gt_path

def __len__(self):
return len(self.submission_paths)
@@ -156,10 +168,7 @@ def __getitem__(self, idx):

assert len(vol_paths) > 0, "No volumes found in submission directory"

populations = np.loadtxt(
os.path.join(self.submission_paths[idx], "populations.txt")
)
populations = torch.from_numpy(populations)
populations = torch.from_numpy(np.loadtxt(self.population_files[idx]))

vol0 = mrcfile.open(vol_paths[0], mode="r")
volumes = torch.zeros(
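To make the new validation requirements concrete, here is a minimal sketch of a `submission_config` dictionary that should pass the updated `validate_submission_config`; the key names follow the checks above, while every value (paths, sizes) is an illustrative placeholder, not real challenge data:

```python
# Minimal sketch of a submission_config satisfying the checks above.
# All paths and numeric values are placeholders.
submission_config = {
    "gt": {
        "path": "data/ground_truth",          # must be an existing directory
        "box_size": 224,
        "pixel_size": 2.146,
        "ref_align_fname": "ref_map.mrc",     # newly required by this commit
    },
    0: {                                      # submissions are keyed by integer id
        "id": 0,
        "name": "submission1",
        "box_size": 144,
        "pixel_size": 2.146,
        "align": 1,
        "flip": 1,
        "path": "data/submission1",           # must be an existing directory
        # newly required: an existing populations file (see below)
        "populations_file": "data/submission1/populations.txt",
    },
}
```

With such a config in hand, the loader is constructed as `SubmissionPreprocessingDataLoader(submission_config)`, which now validates the config before extracting the submission, population-file, and ground-truth paths.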
@@ -12,6 +12,7 @@
"box_size": 244,
"pixel_size": 2.146,
"path": "tests/data/unprocessed_dataset_2_submissions/submission_x",
"populations_file": "tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt",
"flip": 1
}
}
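The `populations_file` referenced above is read with `np.loadtxt` in the updated dataloader, so any whitespace-separated single row or column of weights works. A small self-contained sketch (the file name and values here are hypothetical):

```python
import numpy as np
import torch

# Hypothetical populations file: one weight per volume, in volume order.
weights = np.array([0.4, 0.3, 0.2, 0.1])
np.savetxt("populations.txt", weights)  # single column, parseable by np.loadtxt

# Mirrors the updated __getitem__: load with NumPy, convert to a tensor.
populations = torch.from_numpy(np.loadtxt("populations.txt"))
assert populations.shape == (4,)
```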
75 changes: 55 additions & 20 deletions tutorials/1_tutorial_preprocessing.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-17T15:40:12.854854Z",
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-17T15:40:20.557563Z",
@@ -30,7 +30,6 @@
"import os\n",
"import torch\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import yaml\n",
"from ipyfilechooser import FileChooser"
]
@@ -80,6 +79,17 @@
"display(submission1_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select path to populations (submission 1)\n",
"submission1_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
"display(submission1_pop_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -97,6 +107,26 @@
"display(submission2_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select path to populations (submission 2)\n",
"submission2_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
"display(submission2_pop_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"submission2_pop_path.selected"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -116,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:40:59.387306Z",
@@ -140,6 +170,7 @@
" \"box_size\": 144,\n",
" \"pixel_size\": 1.073 * 2,\n",
" \"path\": submission1_path.selected_path,\n",
" \"populations_file\": submission1_pop_path.selected,\n",
" },\n",
" 1: {\n",
" \"name\": \"submission2\",\n",
@@ -148,13 +179,14 @@
" \"box_size\": 288,\n",
" \"pixel_size\": 1.073,\n",
" \"path\": submission2_path.selected_path,\n",
" \"populations_file\": submission2_pop_path.selected,\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:41:01.194466Z",
@@ -176,17 +208,22 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"After you create your submission_config, simply grab a copy of the file \"config_preproc.yaml\" from the provided config_files, and change the path for the \"submission_config_file\" to the file we created in the previous cell. Also change the path for the output. The rest of the parameters you can leave untouched. Please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. Then simply run\n",
"Lastly, to run the preprocessing pipeline follow these steps\n",
"\n",
"```bash\n",
"cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml\n",
"```\n",
"0. Make sure to activate your environment and have the package installed!\n",
"\n",
"Note: make sure to activate your environment and have the package installed!\n",
"1. Grab a copy of the file `config_preproc.yaml`from our config file templates.\n",
"\n",
"You can run the following cell to visualize your volumes (more precisely, a projection of them)\n",
"2. In the copied config file, update the value of `submission_config_file` to match the path to the file we created in the last cell.\n",
"\n",
"IMPORTANT: The execution of the previous program relies on the existence of file to be saved at {{ submission1_path.selected_path }} with a specific formatting. The file must be named \"populations.txt\", and should be formatted as a single row/column CSV file containing the populations computed from your results. If the previous file is not included, the execution of the program will result in a runtime error."
"3. Optionally, change the other parameters. \n",
" * Most of the parameters (BOT_* and thresh_percentile) are for the alignment. For details on how they work, please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. \n",
"\n",
" * The other parameters are self explanatory, \"seed_flavor_assignment\" changes which submission gets assigned which ice cream flavor, keep this if you want to revert anonymity.\n",
"\n",
"4. Run the command: `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`\n",
"\n",
"You can run the following cell to visualize your volumes (more precisely, a projection of them)\n"
]
},
{
@@ -209,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:43:16.259106Z",
@@ -242,12 +279,10 @@
"source": [
"n_submissions = 2 # change this to however many submissions you preprocessed\n",
"\n",
"fig, ax = plt.subplots(2, 6, figsize=(20, 8)) # change values here too\n",
"fig, ax = plt.subplots(1, 2, figsize=(10, 4)) # change values here too\n",
"\n",
"for i in range(n_submissions):\n",
" idx = np.random.randint(\n",
" 0, 20\n",
" ) # doing random volumes to check that everything went fine\n",
" idx = 0\n",
"\n",
" submission = torch.load(os.path.join(full_output_path, f\"submission_{i}.pt\"))\n",
" print(submission[\"volumes\"].shape, submission[\"id\"])\n",
@@ -258,9 +293,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "cryo-challenge-kernel",
"language": "python",
"name": "python3"
"name": "cryo-challenge-kernel"
},
"language_info": {
"codemirror_mode": {
@@ -272,7 +307,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
"version": "3.10.10"
}
},
"nbformat": 4,
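Following the tutorial's step list, the edited `config_preproc.yaml` can be written from the notebook itself. In this sketch only `submission_config_file`, `seed_flavor_assignment`, and `thresh_percentile` are keys named in the tutorial text; the output key name and all values are assumptions, and the template's remaining BOT_* alignment parameters are left untouched:

```python
import yaml

# Hedged sketch of the config edits described in the tutorial. Only keys
# named in the tutorial text are shown; "output_path" is an assumed name
# for the output location, and all values are illustrative.
config_updates = {
    "submission_config_file": "submission_config.json",  # file written in the notebook
    "seed_flavor_assignment": 0,   # keep fixed to be able to revert anonymity
    "thresh_percentile": 93.0,     # alignment threshold, see Singer & Yang (2024)
    "output_path": "preprocessed_output",                # assumed key name
}

with open("config_preproc.yaml", "r") as f:
    config = yaml.safe_load(f)    # start from the provided template

config.update(config_updates)     # leave the BOT_* parameters as shipped

with open("config_preproc.yaml", "w") as f:
    yaml.safe_dump(config, f)
```

After that, step 4's command, `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`, runs the pipeline.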
