merge from main to include testing

flatironinstitute · Jul 11, 2024 · 22504f5 · 22504f5
2 parents 6719dda + 5b07377
commit 22504f5
Show file tree

Hide file tree

Showing 9 changed files with 88 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,9 @@
-tests/data
+tests/data/dataset_2_submissions
+tests/data/Ground_truth
+tests/results
+tests/data/unprocessed_dataset_2_submissions/submission_x/*.mrc
+tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/config_files/config_svd.yaml b/config_files/config_svd.yaml
@@ -4,7 +4,7 @@ submission_list: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
 experiment_mode: "all_vs_ref" # options are "all_vs_all", "all_vs_ref"
 
 power_spectrum_normalization:
-  ref_vol_key: "Vanilla" # which submission should be used
+  ref_vol_key: "FLAVOR" # which submission should be used
   ref_vol_index: 0 # which volume of that submission should be used
 
 # optional unless experiment_mode is "all_vs_ref"

diff --git a/src/cryo_challenge/_preprocessing/dataloader.py b/src/cryo_challenge/_preprocessing/dataloader.py
@@ -25,7 +25,11 @@ class SubmissionPreprocessingDataLoader(Dataset):
 
     def __init__(self, submission_config):
         self.submission_config = submission_config
-        self.submission_paths, self.gt_path = self.extract_submission_paths()
+        self.validate_submission_config()
+
+        self.submission_paths, self.population_files, self.gt_path = (
+            self.extract_submission_paths()
+        )
         self.subs_index = [int(idx) for idx in list(self.submission_config.keys())[1:]]
         path_to_gt_ref = os.path.join(
             self.gt_path, self.submission_config["gt"]["ref_align_fname"]
@@ -53,12 +57,16 @@ def validate_submission_config(self):
                     raise ValueError("Box size not found for ground truth")
                 if "pixel_size" not in value.keys():
                     raise ValueError("Pixel size not found for ground truth")
+                if "ref_align_fname" not in value.keys():
+                    raise ValueError(
+                        "Reference align file name not found for ground truth"
+                    )
                 continue
             else:
                 if "path" not in value.keys():
                     raise ValueError(f"Path not found for submission {key}")
-                if "id" not in value.keys():
-                    raise ValueError(f"ID not found for submission {key}")
+                if "name" not in value.keys():
+                    raise ValueError(f"Name not found for submission {key}")
                 if "box_size" not in value.keys():
                     raise ValueError(f"Box size not found for submission {key}")
                 if "pixel_size" not in value.keys():
@@ -74,11 +82,10 @@ def validate_submission_config(self):
                 if not os.path.isdir(value["path"]):
                     raise ValueError(f"Path {value['path']} is not a directory")
 
-        ids = list(self.submission_config.keys())[1:]
-        if ids != list(range(len(ids))):
-            raise ValueError(
-                "Submission IDs should be integers starting from 0 and increasing by 1"
-            )
+                if not os.path.exists(value["populations_file"]):
+                    raise ValueError(
+                        f"Population file {value['populations_file']} does not exist"
+                    )
 
         return
 
@@ -137,13 +144,16 @@ def help(cls):
 
     def extract_submission_paths(self):
         submission_paths = []
+        population_files = []
         for key, value in self.submission_config.items():
             if key == "gt":
                 gt_path = value["path"]
 
             else:
                 submission_paths.append(value["path"])
-        return submission_paths, gt_path
+                population_files.append(value["populations_file"])
+
+        return submission_paths, population_files, gt_path
 
     def __len__(self):
         return len(self.submission_paths)
@@ -156,10 +166,7 @@ def __getitem__(self, idx):
 
         assert len(vol_paths) > 0, "No volumes found in submission directory"
 
-        populations = np.loadtxt(self.submission_config["populations_file"]).astype(
-            float
-        )
-        populations = torch.from_numpy(populations)
+        populations = torch.from_numpy(np.loadtxt(self.population_files[idx]))
 
         vol0 = mrcfile.open(vol_paths[0], mode="r")
         volumes = torch.zeros(

diff --git a/src/cryo_challenge/_preprocessing/preprocessing_pipeline.py b/src/cryo_challenge/_preprocessing/preprocessing_pipeline.py
@@ -124,7 +124,8 @@ def preprocess_submissions(submission_dataset, config):
         print(f"   submission saved as submission_{idx}.pt")
         print(f"Preprocessing submission {idx} complete")
 
-    with open("hash_table.json", "w") as f:
+    hash_table_path = os.path.join(config["output_path"], "hash_table.json")
+    with open(hash_table_path, "w") as f:
         json.dump(hash_table, f, indent=4)
 
     return
diff --git a/tests/config_files/test_config_distribution_to_distribution.yaml b/tests/config_files/test_config_distribution_to_distribution.yaml
@@ -9,4 +9,4 @@ cvxpy_solver: ECOS
 optimal_q_kl:
   n_iter: 100000
   break_atol: 0.0001
-output_fname: results/test_distribution_to_distribution_submission_0.pkl
+output_fname: tests/results/test_distribution_to_distribution_submission_0.pkl
diff --git a/tests/config_files/test_config_map_to_map.yaml b/tests/config_files/test_config_map_to_map.yaml
@@ -11,7 +11,7 @@ data:
     metadata: tests/data/Ground_truth/test_metadata_10.csv
   mask:
     do: true
-    volume: data/Ground_truth/mask_dilated_wide_224x224.mrc
+    volume: tests/data/Ground_truth/mask_dilated_wide_224x224.mrc
 analysis:
   metrics:
     - l2

diff --git a/tests/config_files/test_config_svd.yaml b/tests/config_files/test_config_svd.yaml
@@ -5,7 +5,7 @@ experiment_mode: "all_vs_ref" # options are "all_vs_all", "all_vs_ref"
 # optional unless experiment_mode is "all_vs_ref"
 
 power_spectrum_normalization:
-  ref_vol_key: "Mango" # which submission should be used
+  ref_vol_key: "Coffee" # which submission should be used
   ref_vol_index: 0 # which volume of that submission should be used
 
 path_to_reference: tests/data/Ground_truth/test_maps_gt_flat_10.pt

diff --git a/tests/scripts/fetch_test_data.sh b/tests/scripts/fetch_test_data.sh
@@ -5,7 +5,7 @@ ln -s $ADIR/tests/data/dataset_2_submissions/test_submission_0_n8.pt $ADIR/tests
 wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/tests/Ground_truth/test_maps_gt_flat_10.pt?download=true -O tests/data/Ground_truth/test_maps_gt_flat_10.pt
 wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/tests/Ground_truth/test_metadata_10.csv?download=true -O tests/data/Ground_truth/test_metadata_10.csv
 wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/tests/Ground_truth/1.mrc?download=true -O tests/data/Ground_truth/1.mrc
-wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/Ground_truth/mask_dilated_wide_224x224.mrc?download=true -O data/Ground_truth/mask_dilated_wide_224x224.mrc
+wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/Ground_truth/mask_dilated_wide_224x224.mrc?download=true -O tests/data/Ground_truth/mask_dilated_wide_224x224.mrc
 for FILE in 1.mrc 2.mrc 3.mrc 4.mrc populations.txt
 do
     wget https://files.osf.io/v1/resources/8h6fz/providers/dropbox/tests/unprocessed_dataset_2_submissions/submission_x/${FILE}?download=true -O tests/data/unprocessed_dataset_2_submissions/submission_x/${FILE}

diff --git a/tutorials/1_tutorial_preprocessing.ipynb b/tutorials/1_tutorial_preprocessing.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-06-17T15:40:12.854854Z",
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-06-17T15:40:20.557563Z",
@@ -30,7 +30,6 @@
     "import os\n",
     "import torch\n",
     "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
     "import yaml\n",
     "from ipyfilechooser import FileChooser"
    ]
@@ -80,6 +79,17 @@
     "display(submission1_path)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select path to populations (submission 1)\n",
+    "submission1_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
+    "display(submission1_pop_path)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,6 +107,26 @@
     "display(submission2_path)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select path to populations (submission 2)\n",
+    "submission2_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
+    "display(submission2_pop_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "submission2_pop_path.selected"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -116,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-06-13T07:40:59.387306Z",
@@ -139,20 +169,22 @@
     "        \"box_size\": 144,\n",
     "        \"pixel_size\": 1.073 * 2,\n",
     "        \"path\": submission1_path.selected_path,\n",
+    "        \"populations_file\": submission1_pop_path.selected,\n",
     "    },\n",
     "    1: {\n",
     "        \"name\": \"submission2\",\n",
     "        \"align\": 1,\n",
     "        \"box_size\": 288,\n",
     "        \"pixel_size\": 1.073,\n",
     "        \"path\": submission2_path.selected_path,\n",
+    "        \"populations_file\": submission2_pop_path.selected,\n",
     "    },\n",
     "}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-06-13T07:41:01.194466Z",
@@ -174,17 +206,22 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "After you create your submission_config, simply grab a copy of the file \"config_preproc.yaml\" from the provided config_files, and change the path for the \"submission_config_file\" to the file we created in the previous cell. Also change the path for the output. The rest of the parameters you can leave untouched. Please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. Then simply run\n",
+    "Lastly, to run the preprocessing pipeline, follow these steps:\n",
     "\n",
-    "```bash\n",
-    "cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml\n",
-    "```\n",
+    "0. Make sure to activate your environment and have the package installed!\n",
     "\n",
-    "Note: make sure to activate your environment and have the package installed!\n",
+    "1. Grab a copy of the file `config_preproc.yaml` from our config file templates.\n",
     "\n",
-    "You can run the following cell to visualize your volumes (more precisely, a projection of them)\n",
+    "2. In the copied config file, update the value of `submission_config_file` to match the path to the file we created in the last cell.\n",
     "\n",
-    "IMPORTANT: The execution of the previous program relies on the existence of file to be saved at {{ submission1_path.selected_path }} with a specific formatting. The file must be named \"populations.txt\", and should be formatted as a single row/column CSV file containing the populations computed from your results. If the previous file is not included, the execution of the program will result in a runtime error."
+    "3. Optionally, change the other parameters. \n",
+    "    * Most of the parameters (BOT_* and thresh_percentile) are for the alignment. For details on how they work, please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\". \n",
+    "\n",
+    "    * The other parameters are self-explanatory; \"seed_flavor_assignment\" changes which submission gets assigned which ice cream flavor. Keep this value if you want to be able to revert the anonymization.\n",
+    "\n",
+    "4. Run the command: `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`\n",
+    "\n",
+    "You can run the following cell to visualize your volumes (more precisely, a projection of them)\n"
    ]
   },
   {
@@ -207,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2024-06-13T07:43:16.259106Z",
@@ -240,12 +277,10 @@
    "source": [
     "n_submissions = 2  # change this to however many submissions you preprocessed\n",
     "\n",
-    "fig, ax = plt.subplots(2, 6, figsize=(20, 8))  # change values here too\n",
+    "fig, ax = plt.subplots(1, 2, figsize=(10, 4))  # change values here too\n",
     "\n",
     "for i in range(n_submissions):\n",
-    "    idx = np.random.randint(\n",
-    "        0, 20\n",
-    "    )  # doing random volumes to check that everything went fine\n",
+    "    idx = 0\n",
     "\n",
     "    submission = torch.load(os.path.join(full_output_path, f\"submission_{i}.pt\"))\n",
     "    print(submission[\"volumes\"].shape, submission[\"id\"])\n",
@@ -256,9 +291,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "cryo-challenge-kernel",
    "language": "python",
-   "name": "python3"
+   "name": "cryo-challenge-kernel"
   },
   "language_info": {
    "codemirror_mode": {
@@ -270,7 +305,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.17"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,