Merge pull request #40 from flatironinstitute/39-fix-small-bug-in-preprocessing-pipeline

preprocessing update and small path #39
DSilva27 authored Aug 5, 2024
2 parents e7b3304 + bb5c2f1 commit 26151d3
Showing 3 changed files with 78 additions and 33 deletions.
35 changes: 22 additions & 13 deletions src/cryo_challenge/_preprocessing/dataloader.py
@@ -25,7 +25,11 @@ class SubmissionPreprocessingDataLoader(Dataset):

def __init__(self, submission_config):
self.submission_config = submission_config
self.submission_paths, self.gt_path = self.extract_submission_paths()
self.validate_submission_config()

self.submission_paths, self.population_files, self.gt_path = (
self.extract_submission_paths()
)
self.subs_index = [int(idx) for idx in list(self.submission_config.keys())[1:]]
path_to_gt_ref = os.path.join(
self.gt_path, self.submission_config["gt"]["ref_align_fname"]
@@ -53,18 +57,24 @@ def validate_submission_config(self):
raise ValueError("Box size not found for ground truth")
if "pixel_size" not in value.keys():
raise ValueError("Pixel size not found for ground truth")
if "ref_align_fname" not in value.keys():
raise ValueError(
"Reference align file name not found for ground truth"
)
continue
else:
if "path" not in value.keys():
raise ValueError(f"Path not found for submission {key}")
if "id" not in value.keys():
raise ValueError(f"ID not found for submission {key}")
if "name" not in value.keys():
raise ValueError(f"Name not found for submission {key}")
if "box_size" not in value.keys():
raise ValueError(f"Box size not found for submission {key}")
if "pixel_size" not in value.keys():
raise ValueError(f"Pixel size not found for submission {key}")
if "align" not in value.keys():
raise ValueError(f"Align not found for submission {key}")
if "populations_file" not in value.keys():
raise ValueError(f"Population file not found for submission {key}")
if "flip" not in value.keys():
raise ValueError(f"Flip not found for submission {key}")

@@ -74,11 +84,10 @@ def validate_submission_config(self):
if not os.path.isdir(value["path"]):
raise ValueError(f"Path {value['path']} is not a directory")

ids = list(self.submission_config.keys())[1:]
if ids != list(range(len(ids))):
raise ValueError(
"Submission IDs should be integers starting from 0 and increasing by 1"
)
if not os.path.exists(value["populations_file"]):
raise ValueError(
f"Population file {value['populations_file']} does not exist"
)

return

@@ -137,13 +146,16 @@ def help(cls):

def extract_submission_paths(self):
submission_paths = []
population_files = []
for key, value in self.submission_config.items():
if key == "gt":
gt_path = value["path"]

else:
submission_paths.append(value["path"])
return submission_paths, gt_path
population_files.append(value["populations_file"])

return submission_paths, population_files, gt_path

def __len__(self):
return len(self.submission_paths)
@@ -156,10 +168,7 @@ def __getitem__(self, idx):

assert len(vol_paths) > 0, "No volumes found in submission directory"

populations = np.loadtxt(
os.path.join(self.submission_paths[idx], "populations.txt")
)
populations = torch.from_numpy(populations)
populations = torch.from_numpy(np.loadtxt(self.population_files[idx]))

vol0 = mrcfile.open(vol_paths[0], mode="r")
volumes = torch.zeros(
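To make the new validation requirements concrete, here is a minimal sketch of a `submission_config` dictionary that should pass the updated `validate_submission_config`; the key names follow the checks above, while every value (paths, sizes) is an illustrative placeholder, not real challenge data:

```python
# Minimal sketch of a submission_config satisfying the checks above.
# All paths and numeric values are placeholders.
submission_config = {
    "gt": {
        "path": "data/ground_truth",          # must be an existing directory
        "box_size": 224,
        "pixel_size": 2.146,
        "ref_align_fname": "ref_map.mrc",     # newly required by this commit
    },
    0: {                                      # submissions are keyed by integer id
        "id": 0,
        "name": "submission1",
        "box_size": 144,
        "pixel_size": 2.146,
        "align": 1,
        "flip": 1,
        "path": "data/submission1",           # must be an existing directory
        # newly required: an existing populations file (see below)
        "populations_file": "data/submission1/populations.txt",
    },
}
```

With such a config in hand, the loader is constructed as `SubmissionPreprocessingDataLoader(submission_config)`, which now validates the config before extracting the submission, population-file, and ground-truth paths.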
@@ -12,6 +12,7 @@
"box_size": 244,
"pixel_size": 2.146,
"path": "tests/data/unprocessed_dataset_2_submissions/submission_x",
"populations_file": "tests/data/unprocessed_dataset_2_submissions/submission_x/populations.txt",
"flip": 1
}
}
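The `populations_file` referenced above is read with `np.loadtxt` in the updated dataloader, so any whitespace-separated single row or column of weights works. A small self-contained sketch (the file name and values here are hypothetical):

```python
import numpy as np
import torch

# Hypothetical populations file: one weight per volume, in volume order.
weights = np.array([0.4, 0.3, 0.2, 0.1])
np.savetxt("populations.txt", weights)  # single column, parseable by np.loadtxt

# Mirrors the updated __getitem__: load with NumPy, convert to a tensor.
populations = torch.from_numpy(np.loadtxt("populations.txt"))
assert populations.shape == (4,)
```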
75 changes: 55 additions & 20 deletions tutorials/1_tutorial_preprocessing.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-17T15:40:12.854854Z",
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-17T15:40:20.557563Z",
@@ -30,7 +30,6 @@
"import os\n",
"import torch\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import yaml\n",
"from ipyfilechooser import FileChooser"
]
@@ -80,6 +79,17 @@
"display(submission1_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select path to populations (submission 1)\n",
"submission1_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
"display(submission1_pop_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -97,6 +107,26 @@
"display(submission2_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select path to populations (submission 2)\n",
"submission2_pop_path = FileChooser(path_to_sub_set.selected_path)\n",
"display(submission2_pop_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"submission2_pop_path.selected"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -116,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:40:59.387306Z",
@@ -140,6 +170,7 @@
" \"box_size\": 144,\n",
" \"pixel_size\": 1.073 * 2,\n",
" \"path\": submission1_path.selected_path,\n",
" \"populations_file\": submission1_pop_path.selected,\n",
" },\n",
" 1: {\n",
" \"name\": \"submission2\",\n",
@@ -148,13 +179,14 @@
" \"box_size\": 288,\n",
" \"pixel_size\": 1.073,\n",
" \"path\": submission2_path.selected_path,\n",
" \"populations_file\": submission2_pop_path.selected,\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:41:01.194466Z",
@@ -176,17 +208,22 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"After you create your submission_config, simply grab a copy of the file \"config_preproc.yaml\" from the provided config_files, and change the path for the \"submission_config_file\" to the file we created in the previous cell. Also change the path for the output. The rest of the parameters you can leave untouched. Please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. Then simply run\n",
"Lastly, to run the preprocessing pipeline follow these steps\n",
"\n",
"```bash\n",
"cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml\n",
"```\n",
"0. Make sure to activate your environment and have the package installed!\n",
"\n",
"Note: make sure to activate your environment and have the package installed!\n",
"1. Grab a copy of the file `config_preproc.yaml`from our config file templates.\n",
"\n",
"You can run the following cell to visualize your volumes (more precisely, a projection of them)\n",
"2. In the copied config file, update the value of `submission_config_file` to match the path to the file we created in the last cell.\n",
"\n",
"IMPORTANT: The execution of the previous program relies on the existence of file to be saved at {{ submission1_path.selected_path }} with a specific formatting. The file must be named \"populations.txt\", and should be formatted as a single row/column CSV file containing the populations computed from your results. If the previous file is not included, the execution of the program will result in a runtime error."
"3. Optionally, change the other parameters. \n",
" * Most of the parameters (BOT_* and thresh_percentile) are for the alignment. For details on how they work, please see the publication \"Singer, A., & Yang, R. (2024). Alignment of density maps in Wasserstein distance. Biological Imaging, 4, e5\" for more details. \n",
"\n",
" * The other parameters are self explanatory, \"seed_flavor_assignment\" changes which submission gets assigned which ice cream flavor, keep this if you want to revert anonymity.\n",
"\n",
"4. Run the command: `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`\n",
"\n",
"You can run the following cell to visualize your volumes (more precisely, a projection of them)\n"
]
},
{
@@ -209,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-13T07:43:16.259106Z",
@@ -242,12 +279,10 @@
"source": [
"n_submissions = 2 # change this to however many submissions you preprocessed\n",
"\n",
"fig, ax = plt.subplots(2, 6, figsize=(20, 8)) # change values here too\n",
"fig, ax = plt.subplots(1, 2, figsize=(10, 4)) # change values here too\n",
"\n",
"for i in range(n_submissions):\n",
" idx = np.random.randint(\n",
" 0, 20\n",
" ) # doing random volumes to check that everything went fine\n",
" idx = 0\n",
"\n",
" submission = torch.load(os.path.join(full_output_path, f\"submission_{i}.pt\"))\n",
" print(submission[\"volumes\"].shape, submission[\"id\"])\n",
@@ -258,9 +293,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "cryo-challenge-kernel",
"language": "python",
"name": "python3"
"name": "cryo-challenge-kernel"
},
"language_info": {
"codemirror_mode": {
@@ -272,7 +307,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
"version": "3.10.10"
}
},
"nbformat": 4,
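Following the tutorial's step list, the edited `config_preproc.yaml` can be written from the notebook itself. In this sketch only `submission_config_file`, `seed_flavor_assignment`, and `thresh_percentile` are keys named in the tutorial text; the output key name and all values are assumptions, and the template's remaining BOT_* alignment parameters are left untouched:

```python
import yaml

# Hedged sketch of the config edits described in the tutorial. Only keys
# named in the tutorial text are shown; "output_path" is an assumed name
# for the output location, and all values are illustrative.
config_updates = {
    "submission_config_file": "submission_config.json",  # file written in the notebook
    "seed_flavor_assignment": 0,   # keep fixed to be able to revert anonymity
    "thresh_percentile": 93.0,     # alignment threshold, see Singer & Yang (2024)
    "output_path": "preprocessed_output",                # assumed key name
}

with open("config_preproc.yaml", "r") as f:
    config = yaml.safe_load(f)    # start from the provided template

config.update(config_updates)     # leave the BOT_* parameters as shipped

with open("config_preproc.yaml", "w") as f:
    yaml.safe_dump(config, f)
```

After that, step 4's command, `cryo_challenge run_preprocessing --config /path/to/config_preproc.yaml`, runs the pipeline.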
