Revert "64 set up GitHub lfs for testing, and ruff for linting"

flatironinstitute · Aug 6, 2024 · 758f236 · 758f236
1 parent 2b177c3
commit 758f236
Show file tree

Hide file tree

Showing 42 changed files with 226 additions and 393 deletions.
diff --git a/.gitattributes b/.gitattributes
diff --git a/.github/workflows/main_merge_check.yml b/.github/workflows/main_merge_check.yml
@@ -11,4 +11,4 @@ jobs:
         if: github.base_ref == 'main' && github.head_ref != 'dev'
         run: |
           echo "ERROR: You can only merge to main from dev."
-          exit 1
+          exit 1
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -26,22 +26,30 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: 'pip' # caching pip dependencies
 
-      - name: Install Git LFS
-        run: |
-          sudo apt-get update
-          sudo apt-get install git-lfs
-          git lfs install
-      - name: Pull LFS Files
-        run: git lfs pull
+      - name: Cache test data
+        id: cache_test_data
+        uses: actions/cache@v3
+        with:
+          path: |
+            tests/data
+            data
+          key: venv-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('**/tests/scripts/fetch_test_data.sh') }}
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install .
           pip install pytest omegaconf
-
+          
+      - name: Get test data from OSF
+        if: ${{ steps.cache_test_data.outputs.cache-hit != 'true' }}
+        run: |
+          sh tests/scripts/fetch_test_data.sh
+          
       - name: Test with pytest
         run: |
           pytest tests/test_preprocessing.py
           pytest tests/test_svd.py
           pytest tests/test_map_to_map.py
           pytest tests/test_distribution_to_distribution.py
+          
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,9 @@ data/dataset_1_submissions
 data/dataset_2_ground_truth
 
 # data for testing and resulting outputs
+tests/data/Ground_truth
+tests/data/dataset_2_submissions/
+tests/data/unprocessed_dataset_2_submissions/submission_x/
 tests/results/
 
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,7 +10,6 @@ repos:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
     -   id: check-yaml
-    -   id: check-added-large-files
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
   rev: v0.3.4

diff --git a/README.md b/README.md
@@ -41,9 +41,6 @@ pip install .
 ```
 
 ## Developer installation
-
-First of all, make sure to have git lfs installed, otherwise you will have no access to the testing data. For installing, please follow these [guidelines](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage).
-
 If you are interested in testing the programs previously installed, please install the repository in development mode with the following commands:
 
 ```bash
@@ -55,6 +52,7 @@ The test included in the repo can be executed with PyTest as shown below:
 
 ```bash
 cd /path/to/Cryo-EM-Heterogeneity-Challenge-1
+sh tests/scripts/fetch_test_data.sh # download test data from OSF
 pytest tests/test_preprocessing.py
 pytest tests/test_svd.py
 pytest tests/test_map_to_map.py

diff --git a/config_files/config_distribution_to_distribution.yaml b/config_files/config_distribution_to_distribution.yaml
@@ -12,4 +12,4 @@ cvxpy_solver: ECOS
 optimal_q_kl:
   n_iter: 100000
   break_atol: 0.0001
-output_fname: results/distribution_to_distribution_submission_0.pkl
+output_fname: results/distribution_to_distribution_submission_0.pkl
diff --git a/config_files/config_map_to_map_distance_matrix.yaml b/config_files/config_map_to_map_distance_matrix.yaml
@@ -1,15 +1,15 @@
 data:
   n_pix: 224
-  psize: 2.146
+  psize: 2.146 
   submission:
     fname: data/dataset_2_ground_truth/submission_0.pt
     volume_key: volumes
     metadata_key: populations
     label_key: id
   ground_truth:
-    volumes: data/dataset_2_ground_truth/maps_gt_flat.pt
-    metadata: data/dataset_2_ground_truth/metadata.csv
-  mask:
+    volumes: data/dataset_2_ground_truth/maps_gt_flat.pt 
+    metadata: data/dataset_2_ground_truth/metadata.csv 
+  mask: 
     do: true
     volume: data/dataset_2_ground_truth/mask_dilated_wide_224x224.mrc
 analysis:
@@ -23,4 +23,4 @@ analysis:
   normalize:
     do: true
     method: median_zscore
-output: results/map_to_map_distance_matrix_submission_0.pkl
+output: results/map_to_map_distance_matrix_submission_0.pkl
diff --git a/src/cryo_challenge/__init__.py b/src/cryo_challenge/__init__.py
@@ -1,3 +1 @@
-from cryo_challenge.__about__ import __version__
-
-__all__ = ["__version__"]
+from cryo_challenge.__about__ import __version__
diff --git a/src/cryo_challenge/_distribution_to_distribution/distribution_to_distribution.py b/src/cryo_challenge/_distribution_to_distribution/distribution_to_distribution.py
@@ -2,6 +2,8 @@
 import numpy as np
 import pickle
 from scipy.stats import rankdata
+import yaml
+import argparse
 import torch
 import ot
 
@@ -12,12 +14,10 @@
 
 
 def sort_by_transport(cost):
-    m, n = cost.shape
-    _, transport = compute_wasserstein_between_distributions_from_weights_and_cost(
-        np.ones(m) / m, np.ones(n) / n, cost
-    )
-    indices = np.argsort((transport * np.arange(m)[..., None]).sum(0))
-    return cost[:, indices], indices, transport
+    m,n = cost.shape
+    _, transport = compute_wasserstein_between_distributions_from_weights_and_cost(np.ones(m) / m, np.ones(n)/n, cost)
+    indices = np.argsort((transport * np.arange(m)[...,None]).sum(0))
+    return cost[:,indices], indices, transport
 
 
 def compute_wasserstein_between_distributions_from_weights_and_cost(
@@ -65,14 +65,15 @@ def make_assignment_matrix(cost_matrix):
 
 
 def run(config):
+
     metadata_df = pd.read_csv(config["gt_metadata_fname"])
     metadata_df.sort_values("pc1", inplace=True)
 
     with open(config["input_fname"], "rb") as f:
         data = pickle.load(f)
 
     # user_submitted_populations = np.ones(80)/80
-    user_submitted_populations = data["user_submitted_populations"]  # .numpy()
+    user_submitted_populations = data["user_submitted_populations"]#.numpy()
     id = torch.load(data["config"]["data"]["submission"]["fname"])["id"]
 
     results_dict = {}
@@ -212,5 +213,5 @@ def optimal_q_kl(n_iter, x_start, A, Window, prob_gt, break_atol):
     DistributionToDistributionResultsValidator.from_dict(results_dict)
     with open(config["output_fname"], "wb") as f:
         pickle.dump(results_dict, f)
-
+    
     return results_dict
diff --git a/src/cryo_challenge/_map_to_map/map_to_map_distance_matrix.py b/src/cryo_challenge/_map_to_map/map_to_map_distance_matrix.py
@@ -42,7 +42,7 @@ def run(config):
     user_submission_label = submission[label_key]
 
     # n_trunc = 10
-    metadata_gt = pd.read_csv(config["data"]["ground_truth"]["metadata"])  # [:n_trunc]
+    metadata_gt = pd.read_csv(config["data"]["ground_truth"]["metadata"])#[:n_trunc]
 
     results_dict = {}
     results_dict["config"] = config

diff --git a/src/cryo_challenge/_ploting/plotting_utils.py b/src/cryo_challenge/_ploting/plotting_utils.py
@@ -1,7 +1,6 @@
 import numpy as np
 
-
 def res_at_fsc_threshold(fscs, threshold=0.5):
     res_fsc_half = np.argmin(fscs > threshold, axis=-1)
-    fraction_nyquist = 0.5 * res_fsc_half / fscs.shape[-1]
-    return res_fsc_half, fraction_nyquist
+    fraction_nyquist = 0.5*res_fsc_half / fscs.shape[-1]
+    return res_fsc_half, fraction_nyquist
diff --git a/src/cryo_challenge/data/__init__.py b/src/cryo_challenge/data/__init__.py
@@ -1,18 +1,6 @@
-from ._validation.config_validators import (
-    validate_input_config_disttodist as validate_input_config_disttodist,
-)
-from ._validation.config_validators import (
-    validate_config_dtd_optimal_q_kl as validate_config_dtd_optimal_q_kl,
-)
-from cryo_challenge.data._validation.output_validators import (
-    DistributionToDistributionResultsValidator as DistributionToDistributionResultsValidator,
-)
-from cryo_challenge.data._validation.output_validators import (
-    MetricDistToDistValidator as MetricDistToDistValidator,
-)
-from cryo_challenge.data._validation.output_validators import (
-    ReplicateValidatorEMD as ReplicateValidatorEMD,
-)
-from cryo_challenge.data._validation.output_validators import (
-    ReplicateValidatorKL as ReplicateValidatorKL,
-)
+from ._validation.config_validators import validate_input_config_disttodist as validate_input_config_disttodist
+from ._validation.config_validators import validate_config_dtd_optimal_q_kl as validate_config_dtd_optimal_q_kl
+from cryo_challenge.data._validation.output_validators import DistributionToDistributionResultsValidator as DistributionToDistributionResultsValidator
+from cryo_challenge.data._validation.output_validators import MetricDistToDistValidator as MetricDistToDistValidator
+from cryo_challenge.data._validation.output_validators import ReplicateValidatorEMD as ReplicateValidatorEMD
+from cryo_challenge.data._validation.output_validators import ReplicateValidatorKL as ReplicateValidatorKL
diff --git a/src/cryo_challenge/data/_io/svd_io_utils.py b/src/cryo_challenge/data/_io/svd_io_utils.py
@@ -106,16 +106,14 @@ def load_ref_vols(box_size_ds: int, path_to_volumes: str, dtype=torch.float32):
 
     # Reshape volumes to correct size
     if volumes.dim() == 2:
-        box_size = int(round((float(volumes.shape[-1]) ** (1.0 / 3.0))))
+        box_size = int(round((float(volumes.shape[-1]) ** (1. / 3.))))
         volumes = torch.reshape(volumes, (-1, box_size, box_size, box_size))
     elif volumes.dim() == 4:
         pass
     else:
-        raise ValueError(
-            f"The shape of the volumes stored in {path_to_volumes} have the unexpected shape "
-            f"{torch.shape}. Please, review the file and regenerate it so that volumes stored hasve the "
-            f"shape (num_vols, box_size ** 3) or (num_vols, box_size, box_size, box_size)."
-        )
+        raise ValueError(f"The shape of the volumes stored in {path_to_volumes} have the unexpected shape "
+                         f"{torch.shape}. Please, review the file and regenerate it so that volumes stored hasve the "
+                         f"shape (num_vols, box_size ** 3) or (num_vols, box_size, box_size, box_size).")
 
     volumes_ds = torch.empty(
         (volumes.shape[0], box_size_ds, box_size_ds, box_size_ds), dtype=dtype

diff --git a/src/cryo_challenge/data/_validation/config_validators.py b/src/cryo_challenge/data/_validation/config_validators.py
@@ -1,7 +1,7 @@
 from numbers import Number
 import pandas as pd
 import os
-
+from typing import List
 
 def validate_generic_config(config: dict, reference: dict) -> None:
     """

diff --git a/src/cryo_challenge/data/_validation/output_validators.py b/src/cryo_challenge/data/_validation/output_validators.py
@@ -13,7 +13,7 @@
 @dataclass_json
 @dataclass
 class MapToMapResultsValidator:
-    """
+    '''
     Validate the output dictionary of the map-to-map distance matrix computation.
 
     config: dict, input config dictionary.
@@ -22,8 +22,7 @@ class MapToMapResultsValidator:
     l2: dict, L2 results.
     bioem: dict, BioEM results.
     fsc: dict, FSC results.
-    """
-
+    '''
     config: dict
     user_submitted_populations: torch.Tensor
     corr: Optional[dict] = None
@@ -50,7 +49,7 @@ class ReplicateValidatorEMD:
     Validate the output dictionary of one EMD in the distribution-to-distribution pipeline.
 
     q_opt: List[float], optimal user submitted distribution, which sums to 1.
-    EMD_opt: float, EMD between the ground truth distribution (p) and the (optimized) user submitted distribution (q_opt).
+    EMD_opt: float, EMD between the ground truth distribution (p) and the (optimized) user submitted distribution (q_opt). 
         The transport plan is a joint distribution, such that:
         summing over the rows gives the (optimized) user submitted distribution, and summing over the columns gives the ground truth distribution.
     transport_plan_opt: List[List[float]], transport plan between the ground truth distribution (p, rows) and the (optimized) user submitted distribution (q_opt, columns).
@@ -62,7 +61,6 @@ class ReplicateValidatorEMD:
         The transport plan is a joint distribution, such that:
         summing over the rows gives the user submitted distribution, and summing over the columns gives the ground truth distribution.
     """
-
     q_opt: List[float]
     EMD_opt: float
     transport_plan_opt: List[List[float]]
@@ -89,9 +87,8 @@ class ReplicateValidatorKL:
     iter_stop: int, number of iterations until convergence.
     eps_stop: float, stopping criterion.
     klpq_submitted: float, KL divergence between the ground truth distribution (p) and the user submitted distribution (q).
-    klqp_submitted: float, KL divergence between the user submitted distribution (q) and the ground truth distribution (p).
+    klqp_submitted: float, KL divergence between the user submitted distribution (q) and the ground truth distribution (p).    
     """
-
     q_opt: List[float]
     klpq_opt: float
     klqp_opt: float
@@ -109,12 +106,11 @@ def __post_init__(self):
 @dataclass_json
 @dataclass
 class MetricDistToDistValidator:
-    """
+    '''
     Validate the output dictionary of one map to map metric in the distribution-to-distribution pipeline.
 
     replicates: dict, dictionary of replicates.
-    """
-
+    '''
     replicates: dict
 
     def validate_replicates(self, n_replicates):
@@ -130,7 +126,7 @@ def validate_replicates(self, n_replicates):
 @dataclass_json
 @dataclass
 class DistributionToDistributionResultsValidator:
-    """
+    '''
     Validate the output dictionary of the distribution-to-distribution pipeline.
 
     config: dict, input config dictionary.
@@ -140,8 +136,7 @@ class DistributionToDistributionResultsValidator:
     bioem: dict, BioEM distance results.
     l2: dict, L2 distance results.
     corr: dict, correlation distance results.
-    """
-
+    '''
     config: dict
     user_submitted_populations: torch.Tensor
     id: str

diff --git a/tests/config_files/test_config_map_to_map.yaml b/tests/config_files/test_config_map_to_map.yaml
@@ -2,7 +2,7 @@ data:
   n_pix: 224
   psize: 2.146
   submission:
-    fname: tests/data/dataset_2_submissions/submission_10000.pt
+    fname: tests/data/dataset_2_submissions/test_submission_0_n8.pt
     volume_key: volumes
     metadata_key: populations
     label_key: id

diff --git a/tests/config_files/test_config_svd.yaml b/tests/config_files/test_config_svd.yaml
@@ -1,6 +1,6 @@
 path_to_volumes: tests/data/dataset_2_submissions/
 box_size_ds: 32
-submission_list: [10000]
+submission_list: [0]
 experiment_mode: "all_vs_ref" # options are "all_vs_all", "all_vs_ref"
 # optional unless experiment_mode is "all_vs_ref"
 path_to_reference: tests/data/Ground_truth/test_maps_gt_flat_10.pt

diff --git a/tests/data/Ground_truth/1.mrc b/tests/data/Ground_truth/1.mrc
diff --git a/tests/data/Ground_truth/mask_dilated_wide_224x224.mrc b/tests/data/Ground_truth/mask_dilated_wide_224x224.mrc
diff --git a/tests/data/Ground_truth/test_maps_gt_flat_10.npy b/tests/data/Ground_truth/test_maps_gt_flat_10.npy
diff --git a/tests/data/Ground_truth/test_maps_gt_flat_10.pt b/tests/data/Ground_truth/test_maps_gt_flat_10.pt
diff --git a/tests/data/Ground_truth/test_metadata_10.csv b/tests/data/Ground_truth/test_metadata_10.csv
diff --git a/tests/data/dataset_2_submissions/submission_10000.pt b/tests/data/dataset_2_submissions/submission_10000.pt
diff --git a/tests/data/test_maps_gt_flat_2.pt b/tests/data/test_maps_gt_flat_2.pt
diff --git a/tests/data/unprocessed_dataset_2_submissions/submission_x/1.mrc b/tests/data/unprocessed_dataset_2_submissions/submission_x/1.mrc
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,4 +11,4 @@ jobs: @@
             if: github.base_ref == 'main' && github.head_ref != 'dev'
             run: |
               echo "ERROR: You can only merge to main from dev."
-              exit 1
+              exit 1