Merge pull request #108 from helicalAI/main

Change workflow to execute tests and notebooks sequentially
helicalAI · Oct 3, 2024 · d254fe4 · d254fe4
2 parents 77b32c0 + 72fec17
commit d254fe4
Show file tree

Hide file tree

Showing 10 changed files with 68 additions and 57 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -7,10 +7,8 @@ on:
       - release
 
 jobs:
-  build:
-
+  tests:
     runs-on: self-hosted
-    timeout-minutes: 60 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -47,6 +45,7 @@ jobs:
       #     pytest-coverage-path: ./pytest-coverage.txt
       #     junitxml-path: ./pytest.xml
 
+
       - name: Execute Geneformer v1
         run: |
           python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
@@ -66,8 +65,37 @@ jobs:
       - name: Execute Hyena
         run: |
           python examples/run_models/run_hyena_dna.py
-      
+
       - name: Execute benchmarking
         run: |
           pip install scanorama
           python examples/run_benchmark.py
+
+  notebooks:
+    needs: tests
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
+      - name: Install dependencies
+        run: |
+            pip install -r requirements-dev.txt
+            
+      - name: Reduce datasets to speedup checks
+        run: |
+          sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+
+      - name: Run Notebooks
+        run: |
+          pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
diff --git a/ci/download_all.py b/ci/download_all.py
@@ -66,6 +66,7 @@ def main():
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")
 
+    downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
     return True
 
 if __name__ == "__main__":

diff --git a/ci/tests/test_geneformer/test_geneformer_model.py b/ci/tests/test_geneformer/test_geneformer_model.py
@@ -52,7 +52,8 @@ def test_ensure_data_validity_raising_error_with_missing_ensembl_id_column(self,
 
     @pytest.mark.parametrize("gene_symbols, raises_error",
                              [
-                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True),
+                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True), # humans
+                                (['ENSMUSG00000021033', 'ENSMUSG00000021033', 'ENSMUSG00000021033'], True), # mice
                                 (['SAMD11', 'None', 'HES4'], True),
                                 (['SAMD11', 'PLEKHN1', 'HES4'], False),
                              ]

diff --git a/examples/config.yaml b/examples/config.yaml
@@ -16,7 +16,7 @@ data:
       batch_key: "batch"
       label_key: "LVL1"
       path: "helical-ai/yolksac_human"
-      gene_names: "gene_name"
+      gene_names: "index"
       name: "helical-ai/yolksac_human"
 
 

diff --git a/examples/run_benchmark.py b/examples/run_benchmark.py
@@ -147,15 +147,19 @@ def benchmark(cfg: DictConfig) -> None:
     head_cfg = cfg["svm"]
     integration_cfg = cfg["integration"]
 
-    hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    data = get_anndata_from_hf_dataset(hf_dataset)[:100]
+    # either load via huggingface
+    # hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # data = get_anndata_from_hf_dataset(hf_dataset)[:10]
+
+    # or load directly
+    data = ad.read_h5ad("./yolksac_human.h5ad")[:10]
     data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")
 
     # set gene names. for example if the index is the ensemble gene id 
     # data.var_names = data.var["feature_name"]
 
-    run_classification_example(data, ["geneformer", "scgpt"], data_cfg, head_cfg, device=cfg["device"])
-    # run_integration_example(data, ["geneformer", "scgpt", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
+    run_classification_example(data, ["scgpt", "geneformer"], data_cfg, head_cfg, device=cfg["device"])
+    # run_integration_example(data, ["scgpt", "geneformer", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
     LOGGER.info("Benchmarking done.")
 
 if __name__ == "__main__":

diff --git a/examples/run_models/run_geneformer.py b/examples/run_models/run_geneformer.py
@@ -1,24 +1,27 @@
 from helical import Geneformer,GeneformerConfig
-from helical.utils import get_anndata_from_hf_dataset
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
+from helical.utils import get_anndata_from_hf_dataset
 
 
 @hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
 def run(cfg: DictConfig):
     geneformer_config = GeneformerConfig(**cfg)
     geneformer = Geneformer(configurer = geneformer_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     dataset = geneformer.process_data(ann_data[:10])
     embeddings = geneformer.get_embeddings(dataset)
 
     print(embeddings.shape)
 
-
-
 if __name__ == "__main__":
     run()
diff --git a/examples/run_models/run_scgpt.py b/examples/run_models/run_scgpt.py
@@ -1,6 +1,7 @@
 from helical.models.scgpt.model import scGPT, scGPTConfig
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -9,8 +10,12 @@ def run(cfg: DictConfig):
     scgpt_config = scGPTConfig(**cfg)
     scgpt = scGPT(configurer = scgpt_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     data = scgpt.process_data(ann_data[:10])
     embeddings = scgpt.get_embeddings(data)

diff --git a/examples/run_models/run_uce.py b/examples/run_models/run_uce.py
@@ -2,6 +2,7 @@
 import hydra
 from omegaconf import DictConfig
 import numpy as np
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -13,9 +14,13 @@
 def run(cfg: DictConfig):
     configurer=UCEConfig(**cfg)
     uce = UCE(configurer=configurer)
+    # either load via huggingface
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     batch_size = 10
 
@@ -38,5 +43,6 @@ def run(cfg: DictConfig):
     # Concatenate the embeddings from each batch
     all_embeddings = np.concatenate(all_embeddings, axis=0)
     print(all_embeddings.shape)
+
 if __name__ == "__main__":
     run()
diff --git a/helical/models/geneformer/model.py b/helical/models/geneformer/model.py
@@ -145,7 +145,7 @@ def process_data(self,
 
         # map gene symbols to ensemble ids if provided
         if gene_names != "ensembl_id":
-            if (adata.var[gene_names].str.startswith("ENSG").all()) or (adata.var[gene_names].str.startswith("None").any()):
+            if (adata.var[gene_names].str.startswith("ENS").all()) or (adata.var[gene_names].str.startswith("None").any()):
                 message = "It seems an anndata with 'ensemble ids' and/or 'None' was passed. " \
                 "Please set gene_names='ensembl_id' and remove 'None's to skip mapping."
                 LOGGER.info(message)