Skip to content

Commit

Permalink
Speed up pipeline by loading anndata directly
Browse files Browse the repository at this point in the history
instead of loading it via Hugging Face
  • Loading branch information
bputzeys committed Oct 3, 2024
1 parent 9980fb1 commit b142966
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 49 deletions.
39 changes: 2 additions & 37 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
- release

jobs:
setup:
tests:
runs-on: self-hosted
steps:
- name: Checkout repository
Expand All @@ -28,18 +28,6 @@ jobs:
run: |
python ci/download_all.py
tests:
runs-on: self-hosted
needs: setup
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8

- name: Execute unittests
run: |
pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
Expand All @@ -57,17 +45,6 @@ jobs:
# pytest-coverage-path: ./pytest-coverage.txt
# junitxml-path: ./pytest.xml

inference:
runs-on: self-hosted
needs: tests
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8

- name: Execute Geneformer v1
run: |
Expand All @@ -88,26 +65,14 @@ jobs:
- name: Execute Hyena
run: |
python examples/run_models/run_hyena_dna.py
benchmarking:
needs: inference
runs-on: self-hosted
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8
- name: Execute benchmarking
run: |
pip install scanorama
python examples/run_benchmark.py
notebooks:
needs: benchmarking
needs: tests
runs-on: self-hosted
steps:
- name: Checkout repository
Expand Down
1 change: 1 addition & 0 deletions ci/download_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def main():
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")

downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
return True

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ data:
batch_key: "batch"
label_key: "LVL1"
path: "helical-ai/yolksac_human"
gene_names: "gene_name"
gene_names: "index"
name: "helical-ai/yolksac_human"


Expand Down
8 changes: 6 additions & 2 deletions examples/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,12 @@ def benchmark(cfg: DictConfig) -> None:
head_cfg = cfg["svm"]
integration_cfg = cfg["integration"]

hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
data = get_anndata_from_hf_dataset(hf_dataset)[:100]
# either load via huggingface
# hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# data = get_anndata_from_hf_dataset(hf_dataset)[:100]

# or load directly
data = ad.read_h5ad("./yolksac_human.h5ad")[:100]
data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")

# set gene names. For example, if the index is the Ensembl gene ID
Expand Down
13 changes: 8 additions & 5 deletions examples/run_models/run_geneformer.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@
from helical import Geneformer,GeneformerConfig
from helical.utils import get_anndata_from_hf_dataset
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset


@hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
def run(cfg: DictConfig):
    """Embed a small slice of the yolk-sac dataset with Geneformer and print the embedding shape."""
    model = Geneformer(configurer=GeneformerConfig(**cfg))

    # either load via huggingface
    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
    # ann_data = get_anndata_from_hf_dataset(hf_dataset)

    # or load directly
    ann_data = ad.read_h5ad("./yolksac_human.h5ad")

    # only the first 10 cells are embedded to keep the example fast
    processed = model.process_data(ann_data[:10])
    embeddings = model.get_embeddings(processed)
    print(embeddings.shape)


if __name__ == "__main__":
    run()
9 changes: 7 additions & 2 deletions examples/run_models/run_scgpt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from helical.models.scgpt.model import scGPT, scGPTConfig
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -9,8 +10,12 @@ def run(cfg: DictConfig):
scgpt_config = scGPTConfig(**cfg)
scgpt = scGPT(configurer = scgpt_config)

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

data = scgpt.process_data(ann_data[:10])
embeddings = scgpt.get_embeddings(data)
Expand Down
10 changes: 8 additions & 2 deletions examples/run_models/run_uce.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import hydra
from omegaconf import DictConfig
import numpy as np
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -13,9 +14,13 @@
def run(cfg: DictConfig):
configurer=UCEConfig(**cfg)
uce = UCE(configurer=configurer)
# either load via huggingface

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

batch_size = 10

Expand All @@ -38,5 +43,6 @@ def run(cfg: DictConfig):
# Concatenate the embeddings from each batch
all_embeddings = np.concatenate(all_embeddings, axis=0)
print(all_embeddings.shape)

if __name__ == "__main__":
run()

0 comments on commit b142966

Please sign in to comment.