From 9e09d1145c038efac6499b13d678eb819e776da8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 17:46:18 +0200
Subject: [PATCH 1/6] Change workflow name

---
 .github/workflows/notebooks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
index 0486d967..da9863c1 100644
--- a/.github/workflows/notebooks.yml
+++ b/.github/workflows/notebooks.yml
@@ -1,4 +1,4 @@
-name: Notebooks
+name: notebooks
 
 on:
   workflow_run:

From cb20600c1b510f19e2614017821bc7fe11c41422 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:10:20 +0200
Subject: [PATCH 2/6] Add branch to workflow

---
 .github/workflows/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7b94a6ed..98e85761 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,6 +5,7 @@ on:
     branches:
       - main
       - release
+      - change-workflow-name
 
 jobs:
   build:

From 6c67323b133ed013485484bf3be2d9a0d8a0d43f Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:44:21 +0200
Subject: [PATCH 3/6] Change workflow to be in single file

---
 .github/workflows/main.yml      | 33 ++++++++++++++++++++++++++---
 .github/workflows/notebooks.yml | 37 ---------------------------------
 2 files changed, 30 insertions(+), 40 deletions(-)
 delete mode 100644 .github/workflows/notebooks.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 98e85761..253abf34 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,11 +5,9 @@ on:
     branches:
       - main
       - release
-      - change-workflow-name
 
 jobs:
-  build:
-
+  tests:
     runs-on: self-hosted
     timeout-minutes: 60
     steps:
@@ -72,3 +70,32 @@ jobs:
       run: |
         pip install scanorama
         python examples/run_benchmark.py
+
+  notebooks:
+    needs: tests
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Reduce datasets to speedup checks
+        run: |
+          sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+
+      - name: Run Notebooks
+        run: |
+          pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
\ No newline at end of file
diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
deleted file mode 100644
index da9863c1..00000000
--- a/.github/workflows/notebooks.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: notebooks
-
-on:
-  workflow_run:
-    workflows: ["CI Pipeline"]
-    types:
-      - completed
-
-jobs:
-  notebooks:
-    runs-on: self-hosted
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Reduce datasets to speedup checks
-        run: |
-          sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
-          sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
-          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-
-      - name: Run Notebooks
-        run: |
-          pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
\ No newline at end of file

From 9980fb13f475b2aea119b9ca9c4c7398eac162a8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:51:14 +0200
Subject: [PATCH 4/6] Have a more granular approach to run the pipeline

---
 .github/workflows/main.yml | 43 ++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 253abf34..3c3440c0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -7,9 +7,8 @@ on:
       - release
 
 jobs:
-  tests:
+  setup:
     runs-on: self-hosted
-    timeout-minutes: 60
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -29,6 +28,18 @@ jobs:
       run: |
         python ci/download_all.py
 
+  tests:
+    runs-on: self-hosted
+    needs: setup
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute unittests
         run: |
           pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
@@ -46,6 +57,18 @@ jobs:
 #     pytest-coverage-path: ./pytest-coverage.txt
 #     junitxml-path: ./pytest.xml
 
+  inference:
+    runs-on: self-hosted
+    needs: tests
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute Geneformer v1
         run: |
           python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
@@ -65,14 +88,26 @@ jobs:
       - name: Execute Hyena
         run: |
           python examples/run_models/run_hyena_dna.py
-
+
+  benchmarking:
+    needs: inference
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute benchmarking
         run: |
           pip install scanorama
           python examples/run_benchmark.py
 
   notebooks:
-    needs: tests
+    needs: benchmarking
     runs-on: self-hosted
     steps:
       - name: Checkout repository

From 4d6d39231f56e16302c80d8cbd196fd5933e0935 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Thu, 3 Oct 2024 07:57:10 +0200
Subject: [PATCH 5/6] Speed up pipeline by loading anndata directly instead of
 loading it via Huggingface

---
 .github/workflows/main.yml            | 39 ++-------------------------
 ci/download_all.py                    |  1 +
 examples/config.yaml                  |  2 +-
 examples/run_benchmark.py             | 12 ++++++---
 examples/run_models/run_geneformer.py | 13 +++++----
 examples/run_models/run_scgpt.py      |  9 +++++--
 examples/run_models/run_uce.py        | 10 +++++--
 7 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3c3440c0..f724a809 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -7,7 +7,7 @@ on:
       - release
 
 jobs:
-  setup:
+  tests:
     runs-on: self-hosted
     steps:
       - name: Checkout repository
@@ -28,18 +28,6 @@ jobs:
       run: |
         python ci/download_all.py
 
-  tests:
-    runs-on: self-hosted
-    needs: setup
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute unittests
         run: |
           pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
@@ -57,18 +45,6 @@ jobs:
 #     pytest-coverage-path: ./pytest-coverage.txt
 #     junitxml-path: ./pytest.xml
 
-  inference:
-    runs-on: self-hosted
-    needs: tests
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute Geneformer v1
         run: |
           python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
@@ -88,26 +64,14 @@ jobs:
       - name: Execute Hyena
         run: |
           python examples/run_models/run_hyena_dna.py
 
-  benchmarking:
-    needs: inference
-    runs-on: self-hosted
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute benchmarking
         run: |
           pip install scanorama
           python examples/run_benchmark.py
 
   notebooks:
-    needs: benchmarking
+    needs: tests
     runs-on: self-hosted
     steps:
       - name: Checkout repository
diff --git a/ci/download_all.py b/ci/download_all.py
index 7eda0a34..d77acc58 100644
--- a/ci/download_all.py
+++ b/ci/download_all.py
@@ -66,6 +66,7 @@ def main():
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")
 
+    downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
     return True
 
 if __name__ == "__main__":
diff --git a/examples/config.yaml b/examples/config.yaml
index 9126bdaf..9bd8c3ea 100644
--- a/examples/config.yaml
+++ b/examples/config.yaml
@@ -16,7 +16,7 @@ data:
   batch_key: "batch"
   label_key: "LVL1"
   path: "helical-ai/yolksac_human"
-  gene_names: "gene_name"
+  gene_names: "index"
   name: "helical-ai/yolksac_human"
 
 
diff --git a/examples/run_benchmark.py b/examples/run_benchmark.py
index 1a45eef7..26ba2c25 100644
--- a/examples/run_benchmark.py
+++ b/examples/run_benchmark.py
@@ -147,15 +147,19 @@ def benchmark(cfg: DictConfig) -> None:
     head_cfg = cfg["svm"]
     integration_cfg = cfg["integration"]
 
-    hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    data = get_anndata_from_hf_dataset(hf_dataset)[:100]
+    # either load via huggingface
+    # hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # data = get_anndata_from_hf_dataset(hf_dataset)[:10]
+
+    # or load directly
+    data = ad.read_h5ad("./yolksac_human.h5ad")[:10]
 
     data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")
 
     # set gene names. for example if the index is the ensemble gene id
     # data.var_names = data.var["feature_name"]
 
-    run_classification_example(data, ["geneformer", "scgpt"], data_cfg, head_cfg, device=cfg["device"])
-    # run_integration_example(data, ["geneformer", "scgpt", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
+    run_classification_example(data, ["scgpt", "geneformer"], data_cfg, head_cfg, device=cfg["device"])
+    # run_integration_example(data, ["scgpt", "geneformer", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
     LOGGER.info("Benchmarking done.")
 
 if __name__ == "__main__":
diff --git a/examples/run_models/run_geneformer.py b/examples/run_models/run_geneformer.py
index 7d69b5b4..93f4abe9 100644
--- a/examples/run_models/run_geneformer.py
+++ b/examples/run_models/run_geneformer.py
@@ -1,8 +1,9 @@
 from helical import Geneformer,GeneformerConfig
-from helical.utils import get_anndata_from_hf_dataset
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
+from helical.utils import get_anndata_from_hf_dataset
 
 
 @hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
@@ -10,15 +11,17 @@ def run(cfg: DictConfig):
     geneformer_config = GeneformerConfig(**cfg)
     geneformer = Geneformer(configurer = geneformer_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     dataset = geneformer.process_data(ann_data[:10])
     embeddings = geneformer.get_embeddings(dataset)
 
     print(embeddings.shape)
 
-
-
 if __name__ == "__main__":
     run()
\ No newline at end of file
diff --git a/examples/run_models/run_scgpt.py b/examples/run_models/run_scgpt.py
index fa5d7e12..7ce27793 100644
--- a/examples/run_models/run_scgpt.py
+++ b/examples/run_models/run_scgpt.py
@@ -1,6 +1,7 @@
 from helical.models.scgpt.model import scGPT, scGPTConfig
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -9,8 +10,12 @@ def run(cfg: DictConfig):
     scgpt_config = scGPTConfig(**cfg)
     scgpt = scGPT(configurer = scgpt_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     data = scgpt.process_data(ann_data[:10])
     embeddings = scgpt.get_embeddings(data)
diff --git a/examples/run_models/run_uce.py b/examples/run_models/run_uce.py
index 985b4886..0b57bf17 100644
--- a/examples/run_models/run_uce.py
+++ b/examples/run_models/run_uce.py
@@ -2,6 +2,7 @@
 import hydra
 from omegaconf import DictConfig
 import numpy as np
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -13,9 +14,13 @@ def run(cfg: DictConfig):
     configurer=UCEConfig(**cfg)
     uce = UCE(configurer=configurer)
 
+    # either load via huggingface
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     batch_size = 10
 
@@ -38,5 +43,6 @@ def run(cfg: DictConfig):
     # Concatenate the embeddings from each batch
     all_embeddings = np.concatenate(all_embeddings, axis=0)
     print(all_embeddings.shape)
+
 if __name__ == "__main__":
     run()
\ No newline at end of file

From af3f6aa7cf7f839c8a917be2dd6ecff30f0b6cb8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Thu, 3 Oct 2024 11:15:43 +0200
Subject: [PATCH 6/6] Fix bug where model would not stop execution if no genes
 were present (such as in the case of passing mouse data)

---
 ci/tests/test_geneformer/test_geneformer_model.py | 3 ++-
 helical/models/geneformer/model.py                | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ci/tests/test_geneformer/test_geneformer_model.py b/ci/tests/test_geneformer/test_geneformer_model.py
index d029aa9b..fd6a2e9b 100644
--- a/ci/tests/test_geneformer/test_geneformer_model.py
+++ b/ci/tests/test_geneformer/test_geneformer_model.py
@@ -52,7 +52,8 @@ def test_ensure_data_validity_raising_error_with_missing_ensembl_id_column(self,
 
     @pytest.mark.parametrize("gene_symbols, raises_error",
                              [
-                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True),
+                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True), # humans
+                                (['ENSMUSG00000021033', 'ENSMUSG00000021033', 'ENSMUSG00000021033'], True), # mice
                                 (['SAMD11', 'None', 'HES4'], True),
                                 (['SAMD11', 'PLEKHN1', 'HES4'], False),
                              ]
diff --git a/helical/models/geneformer/model.py b/helical/models/geneformer/model.py
index 5ca23f78..b7139b78 100644
--- a/helical/models/geneformer/model.py
+++ b/helical/models/geneformer/model.py
@@ -145,7 +145,7 @@ def process_data(self,
 
         # map gene symbols to ensemble ids if provided
         if gene_names != "ensembl_id":
-            if (adata.var[gene_names].str.startswith("ENSG").all()) or (adata.var[gene_names].str.startswith("None").any()):
+            if (adata.var[gene_names].str.startswith("ENS").all()) or (adata.var[gene_names].str.startswith("None").any()):
                 message = "It seems an anndata with 'ensemble ids' and/or 'None' was passed. " \
                     "Please set gene_names='ensembl_id' and remove 'None's to skip mapping."
                 LOGGER.info(message)
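
Reviewer note on PATCH 6/6 (not part of the patch series): human Ensembl gene IDs start with "ENSG" while mouse IDs start with "ENSMUSG", so the original startswith("ENSG") check let mouse AnnData pass into the symbol-to-Ensembl mapping, where no genes could be mapped and the model did not stop. Relaxing the prefix to "ENS" catches both species. A minimal, self-contained sketch of that check in plain pandas follows; the helper name and the human IDs are illustrative (not part of the Helical code), while the mouse ID is the one used in the new test case.

import pandas as pd

def looks_like_ensembl_ids(gene_names: pd.Series) -> bool:
    # True when every entry already looks like an Ensembl ID ("ENS..." prefix),
    # i.e. the caller should pass gene_names="ensembl_id" instead of relying on
    # symbol-to-Ensembl mapping.
    return gene_names.str.startswith("ENS").all()

human = pd.Series(["ENSG00000187634", "ENSG00000188976"])  # illustrative human IDs
mouse = pd.Series(["ENSMUSG00000021033"] * 3)              # ID from the new test case
symbols = pd.Series(["SAMD11", "PLEKHN1", "HES4"])         # gene symbols, still mapped

assert looks_like_ensembl_ids(human)        # already caught by the old "ENSG" check
assert looks_like_ensembl_ids(mouse)        # only caught once the prefix is "ENS"
assert not looks_like_ensembl_ids(symbols)  # the mapping path stays unchanged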
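
Reviewer note on PATCH 5/6 (not part of the patch series): the CI now fetches the yolksac_human AnnData file once via downloader.download_via_link and the example scripts read it with anndata instead of rebuilding it from the Hugging Face dataset on every run. A hypothetical way to reproduce that data path locally without the Helical downloader is sketched below; the urllib fetch is an assumption, while the URL and the [:10] slice come from the patch itself.

from pathlib import Path
import urllib.request

import anndata as ad

URL = ("https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/"
       "data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
path = Path("yolksac_human.h5ad")

if not path.exists():
    # One-off download; the workflow itself relies on ci/download_all.py for this step.
    urllib.request.urlretrieve(URL, path)

ann_data = ad.read_h5ad(path)[:10]  # small slice, as in the patched example scripts
print(ann_data)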