From 9e09d1145c038efac6499b13d678eb819e776da8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 17:46:18 +0200
Subject: [PATCH 1/6] Change workflow name

---
 .github/workflows/notebooks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
index 0486d967..da9863c1 100644
--- a/.github/workflows/notebooks.yml
+++ b/.github/workflows/notebooks.yml
@@ -1,4 +1,4 @@
-name: Notebooks
+name: notebooks
 
 on:
   workflow_run:

From cb20600c1b510f19e2614017821bc7fe11c41422 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:10:20 +0200
Subject: [PATCH 2/6] Add branch to workflow

---
 .github/workflows/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7b94a6ed..98e85761 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,6 +5,7 @@ on:
     branches:
       - main
       - release
+      - change-workflow-name
 
 jobs:
   build:

From 6c67323b133ed013485484bf3be2d9a0d8a0d43f Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:44:21 +0200
Subject: [PATCH 3/6] Change workflow to be in single file

---
 .github/workflows/main.yml      | 33 ++++++++++++++++++++++++++---
 .github/workflows/notebooks.yml | 37 ---------------------------------
 2 files changed, 30 insertions(+), 40 deletions(-)
 delete mode 100644 .github/workflows/notebooks.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 98e85761..253abf34 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,11 +5,9 @@ on:
     branches:
       - main
       - release
-      - change-workflow-name
 
 jobs:
-  build:
-
+  tests:
     runs-on: self-hosted
     timeout-minutes: 60
     steps:
@@ -72,3 +70,32 @@ jobs:
       run: |
         pip install scanorama
         python examples/run_benchmark.py
+
+  notebooks:
+    needs: tests
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Reduce datasets to speedup checks
+        run: |
+          sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+          sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
+
+      - name: Run Notebooks
+        run: |
+          pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
\ No newline at end of file
diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
deleted file mode 100644
index da9863c1..00000000
--- a/.github/workflows/notebooks.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: notebooks
-
-on:
-  workflow_run:
-    workflows: ["CI Pipeline"]
-    types:
-      - completed
-
-jobs:
-  notebooks:
-    runs-on: self-hosted
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
-
-      - name: Reduce datasets to speedup checks
-        run: |
-          sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
-          sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
-          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-          sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
-
-      - name: Run Notebooks
-        run: |
-          pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
\ No newline at end of file

From 9980fb13f475b2aea119b9ca9c4c7398eac162a8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Wed, 2 Oct 2024 20:51:14 +0200
Subject: [PATCH 4/6] Have a more granular approach to run the pipeline

---
 .github/workflows/main.yml | 43 ++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 253abf34..3c3440c0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -7,9 +7,8 @@ on:
       - release
 
 jobs:
-  tests:
+  setup:
     runs-on: self-hosted
-    timeout-minutes: 60
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -29,6 +28,18 @@ jobs:
       run: |
         python ci/download_all.py
 
+  tests:
+    runs-on: self-hosted
+    needs: setup
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute unittests
         run: |
           pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
@@ -46,6 +57,18 @@ jobs:
 #     pytest-coverage-path: ./pytest-coverage.txt
 #     junitxml-path: ./pytest.xml
 
+  inference:
+    runs-on: self-hosted
+    needs: tests
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute Geneformer v1
         run: |
           python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
@@ -65,14 +88,26 @@ jobs:
       - name: Execute Hyena
         run: |
           python examples/run_models/run_hyena_dna.py
-
+
+  benchmarking:
+    needs: inference
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
       - name: Execute benchmarking
         run: |
           pip install scanorama
           python examples/run_benchmark.py
 
   notebooks:
-    needs: tests
+    needs: benchmarking
     runs-on: self-hosted
     steps:
       - name: Checkout repository

From 4d6d39231f56e16302c80d8cbd196fd5933e0935 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Thu, 3 Oct 2024 07:57:10 +0200
Subject: [PATCH 5/6] Speed up pipeline by loading anndata directly instead of
 loading it via Huggingface

---
 .github/workflows/main.yml            | 39 ++-------------------------
 ci/download_all.py                    |  1 +
 examples/config.yaml                  |  2 +-
 examples/run_benchmark.py             | 12 ++++++---
 examples/run_models/run_geneformer.py | 13 +++++----
 examples/run_models/run_scgpt.py      |  9 +++++--
 examples/run_models/run_uce.py        | 10 +++++--
 7 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3c3440c0..f724a809 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -7,7 +7,7 @@ on:
       - release
 
 jobs:
-  setup:
+  tests:
     runs-on: self-hosted
     steps:
       - name: Checkout repository
@@ -28,18 +28,6 @@ jobs:
       run: |
         python ci/download_all.py
 
-  tests:
-    runs-on: self-hosted
-    needs: setup
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute unittests
         run: |
           pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
@@ -57,18 +45,6 @@ jobs:
 #     pytest-coverage-path: ./pytest-coverage.txt
 #     junitxml-path: ./pytest.xml
 
-  inference:
-    runs-on: self-hosted
-    needs: tests
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute Geneformer v1
         run: |
           python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
@@ -88,26 +64,14 @@ jobs:
       - name: Execute Hyena
         run: |
           python examples/run_models/run_hyena_dna.py
 
-  benchmarking:
-    needs: inference
-    runs-on: self-hosted
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: setup python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.11.8
-
       - name: Execute benchmarking
         run: |
           pip install scanorama
           python examples/run_benchmark.py
 
   notebooks:
-    needs: benchmarking
+    needs: tests
     runs-on: self-hosted
     steps:
       - name: Checkout repository
diff --git a/ci/download_all.py b/ci/download_all.py
index 7eda0a34..d77acc58 100644
--- a/ci/download_all.py
+++ b/ci/download_all.py
@@ -66,6 +66,7 @@ def main():
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
     downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")
 
+    downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
     return True
 
 if __name__ == "__main__":
diff --git a/examples/config.yaml b/examples/config.yaml
index 9126bdaf..9bd8c3ea 100644
--- a/examples/config.yaml
+++ b/examples/config.yaml
@@ -16,7 +16,7 @@ data:
   batch_key: "batch"
   label_key: "LVL1"
   path: "helical-ai/yolksac_human"
-  gene_names: "gene_name"
+  gene_names: "index"
   name: "helical-ai/yolksac_human"
 
 
diff --git a/examples/run_benchmark.py b/examples/run_benchmark.py
index 1a45eef7..26ba2c25 100644
--- a/examples/run_benchmark.py
+++ b/examples/run_benchmark.py
@@ -147,15 +147,19 @@ def benchmark(cfg: DictConfig) -> None:
     head_cfg = cfg["svm"]
     integration_cfg = cfg["integration"]
 
-    hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    data = get_anndata_from_hf_dataset(hf_dataset)[:100]
+    # either load via huggingface
+    # hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # data = get_anndata_from_hf_dataset(hf_dataset)[:10]
+
+    # or load directly
+    data = ad.read_h5ad("./yolksac_human.h5ad")[:10]
 
     data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")
 
     # set gene names. for example if the index is the ensemble gene id
     # data.var_names = data.var["feature_name"]
 
-    run_classification_example(data, ["geneformer", "scgpt"], data_cfg, head_cfg, device=cfg["device"])
-    # run_integration_example(data, ["geneformer", "scgpt", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
+    run_classification_example(data, ["scgpt", "geneformer"], data_cfg, head_cfg, device=cfg["device"])
+    # run_integration_example(data, ["scgpt", "geneformer", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
     LOGGER.info("Benchmarking done.")
 
 if __name__ == "__main__":
diff --git a/examples/run_models/run_geneformer.py b/examples/run_models/run_geneformer.py
index 7d69b5b4..93f4abe9 100644
--- a/examples/run_models/run_geneformer.py
+++ b/examples/run_models/run_geneformer.py
@@ -1,8 +1,9 @@
 from helical import Geneformer,GeneformerConfig
-from helical.utils import get_anndata_from_hf_dataset
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
+from helical.utils import get_anndata_from_hf_dataset
 
 
 @hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
@@ -10,15 +11,17 @@ def run(cfg: DictConfig):
     geneformer_config = GeneformerConfig(**cfg)
     geneformer = Geneformer(configurer = geneformer_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     dataset = geneformer.process_data(ann_data[:10])
     embeddings = geneformer.get_embeddings(dataset)
 
     print(embeddings.shape)
 
-
-
 if __name__ == "__main__":
     run()
\ No newline at end of file
diff --git a/examples/run_models/run_scgpt.py b/examples/run_models/run_scgpt.py
index fa5d7e12..7ce27793 100644
--- a/examples/run_models/run_scgpt.py
+++ b/examples/run_models/run_scgpt.py
@@ -1,6 +1,7 @@
 from helical.models.scgpt.model import scGPT, scGPTConfig
 import hydra
 from omegaconf import DictConfig
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -9,8 +10,12 @@ def run(cfg: DictConfig):
     scgpt_config = scGPTConfig(**cfg)
     scgpt = scGPT(configurer = scgpt_config)
 
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # either load via huggingface
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     data = scgpt.process_data(ann_data[:10])
     embeddings = scgpt.get_embeddings(data)
diff --git a/examples/run_models/run_uce.py b/examples/run_models/run_uce.py
index 985b4886..0b57bf17 100644
--- a/examples/run_models/run_uce.py
+++ b/examples/run_models/run_uce.py
@@ -2,6 +2,7 @@
 import hydra
 from omegaconf import DictConfig
 import numpy as np
+import anndata as ad
 from datasets import load_dataset
 from helical.utils import get_anndata_from_hf_dataset
 
@@ -13,9 +14,13 @@ def run(cfg: DictConfig):
     configurer=UCEConfig(**cfg)
     uce = UCE(configurer=configurer)
 
+    # either load via huggingface
-    hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
-    ann_data = get_anndata_from_hf_dataset(hf_dataset)
+    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
+    # ann_data = get_anndata_from_hf_dataset(hf_dataset)
+
+    # or load directly
+    ann_data = ad.read_h5ad("./yolksac_human.h5ad")
 
     batch_size = 10
 
@@ -38,5 +43,6 @@ def run(cfg: DictConfig):
     # Concatenate the embeddings from each batch
     all_embeddings = np.concatenate(all_embeddings, axis=0)
     print(all_embeddings.shape)
+
 if __name__ == "__main__":
     run()
\ No newline at end of file

From af3f6aa7cf7f839c8a917be2dd6ecff30f0b6cb8 Mon Sep 17 00:00:00 2001
From: Benoit Putzeys
Date: Thu, 3 Oct 2024 11:15:43 +0200
Subject: [PATCH 6/6] Fix bug where model would not stop execution if no genes
 were present (such as in the case of passing mouse data)

---
 ci/tests/test_geneformer/test_geneformer_model.py | 3 ++-
 helical/models/geneformer/model.py                | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ci/tests/test_geneformer/test_geneformer_model.py b/ci/tests/test_geneformer/test_geneformer_model.py
index d029aa9b..fd6a2e9b 100644
--- a/ci/tests/test_geneformer/test_geneformer_model.py
+++ b/ci/tests/test_geneformer/test_geneformer_model.py
@@ -52,7 +52,8 @@ def test_ensure_data_validity_raising_error_with_missing_ensembl_id_column(self,
 
     @pytest.mark.parametrize("gene_symbols, raises_error",
                              [
-                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True),
+                                (['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True), # humans
+                                (['ENSMUSG00000021033', 'ENSMUSG00000021033', 'ENSMUSG00000021033'], True), # mice
                                 (['SAMD11', 'None', 'HES4'], True),
                                 (['SAMD11', 'PLEKHN1', 'HES4'], False),
                              ]
diff --git a/helical/models/geneformer/model.py b/helical/models/geneformer/model.py
index 5ca23f78..b7139b78 100644
--- a/helical/models/geneformer/model.py
+++ b/helical/models/geneformer/model.py
@@ -145,7 +145,7 @@ def process_data(self,
 
         # map gene symbols to ensemble ids if provided
         if gene_names != "ensembl_id":
-            if (adata.var[gene_names].str.startswith("ENSG").all()) or (adata.var[gene_names].str.startswith("None").any()):
+            if (adata.var[gene_names].str.startswith("ENS").all()) or (adata.var[gene_names].str.startswith("None").any()):
                 message = "It seems an anndata with 'ensemble ids' and/or 'None' was passed. " \
                     "Please set gene_names='ensembl_id' and remove 'None's to skip mapping."
                 LOGGER.info(message)
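
Reviewer note on PATCH 6/6 (not part of the patch series): human Ensembl gene IDs start with "ENSG" while mouse IDs start with "ENSMUSG", so the original startswith("ENSG") check let mouse AnnData pass into the symbol-to-Ensembl mapping, where no genes could be mapped and the model did not stop. Relaxing the prefix to "ENS" catches both species. A minimal, self-contained sketch of that check in plain pandas follows; the helper name and the human IDs are illustrative (not part of the Helical code), while the mouse ID is the one used in the new test case.

import pandas as pd

def looks_like_ensembl_ids(gene_names: pd.Series) -> bool:
    # True when every entry already looks like an Ensembl ID ("ENS..." prefix),
    # i.e. the caller should pass gene_names="ensembl_id" instead of relying on
    # symbol-to-Ensembl mapping.
    return gene_names.str.startswith("ENS").all()

human = pd.Series(["ENSG00000187634", "ENSG00000188976"])  # illustrative human IDs
mouse = pd.Series(["ENSMUSG00000021033"] * 3)              # ID from the new test case
symbols = pd.Series(["SAMD11", "PLEKHN1", "HES4"])         # gene symbols, still mapped

assert looks_like_ensembl_ids(human)        # already caught by the old "ENSG" check
assert looks_like_ensembl_ids(mouse)        # only caught once the prefix is "ENS"
assert not looks_like_ensembl_ids(symbols)  # the mapping path stays unchanged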
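
Reviewer note on PATCH 5/6 (not part of the patch series): the CI now fetches the yolksac_human AnnData file once via downloader.download_via_link and the example scripts read it with anndata instead of rebuilding it from the Hugging Face dataset on every run. A hypothetical way to reproduce that data path locally without the Helical downloader is sketched below; the urllib fetch is an assumption, while the URL and the [:10] slice come from the patch itself.

from pathlib import Path
import urllib.request

import anndata as ad

URL = ("https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/"
       "data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
path = Path("yolksac_human.h5ad")

if not path.exists():
    # One-off download; the workflow itself relies on ci/download_all.py for this step.
    urllib.request.urlretrieve(URL, path)

ann_data = ad.read_h5ad(path)[:10]  # small slice, as in the patched example scripts
print(ann_data)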