diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7b94a6ed..f724a809 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,10 +7,8 @@ on: - release jobs: - build: - + tests: runs-on: self-hosted - timeout-minutes: 60 steps: - name: Checkout repository uses: actions/checkout@v2 @@ -47,6 +45,7 @@ jobs: # pytest-coverage-path: ./pytest-coverage.txt # junitxml-path: ./pytest.xml + - name: Execute Geneformer v1 run: | python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048" @@ -66,8 +65,37 @@ jobs: - name: Execute Hyena run: | python examples/run_models/run_hyena_dna.py - + - name: Execute benchmarking run: | pip install scanorama python examples/run_benchmark.py + + notebooks: + needs: tests + runs-on: self-hosted + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: setup python + uses: actions/setup-python@v5 + with: + python-version: 3.11.8 + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Reduce datasets to speedup checks + run: | + sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb + sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb + sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb + sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb + sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb + sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb + + - name: Run Notebooks + run: | + pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb \ No newline at end of file diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml deleted file mode 100644 index 0486d967..00000000 --- a/.github/workflows/notebooks.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Notebooks - -on: - workflow_run: - workflows: ["CI Pipeline"] - types: - - completed - -jobs: - notebooks: - runs-on: self-hosted - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: setup python - uses: actions/setup-python@v5 - with: - python-version: 3.11.8 - - - name: Install dependencies - run: | - pip install -r requirements-dev.txt - - - name: Reduce datasets to speedup checks - run: | - sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb - sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb - sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb - sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb - sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb - sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb - - - name: Run Notebooks - run: | - pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb \ No newline at end of file diff --git a/ci/download_all.py b/ci/download_all.py index 7eda0a34..d77acc58 100644 --- a/ci/download_all.py +++ b/ci/download_all.py @@ -66,6 +66,7 @@ def main(): downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt") downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt") + downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true") return True if __name__ == "__main__": diff --git a/examples/config.yaml b/examples/config.yaml index 9126bdaf..9bd8c3ea 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -16,7 +16,7 @@ data: batch_key: "batch" label_key: "LVL1" path: "helical-ai/yolksac_human" - gene_names: "gene_name" + gene_names: "index" name: "helical-ai/yolksac_human" diff --git a/examples/run_benchmark.py b/examples/run_benchmark.py index 1a45eef7..26ba2c25 100644 --- a/examples/run_benchmark.py +++ b/examples/run_benchmark.py @@ -147,15 +147,19 @@ def benchmark(cfg: DictConfig) -> None: head_cfg = cfg["svm"] integration_cfg = cfg["integration"] - hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - data = get_anndata_from_hf_dataset(hf_dataset)[:100] + # either load via huggingface + # hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # data = get_anndata_from_hf_dataset(hf_dataset)[:10] + + # or load directly + data = ad.read_h5ad("./yolksac_human.h5ad")[:10] data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category") # set gene names. for example if the index is the ensemble gene id # data.var_names = data.var["feature_name"] - run_classification_example(data, ["geneformer", "scgpt"], data_cfg, head_cfg, device=cfg["device"]) - # run_integration_example(data, ["geneformer", "scgpt", "scanorama"], data_cfg, integration_cfg, device=cfg["device"]) + run_classification_example(data, ["scgpt", "geneformer"], data_cfg, head_cfg, device=cfg["device"]) + # run_integration_example(data, ["scgpt", "geneformer", "scanorama"], data_cfg, integration_cfg, device=cfg["device"]) LOGGER.info("Benchmarking done.") if __name__ == "__main__": diff --git a/examples/run_models/run_geneformer.py b/examples/run_models/run_geneformer.py index 7d69b5b4..93f4abe9 100644 --- a/examples/run_models/run_geneformer.py +++ b/examples/run_models/run_geneformer.py @@ -1,8 +1,9 @@ from helical import Geneformer,GeneformerConfig -from helical.utils import get_anndata_from_hf_dataset import hydra from omegaconf import DictConfig +import anndata as ad from datasets import load_dataset +from helical.utils import get_anndata_from_hf_dataset @hydra.main(version_base=None, config_path="configs", config_name="geneformer_config") @@ -10,15 +11,17 @@ def run(cfg: DictConfig): geneformer_config = GeneformerConfig(**cfg) geneformer = Geneformer(configurer = geneformer_config) - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # either load via huggingface + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") dataset = geneformer.process_data(ann_data[:10]) embeddings = geneformer.get_embeddings(dataset) print(embeddings.shape) - - if __name__ == "__main__": run() \ No newline at end of file diff --git a/examples/run_models/run_scgpt.py b/examples/run_models/run_scgpt.py index fa5d7e12..7ce27793 100644 --- a/examples/run_models/run_scgpt.py +++ b/examples/run_models/run_scgpt.py @@ -1,6 +1,7 @@ from helical.models.scgpt.model import scGPT, scGPTConfig import hydra from omegaconf import DictConfig +import anndata as ad from datasets import load_dataset from helical.utils import get_anndata_from_hf_dataset @@ -9,8 +10,12 @@ def run(cfg: DictConfig): scgpt_config = scGPTConfig(**cfg) scgpt = scGPT(configurer = scgpt_config) - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # either load via huggingface + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") data = scgpt.process_data(ann_data[:10]) embeddings = scgpt.get_embeddings(data) diff --git a/examples/run_models/run_uce.py b/examples/run_models/run_uce.py index 985b4886..0b57bf17 100644 --- a/examples/run_models/run_uce.py +++ b/examples/run_models/run_uce.py @@ -2,6 +2,7 @@ import hydra from omegaconf import DictConfig import numpy as np +import anndata as ad from datasets import load_dataset from helical.utils import get_anndata_from_hf_dataset @@ -13,9 +14,13 @@ def run(cfg: DictConfig): configurer=UCEConfig(**cfg) uce = UCE(configurer=configurer) + # either load via huggingface - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") batch_size = 10 @@ -38,5 +43,6 @@ def run(cfg: DictConfig): # Concatenate the embeddings from each batch all_embeddings = np.concatenate(all_embeddings, axis=0) print(all_embeddings.shape) + if __name__ == "__main__": run() \ No newline at end of file