diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3c3440c0..f724a809 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ on: - release jobs: - setup: + tests: runs-on: self-hosted steps: - name: Checkout repository @@ -28,18 +28,6 @@ jobs: run: | python ci/download_all.py - tests: - runs-on: self-hosted - needs: setup - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: setup python - uses: actions/setup-python@v5 - with: - python-version: 3.11.8 - - name: Execute unittests run: | pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/ @@ -57,17 +45,6 @@ jobs: # pytest-coverage-path: ./pytest-coverage.txt # junitxml-path: ./pytest.xml - inference: - runs-on: self-hosted - needs: tests - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: setup python - uses: actions/setup-python@v5 - with: - python-version: 3.11.8 - name: Execute Geneformer v1 run: | @@ -88,18 +65,6 @@ jobs: - name: Execute Hyena run: | python examples/run_models/run_hyena_dna.py - - benchmarking: - needs: inference - runs-on: self-hosted - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: setup python - uses: actions/setup-python@v5 - with: - python-version: 3.11.8 - name: Execute benchmarking run: | @@ -107,7 +72,7 @@ jobs: python examples/run_benchmark.py notebooks: - needs: benchmarking + needs: tests runs-on: self-hosted steps: - name: Checkout repository diff --git a/ci/download_all.py b/ci/download_all.py index 7eda0a34..d77acc58 100644 --- a/ci/download_all.py +++ b/ci/download_all.py @@ -66,6 +66,7 @@ def main(): downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt") downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt") + downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true") return True if __name__ == "__main__": diff --git a/examples/config.yaml b/examples/config.yaml index 9126bdaf..9bd8c3ea 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -16,7 +16,7 @@ data: batch_key: "batch" label_key: "LVL1" path: "helical-ai/yolksac_human" - gene_names: "gene_name" + gene_names: "index" name: "helical-ai/yolksac_human" diff --git a/examples/run_benchmark.py b/examples/run_benchmark.py index 1a45eef7..3bc166e9 100644 --- a/examples/run_benchmark.py +++ b/examples/run_benchmark.py @@ -147,8 +147,12 @@ def benchmark(cfg: DictConfig) -> None: head_cfg = cfg["svm"] integration_cfg = cfg["integration"] - hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - data = get_anndata_from_hf_dataset(hf_dataset)[:100] + # either load via huggingface + # hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # data = get_anndata_from_hf_dataset(hf_dataset)[:100] + + # or load directly + data = ad.read_h5ad("./yolksac_human.h5ad")[:100] data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category") # set gene names. for example if the index is the ensemble gene id diff --git a/examples/run_models/run_geneformer.py b/examples/run_models/run_geneformer.py index 7d69b5b4..93f4abe9 100644 --- a/examples/run_models/run_geneformer.py +++ b/examples/run_models/run_geneformer.py @@ -1,8 +1,9 @@ from helical import Geneformer,GeneformerConfig -from helical.utils import get_anndata_from_hf_dataset import hydra from omegaconf import DictConfig +import anndata as ad from datasets import load_dataset +from helical.utils import get_anndata_from_hf_dataset @hydra.main(version_base=None, config_path="configs", config_name="geneformer_config") @@ -10,15 +11,17 @@ def run(cfg: DictConfig): geneformer_config = GeneformerConfig(**cfg) geneformer = Geneformer(configurer = geneformer_config) - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # either load via huggingface + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") dataset = geneformer.process_data(ann_data[:10]) embeddings = geneformer.get_embeddings(dataset) print(embeddings.shape) - - if __name__ == "__main__": run() \ No newline at end of file diff --git a/examples/run_models/run_scgpt.py b/examples/run_models/run_scgpt.py index fa5d7e12..7ce27793 100644 --- a/examples/run_models/run_scgpt.py +++ b/examples/run_models/run_scgpt.py @@ -1,6 +1,7 @@ from helical.models.scgpt.model import scGPT, scGPTConfig import hydra from omegaconf import DictConfig +import anndata as ad from datasets import load_dataset from helical.utils import get_anndata_from_hf_dataset @@ -9,8 +10,12 @@ def run(cfg: DictConfig): scgpt_config = scGPTConfig(**cfg) scgpt = scGPT(configurer = scgpt_config) - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # either load via huggingface + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") data = scgpt.process_data(ann_data[:10]) embeddings = scgpt.get_embeddings(data) diff --git a/examples/run_models/run_uce.py b/examples/run_models/run_uce.py index 985b4886..0b57bf17 100644 --- a/examples/run_models/run_uce.py +++ b/examples/run_models/run_uce.py @@ -2,6 +2,7 @@ import hydra from omegaconf import DictConfig import numpy as np +import anndata as ad from datasets import load_dataset from helical.utils import get_anndata_from_hf_dataset @@ -13,9 +14,13 @@ def run(cfg: DictConfig): configurer=UCEConfig(**cfg) uce = UCE(configurer=configurer) + # either load via huggingface - hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") - ann_data = get_anndata_from_hf_dataset(hf_dataset) + # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists") + # ann_data = get_anndata_from_hf_dataset(hf_dataset) + + # or load directly + ann_data = ad.read_h5ad("./yolksac_human.h5ad") batch_size = 10 @@ -38,5 +43,6 @@ def run(cfg: DictConfig): # Concatenate the embeddings from each batch all_embeddings = np.concatenate(all_embeddings, axis=0) print(all_embeddings.shape) + if __name__ == "__main__": run() \ No newline at end of file