Skip to content

Commit

Permalink
Speed up pipeline by loading anndata directly
Browse files Browse the repository at this point in the history
instead of loading it via Hugging Face
  • Loading branch information
bputzeys committed Oct 3, 2024
1 parent 9980fb1 commit b142966
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 49 deletions.
39 changes: 2 additions & 37 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
- release

jobs:
setup:
tests:
runs-on: self-hosted
steps:
- name: Checkout repository
Expand All @@ -28,18 +28,6 @@ jobs:
run: |
python ci/download_all.py
tests:
runs-on: self-hosted
needs: setup
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8

- name: Execute unittests
run: |
pytest --cov-report=html:html_cov --cov-branch --cov-report term --cov=helical ci/
Expand All @@ -57,17 +45,6 @@ jobs:
# pytest-coverage-path: ./pytest-coverage.txt
# junitxml-path: ./pytest.xml

inference:
runs-on: self-hosted
needs: tests
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8

- name: Execute Geneformer v1
run: |
Expand All @@ -88,26 +65,14 @@ jobs:
- name: Execute Hyena
run: |
python examples/run_models/run_hyena_dna.py
benchmarking:
needs: inference
runs-on: self-hosted
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8
- name: Execute benchmarking
run: |
pip install scanorama
python examples/run_benchmark.py
notebooks:
needs: benchmarking
needs: tests
runs-on: self-hosted
steps:
- name: Checkout repository
Expand Down
1 change: 1 addition & 0 deletions ci/download_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def main():
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")

downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
return True

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ data:
batch_key: "batch"
label_key: "LVL1"
path: "helical-ai/yolksac_human"
gene_names: "gene_name"
gene_names: "index"
name: "helical-ai/yolksac_human"


Expand Down
8 changes: 6 additions & 2 deletions examples/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,12 @@ def benchmark(cfg: DictConfig) -> None:
head_cfg = cfg["svm"]
integration_cfg = cfg["integration"]

hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
data = get_anndata_from_hf_dataset(hf_dataset)[:100]
# either load via huggingface
# hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# data = get_anndata_from_hf_dataset(hf_dataset)[:100]

# or load directly
data = ad.read_h5ad("./yolksac_human.h5ad")[:100]
data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")

# set gene names. For example, if the index is the Ensembl gene ID
Expand Down
13 changes: 8 additions & 5 deletions examples/run_models/run_geneformer.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@
from helical import Geneformer,GeneformerConfig
from helical.utils import get_anndata_from_hf_dataset
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset


@hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
def run(cfg: DictConfig):
    """Embed a small slice of the yolk-sac dataset with Geneformer and print the embedding shape."""
    model = Geneformer(configurer=GeneformerConfig(**cfg))

    # either load via huggingface
    # hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
    # ann_data = get_anndata_from_hf_dataset(hf_dataset)

    # or load directly
    ann_data = ad.read_h5ad("./yolksac_human.h5ad")

    # only the first 10 cells are embedded to keep the example fast
    processed = model.process_data(ann_data[:10])
    embeddings = model.get_embeddings(processed)
    print(embeddings.shape)


if __name__ == "__main__":
    run()
9 changes: 7 additions & 2 deletions examples/run_models/run_scgpt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from helical.models.scgpt.model import scGPT, scGPTConfig
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -9,8 +10,12 @@ def run(cfg: DictConfig):
scgpt_config = scGPTConfig(**cfg)
scgpt = scGPT(configurer = scgpt_config)

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

data = scgpt.process_data(ann_data[:10])
embeddings = scgpt.get_embeddings(data)
Expand Down
10 changes: 8 additions & 2 deletions examples/run_models/run_uce.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import hydra
from omegaconf import DictConfig
import numpy as np
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -13,9 +14,13 @@
def run(cfg: DictConfig):
configurer=UCEConfig(**cfg)
uce = UCE(configurer=configurer)
# either load via huggingface

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

batch_size = 10

Expand All @@ -38,5 +43,6 @@ def run(cfg: DictConfig):
# Concatenate the embeddings from each batch
all_embeddings = np.concatenate(all_embeddings, axis=0)
print(all_embeddings.shape)

if __name__ == "__main__":
run()

0 comments on commit b142966

Please sign in to comment.