Skip to content

Commit

Permalink
Merge pull request #108 from helicalAI/main
Browse files Browse the repository at this point in the history
Change workflow to execute tests and notebooks sequentially
  • Loading branch information
bputzeys authored Oct 3, 2024
2 parents 77b32c0 + 72fec17 commit d254fe4
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 57 deletions.
36 changes: 32 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@ on:
- release

jobs:
build:

tests:
runs-on: self-hosted
timeout-minutes: 60
steps:
- name: Checkout repository
uses: actions/checkout@v2
Expand Down Expand Up @@ -47,6 +45,7 @@ jobs:
# pytest-coverage-path: ./pytest-coverage.txt
# junitxml-path: ./pytest.xml


- name: Execute Geneformer v1
run: |
python examples/run_models/run_geneformer.py ++model_name="gf-12L-30M-i2048"
Expand All @@ -66,8 +65,37 @@ jobs:
- name: Execute Hyena
run: |
python examples/run_models/run_hyena_dna.py
- name: Execute benchmarking
run: |
pip install scanorama
python examples/run_benchmark.py
notebooks:
needs: tests
runs-on: self-hosted
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: setup python
uses: actions/setup-python@v5
with:
python-version: 3.11.8

- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Reduce datasets to speedup checks
run: |
sed -i 's/train\[:65%\]/train\[:5%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
sed -i 's/train\[70%:\]/train\[5%:7%\]/g' ./examples/notebooks/Cell-Type-Annotation.ipynb
sed -i 's/get_anndata_from_hf_dataset(ds\[\\"train\\"\])/get_anndata_from_hf_dataset(ds\[\\"train\\"\])[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
sed -i 's/get_anndata_from_hf_dataset(ds\[\\"test\\"\])/get_anndata_from_hf_dataset(ds\[\\"test\\"\])[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:2]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb
- name: Run Notebooks
run: |
pytest --durations=0 --nbmake ./examples/notebooks/*.ipynb
37 changes: 0 additions & 37 deletions .github/workflows/notebooks.yml

This file was deleted.

1 change: 1 addition & 0 deletions ci/download_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def main():
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen.ckpt")
downloader.download_via_name("hyena_dna/hyenadna-tiny-1k-seqlen-d256.ckpt")

downloader.download_via_link(Path("yolksac_human.h5ad"), "https://huggingface.co/datasets/helical-ai/yolksac_human/resolve/main/data/17_04_24_YolkSacRaw_F158_WE_annots.h5ad?download=true")
return True

if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion ci/tests/test_geneformer/test_geneformer_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def test_ensure_data_validity_raising_error_with_missing_ensembl_id_column(self,

@pytest.mark.parametrize("gene_symbols, raises_error",
[
(['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True),
(['ENSGSAMD11', 'ENSGPLEKHN1', 'ENSGHES4'], True), # humans
(['ENSMUSG00000021033', 'ENSMUSG00000021033', 'ENSMUSG00000021033'], True), # mice
(['SAMD11', 'None', 'HES4'], True),
(['SAMD11', 'PLEKHN1', 'HES4'], False),
]
Expand Down
2 changes: 1 addition & 1 deletion examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ data:
batch_key: "batch"
label_key: "LVL1"
path: "helical-ai/yolksac_human"
gene_names: "gene_name"
gene_names: "index"
name: "helical-ai/yolksac_human"


Expand Down
12 changes: 8 additions & 4 deletions examples/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,19 @@ def benchmark(cfg: DictConfig) -> None:
head_cfg = cfg["svm"]
integration_cfg = cfg["integration"]

hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
data = get_anndata_from_hf_dataset(hf_dataset)[:100]
# either load via huggingface
# hf_dataset = load_dataset(data_cfg["path"], split="train[:10%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# data = get_anndata_from_hf_dataset(hf_dataset)[:10]

# or load directly
data = ad.read_h5ad("./yolksac_human.h5ad")[:10]
data.obs[data_cfg["label_key"]] = data.obs[data_cfg["label_key"]].astype("category")

# Set the gene names, for example when the index holds the Ensembl gene IDs:
# data.var_names = data.var["feature_name"]

run_classification_example(data, ["geneformer", "scgpt"], data_cfg, head_cfg, device=cfg["device"])
# run_integration_example(data, ["geneformer", "scgpt", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
run_classification_example(data, ["scgpt", "geneformer"], data_cfg, head_cfg, device=cfg["device"])
# run_integration_example(data, ["scgpt", "geneformer", "scanorama"], data_cfg, integration_cfg, device=cfg["device"])
LOGGER.info("Benchmarking done.")

if __name__ == "__main__":
Expand Down
13 changes: 8 additions & 5 deletions examples/run_models/run_geneformer.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@
from helical import Geneformer,GeneformerConfig
from helical.utils import get_anndata_from_hf_dataset
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset


@hydra.main(version_base=None, config_path="configs", config_name="geneformer_config")
def run(cfg: DictConfig):
geneformer_config = GeneformerConfig(**cfg)
geneformer = Geneformer(configurer = geneformer_config)

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

dataset = geneformer.process_data(ann_data[:10])
embeddings = geneformer.get_embeddings(dataset)

print(embeddings.shape)



if __name__ == "__main__":
run()
9 changes: 7 additions & 2 deletions examples/run_models/run_scgpt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from helical.models.scgpt.model import scGPT, scGPTConfig
import hydra
from omegaconf import DictConfig
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -9,8 +10,12 @@ def run(cfg: DictConfig):
scgpt_config = scGPTConfig(**cfg)
scgpt = scGPT(configurer = scgpt_config)

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

data = scgpt.process_data(ann_data[:10])
embeddings = scgpt.get_embeddings(data)
Expand Down
10 changes: 8 additions & 2 deletions examples/run_models/run_uce.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import hydra
from omegaconf import DictConfig
import numpy as np
import anndata as ad
from datasets import load_dataset
from helical.utils import get_anndata_from_hf_dataset

Expand All @@ -13,9 +14,13 @@
def run(cfg: DictConfig):
configurer=UCEConfig(**cfg)
uce = UCE(configurer=configurer)
# either load via huggingface

hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
ann_data = get_anndata_from_hf_dataset(hf_dataset)
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

batch_size = 10

Expand All @@ -38,5 +43,6 @@ def run(cfg: DictConfig):
# Concatenate the embeddings from each batch
all_embeddings = np.concatenate(all_embeddings, axis=0)
print(all_embeddings.shape)

if __name__ == "__main__":
run()
2 changes: 1 addition & 1 deletion helical/models/geneformer/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def process_data(self,

# map gene symbols to Ensembl IDs if provided
if gene_names != "ensembl_id":
if (adata.var[gene_names].str.startswith("ENSG").all()) or (adata.var[gene_names].str.startswith("None").any()):
if (adata.var[gene_names].str.startswith("ENS").all()) or (adata.var[gene_names].str.startswith("None").any()):
message = "It seems an anndata with 'ensemble ids' and/or 'None' was passed. " \
"Please set gene_names='ensembl_id' and remove 'None's to skip mapping."
LOGGER.info(message)
Expand Down

0 comments on commit d254fe4

Please sign in to comment.