downloading files again for UCE; for Geneformer, all files are now loaded in the init function.
maxiallard committed Oct 3, 2024
1 parent 72fec17 commit 90f2eaa
Showing 5 changed files with 52 additions and 31 deletions.
16 changes: 12 additions & 4 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.11
FROM python:3.10

RUN apt-get update -y \
&& apt-get upgrade -y \
@@ -10,9 +10,17 @@ RUN apt-get update -y \
gfortran \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*


RUN pip install --upgrade --force-reinstall git+https://github.com/helicalAI/helical.git

# RUN mkdir /logs && chown 1000 /logs
# RUN mkdir /metaflow && chown 1000 /metaflow
# ENV HOME=/metaflow
# WORKDIR /metaflow
# USER 1000

# RUN pip install --upgrade helical
RUN pip install git+https://github.com/helicalAI/helical.git@main
RUN pip install metaflow simple-azure-blob-downloader azure-storage-blob azure-identity azure-keyvault-secrets

# Define the entry point for the container
ENTRYPOINT ["/bin/bash"]
# ENTRYPOINT ["/bin/bash"]
13 changes: 12 additions & 1 deletion helical/models/geneformer/geneformer_config.py
@@ -20,6 +20,13 @@ class GeneformerConfig():
The device to use. Either use "cuda" or "cpu".
accelerator : bool, optional, default = False
The accelerator configuration. By default same device as model.
nproc: int, optional, default = -1
Number of processes to use for data processing.
custom_attr_name_dict : dict, optional, default = None
A dictionary that contains the names of the custom attributes to be added to the dataset.
The keys of the dictionary are the names of the custom attributes, and the values are the names of the columns in adata.obs.
For example, if you want to add a custom attribute called "cell_type" to the dataset, you would pass custom_attr_name_dict = {"cell_type": "cell_type"}.
If you do not want to add any custom attributes, you can leave this parameter as None.
Returns
-------
@@ -35,6 +42,8 @@ def __init__(
emb_mode: Literal["cls", "cell", "gene"] = "cell",
device: Literal["cpu", "cuda"] = "cpu",
accelerator: Optional[bool] = False,
nproc: int = -1,
custom_attr_name_dict: Optional[dict] = None
):

# model specific parameters
@@ -107,7 +116,9 @@ def __init__(
"accelerator": accelerator,
"input_size": self.model_map[model_name]["input_size"],
"special_token": self.model_map[model_name]["special_token"],
"embsize": self.model_map[model_name]["embsize"]
"embsize": self.model_map[model_name]["embsize"],
"nproc": nproc,
"custom_attr_name_dict": custom_attr_name_dict
}


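A minimal sketch of how the two new configuration options might be used, assuming GeneformerConfig is importable from the module path above and that "cell_type" is a column in adata.obs (both assumptions are for illustration only):

from helical.models.geneformer.geneformer_config import GeneformerConfig

# Tokenize with 4 worker processes and copy the hypothetical
# adata.obs["cell_type"] column into the tokenized dataset.
config = GeneformerConfig(
    device="cpu",
    nproc=4,
    custom_attr_name_dict={"cell_type": "cell_type"},
)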
40 changes: 18 additions & 22 deletions helical/models/geneformer/model.py
@@ -101,14 +101,28 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None:
self.model = self.accelerator.prepare(self.model)
else:
self.accelerator = None

# load token dictionary (Ensembl IDs:token)
with open(self.files_config["token_path"], "rb") as f:
self.gene_token_dict = pickle.load(f)
self.pad_token_id = self.gene_token_dict.get("<pad>")

self.tk = TranscriptomeTokenizer(custom_attr_name_dict=self.config["custom_attr_name_dict"],
nproc=self.config['nproc'],
model_input_size=self.config["input_size"],
special_token=self.config["special_token"],
gene_median_file = self.files_config["gene_median_path"],
token_dictionary_file = self.files_config["token_path"],
gene_mapping_file = self.files_config["ensembl_dict_path"],
)

LOGGER.info(f"Model finished initializing.")

def process_data(self,
adata: AnnData,
gene_names: str = "index",
nproc: int = 1,
output_path: Optional[str] = None,
custom_attr_name_dict: Optional[dict] = None) -> Dataset:
) -> Dataset:
"""Processes the data for the Geneformer model
Parameters
@@ -125,15 +139,9 @@ def process_data(self,
If it is changed to "ensembl_id", there will be no mapping.
In the special case where the data has Ensembl IDs as the index and you pass "index", this would result in invalid mappings.
In that case, it is recommended to create a new column with the Ensembl IDs in the data and pass "ensembl_id" as gene_names.
nproc : int, optional, default = 1
Number of processes to use for dataset processing.
output_path : str, default = None
Whether to save the tokenized dataset to the specified output_path.
custom_attr_name_dict : dict, optional, default = None
A dictionary that contains the names of the custom attributes to be added to the dataset.
The keys of the dictionary are the names of the custom attributes, and the values are the names of the columns in adata.obs.
For example, if you want to add a custom attribute called "cell_type" to the dataset, you would pass custom_attr_name_dict = {"cell_type": "cell_type"}.
If you do not want to add any custom attributes, you can leave this parameter as None.
Returns
-------
@@ -152,19 +160,7 @@ def process_data(self,
raise ValueError(message)
adata = map_gene_symbols_to_ensembl_ids(adata, gene_names)

# load token dictionary (Ensembl IDs:token)
with open(self.files_config["token_path"], "rb") as f:
self.gene_token_dict = pickle.load(f)
self.pad_token_id = self.gene_token_dict.get("<pad>")

self.tk = TranscriptomeTokenizer(custom_attr_name_dict=custom_attr_name_dict,
nproc=nproc,
model_input_size=self.config["input_size"],
special_token=self.config["special_token"],
gene_median_file = self.files_config["gene_median_path"],
token_dictionary_file = self.files_config["token_path"],
gene_mapping_file = self.files_config["ensembl_dict_path"],
)

tokenized_cells, cell_metadata = self.tk.tokenize_anndata(adata)

# tokenized_cells, cell_metadata = self.tk.tokenize_anndata(adata)
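Since the token dictionary and TranscriptomeTokenizer are now built in __init__, process_data only needs the AnnData object and the gene_names column; nproc and custom_attr_name_dict come from the config. A usage sketch under that assumption (the .h5ad path and the get_embeddings call are illustrative, not part of this diff):

import anndata as ad
from helical.models.geneformer.geneformer_config import GeneformerConfig
from helical.models.geneformer.model import Geneformer

config = GeneformerConfig(nproc=4, custom_attr_name_dict={"cell_type": "cell_type"})
geneformer = Geneformer(configurer=config)   # tokenizer is created here now

adata = ad.read_h5ad("cells.h5ad")           # hypothetical input file
dataset = geneformer.process_data(adata, gene_names="index")
embeddings = geneformer.get_embeddings(dataset)  # assumed downstream call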
9 changes: 8 additions & 1 deletion helical/models/uce/model.py
@@ -96,7 +96,9 @@ def process_data(self,
UCEDataset
Inherits from Dataset class.
"""




self.ensure_rna_data_validity(adata, gene_names)

if gene_names != "index":
@@ -107,6 +109,11 @@
"protein_embeddings_dir": self.model_dir / "protein_embeddings/",
"offset_pkl_path": self.model_dir / "species_offsets.pkl"
}

## TODO: Remove double downloads. This is required since metaflow might not have stored the files in the right location and the files might have disappeared. The downloader should check if the file already exists.
downloader = Downloader()
for _, file in files_config.items():
downloader.download_via_name(file)

if filter_genes_min_cell is not None:
sc.pp.filter_genes(adata, min_cells=filter_genes_min_cell)
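A small sketch of the check the TODO above asks for: only fetch a file when it is not already on disk. download_if_missing is a hypothetical helper; the only call taken from this diff is downloader.download_via_name(...):

from pathlib import Path

def download_if_missing(downloader, files_config: dict) -> None:
    # Skip files that are already present locally to avoid double downloads.
    for path in files_config.values():
        if not Path(path).exists():
            downloader.download_via_name(path)

# In process_data this would replace the loop above, e.g.:
# download_if_missing(Downloader(), files_config)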
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -4,10 +4,9 @@ build-backend = "hatchling.build"

[project]
name = "helical"
version = "0.0.1a2"
version = "0.0.1a4"
authors = [
{ name="Maxime Allard", email="[email protected]" },
{ name = "Benoit Putzeys", email="[email protected]"},
{ name="Helical Team", email="[email protected]" },
]
description = "Helical Python SDK"
readme = "README.md"
