downloading files again for UCE; for Geneformer, all files are now loaded in the init function.
maxiallard committed Oct 3, 2024
1 parent 72fec17 commit 90f2eaa
Showing 5 changed files with 52 additions and 31 deletions.
16 changes: 12 additions & 4 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.11
FROM python:3.10

RUN apt-get update -y \
&& apt-get upgrade -y \
@@ -10,9 +10,17 @@ RUN apt-get update -y \
gfortran \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*


RUN pip install --upgrade --force-reinstall git+https://github.com/helicalAI/helical.git

# RUN mkdir /logs && chown 1000 /logs
# RUN mkdir /metaflow && chown 1000 /metaflow
# ENV HOME=/metaflow
# WORKDIR /metaflow
# USER 1000

# RUN pip install --upgrade helical
RUN pip install git+https://github.com/helicalAI/helical.git@main
RUN pip install metaflow simple-azure-blob-downloader azure-storage-blob azure-identity azure-keyvault-secrets

# Define the entry point for the container
ENTRYPOINT ["/bin/bash"]
# ENTRYPOINT ["/bin/bash"]
13 changes: 12 additions & 1 deletion helical/models/geneformer/geneformer_config.py
@@ -20,6 +20,13 @@ class GeneformerConfig():
The device to use. Either use "cuda" or "cpu".
accelerator : bool, optional, default = False
The accelerator configuration. By default same device as model.
nproc: int, optional, default = -1
Number of processes to use for data processing.
custom_attr_name_dict : dict, optional, default = None
A dictionary that contains the names of the custom attributes to be added to the dataset.
The keys of the dictionary are the names of the custom attributes, and the values are the names of the columns in adata.obs.
For example, if you want to add a custom attribute called "cell_type" to the dataset, you would pass custom_attr_name_dict = {"cell_type": "cell_type"}.
If you do not want to add any custom attributes, you can leave this parameter as None.
Returns
-------
@@ -35,6 +42,8 @@ def __init__(
emb_mode: Literal["cls", "cell", "gene"] = "cell",
device: Literal["cpu", "cuda"] = "cpu",
accelerator: Optional[bool] = False,
nproc: int = -1,
custom_attr_name_dict: Optional[dict] = None
):

# model specific parameters
@@ -107,7 +116,9 @@ def __init__(
"accelerator": accelerator,
"input_size": self.model_map[model_name]["input_size"],
"special_token": self.model_map[model_name]["special_token"],
"embsize": self.model_map[model_name]["embsize"]
"embsize": self.model_map[model_name]["embsize"],
"nproc": nproc,
"custom_attr_name_dict": custom_attr_name_dict
}


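A minimal sketch of how the two new configuration options might be used, assuming GeneformerConfig is importable from the module path above and that "cell_type" is a column in adata.obs (both assumptions are for illustration only):

from helical.models.geneformer.geneformer_config import GeneformerConfig

# Tokenize with 4 worker processes and copy the hypothetical
# adata.obs["cell_type"] column into the tokenized dataset.
config = GeneformerConfig(
    device="cpu",
    nproc=4,
    custom_attr_name_dict={"cell_type": "cell_type"},
)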
40 changes: 18 additions & 22 deletions helical/models/geneformer/model.py
@@ -101,14 +101,28 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None:
self.model = self.accelerator.prepare(self.model)
else:
self.accelerator = None

# load token dictionary (Ensembl IDs:token)
with open(self.files_config["token_path"], "rb") as f:
self.gene_token_dict = pickle.load(f)
self.pad_token_id = self.gene_token_dict.get("<pad>")

self.tk = TranscriptomeTokenizer(custom_attr_name_dict=self.config["custom_attr_name_dict"],
nproc=self.config['nproc'],
model_input_size=self.config["input_size"],
special_token=self.config["special_token"],
gene_median_file = self.files_config["gene_median_path"],
token_dictionary_file = self.files_config["token_path"],
gene_mapping_file = self.files_config["ensembl_dict_path"],
)

LOGGER.info(f"Model finished initializing.")

def process_data(self,
adata: AnnData,
gene_names: str = "index",
nproc: int = 1,
output_path: Optional[str] = None,
custom_attr_name_dict: Optional[dict] = None) -> Dataset:
) -> Dataset:
"""Processes the data for the Geneformer model
Parameters
@@ -125,15 +139,9 @@ def process_data(self,
If it is changed to "ensembl_id", there will be no mapping.
In the special case where the data has Ensembl IDs as the index and you pass "index", this would result in invalid mappings.
In that case, it is recommended to create a new column with the Ensembl IDs in the data and pass "ensembl_id" as gene_names.
nproc : int, optional, default = 1
Number of processes to use for dataset processing.
output_path : str, default = None
Whether to save the tokenized dataset to the specified output_path.
custom_attr_name_dict : dict, optional, default = None
A dictionary that contains the names of the custom attributes to be added to the dataset.
The keys of the dictionary are the names of the custom attributes, and the values are the names of the columns in adata.obs.
For example, if you want to add a custom attribute called "cell_type" to the dataset, you would pass custom_attr_name_dict = {"cell_type": "cell_type"}.
If you do not want to add any custom attributes, you can leave this parameter as None.
Returns
-------
@@ -152,19 +160,7 @@ def process_data(self,
raise ValueError(message)
adata = map_gene_symbols_to_ensembl_ids(adata, gene_names)

# load token dictionary (Ensembl IDs:token)
with open(self.files_config["token_path"], "rb") as f:
self.gene_token_dict = pickle.load(f)
self.pad_token_id = self.gene_token_dict.get("<pad>")

self.tk = TranscriptomeTokenizer(custom_attr_name_dict=custom_attr_name_dict,
nproc=nproc,
model_input_size=self.config["input_size"],
special_token=self.config["special_token"],
gene_median_file = self.files_config["gene_median_path"],
token_dictionary_file = self.files_config["token_path"],
gene_mapping_file = self.files_config["ensembl_dict_path"],
)

tokenized_cells, cell_metadata = self.tk.tokenize_anndata(adata)

# tokenized_cells, cell_metadata = self.tk.tokenize_anndata(adata)
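Since the token dictionary and TranscriptomeTokenizer are now built in __init__, process_data only needs the AnnData object and the gene_names column; nproc and custom_attr_name_dict come from the config. A usage sketch under that assumption (the .h5ad path and the get_embeddings call are illustrative, not part of this diff):

import anndata as ad
from helical.models.geneformer.geneformer_config import GeneformerConfig
from helical.models.geneformer.model import Geneformer

config = GeneformerConfig(nproc=4, custom_attr_name_dict={"cell_type": "cell_type"})
geneformer = Geneformer(configurer=config)   # tokenizer is created here now

adata = ad.read_h5ad("cells.h5ad")           # hypothetical input file
dataset = geneformer.process_data(adata, gene_names="index")
embeddings = geneformer.get_embeddings(dataset)  # assumed downstream call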
9 changes: 8 additions & 1 deletion helical/models/uce/model.py
@@ -96,7 +96,9 @@ def process_data(self,
UCEDataset
Inherits from Dataset class.
"""




self.ensure_rna_data_validity(adata, gene_names)

if gene_names != "index":
@@ -107,6 +109,11 @@
"protein_embeddings_dir": self.model_dir / "protein_embeddings/",
"offset_pkl_path": self.model_dir / "species_offsets.pkl"
}

## TODO: Remove double downloads. This is required since metaflow might not have stored the files in the right location and the files might have disappeared. The downloader should check if the file already exists.
downloader = Downloader()
for _, file in files_config.items():
downloader.download_via_name(file)

if filter_genes_min_cell is not None:
sc.pp.filter_genes(adata, min_cells=filter_genes_min_cell)
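A small sketch of the check the TODO above asks for: only fetch a file when it is not already on disk. download_if_missing is a hypothetical helper; the only call taken from this diff is downloader.download_via_name(...):

from pathlib import Path

def download_if_missing(downloader, files_config: dict) -> None:
    # Skip files that are already present locally to avoid double downloads.
    for path in files_config.values():
        if not Path(path).exists():
            downloader.download_via_name(path)

# In process_data this would replace the loop above, e.g.:
# download_if_missing(Downloader(), files_config)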
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -4,10 +4,9 @@ build-backend = "hatchling.build"

[project]
name = "helical"
version = "0.0.1a2"
version = "0.0.1a4"
authors = [
{ name="Maxime Allard", email="[email protected]" },
{ name = "Benoit Putzeys", email="[email protected]"},
{ name="Helical Team", email="[email protected]" },
]
description = "Helical Python SDK"
readme = "README.md"
