From e8af37535a9ca1f90ccb4064ee2aef3bf60e5f25 Mon Sep 17 00:00:00 2001
From: Daniel Bolin <axbolin@iu.edu>
Date: Thu, 29 Feb 2024 16:19:02 -0500
Subject: [PATCH] Add --query-layers-key option for all algorithms

---
 containers/azimuth/context/main.py    | 60 ++++++++++++++++++++++++---
 containers/azimuth/options.yml        |  5 +++
 containers/celltypist/context/main.py | 17 ++++++++
 containers/celltypist/options.yml     |  7 +++-
 containers/popv/context/main.py       |  4 +-
 containers/popv/options.yml           |  2 +-
 6 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/containers/azimuth/context/main.py b/containers/azimuth/context/main.py
index 2fc5a65..d805099 100644
--- a/containers/azimuth/context/main.py
+++ b/containers/azimuth/context/main.py
@@ -17,6 +17,7 @@ class AzimuthOrganMetadata(t.TypedDict):
 
 class AzimuthOptions(t.TypedDict):
     reference_data_dir: Path
+    query_layers_key: t.Optional[str]
 
 
 class AzimuthAlgorithm(Algorithm[AzimuthOrganMetadata, AzimuthOptions]):
@@ -37,15 +38,18 @@ def do_run(
         # obs columns of dtype 'object'. As a workaround we create a
         # clean matrix without obs columns on which azimuth is run
         # after which the annotations are copied back to the original matrix
+        temp_index = self.create_temp_obs_index(data)
         clean_matrix_path = Path("clean_matrix.h5ad")
-        clean_matrix = self.create_clean_matrix(data)
+        clean_matrix = self.create_clean_matrix(data, temp_index)
+
+        self.set_data_layer(clean_matrix, options["query_layers_key"])
         clean_matrix.write_h5ad(clean_matrix_path)
 
         annotated_matrix_path = self.run_azimuth_scripts(
             clean_matrix_path, reference_data
         )
         annotated_matrix = anndata.read_h5ad(annotated_matrix_path)
-        self.copy_annotations(data, annotated_matrix)
+        self.copy_annotations(data, annotated_matrix, temp_index)
 
         return {
             "data": data,
@@ -53,30 +57,73 @@ def do_run(
             "prediction_column": "predicted." + metadata["prediction_column"],
         }
 
-    def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData:
+    def create_temp_obs_index(self, matrix: anndata.AnnData) -> pandas.Index:
+        """Creates a new index by adding a 'QUERY:' prefix to each obs index name.
+        Used as a workaround for: https://github.com/satijalab/azimuth/issues/178
+        and https://github.com/satijalab/azimuth/issues/138
+
+        Args:
+            matrix (anndata.AnnData): Original data
+
+        Returns:
+            pandas.Index: A new index
+        """
+        return matrix.obs.index.map(lambda name: f"QUERY:{name}")
+
+    def create_clean_matrix(
+        self,
+        matrix: anndata.AnnData,
+        temp_index: pandas.Index,
+    ) -> anndata.AnnData:
         """Creates a copy of the data with all observation columns removed.
 
         Args:
             matrix (anndata.AnnData): Original data
+            temp_index (pandas.Index): Temporary index generated by `create_temp_obs_index`
 
         Returns:
             anndata.AnnData: Cleaned data
         """
-        clean_obs = pandas.DataFrame(index=matrix.obs.index)
+        clean_obs = pandas.DataFrame(index=temp_index)
         clean_matrix = matrix.copy()
         clean_matrix.obs = clean_obs
+
         return clean_matrix
 
+    def set_data_layer(
+        self, matrix: anndata.AnnData, query_layers_key: t.Optional[str]
+    ) -> None:
+        """Set the data layer to use for annotating.
+
+        Args:
+            matrix (anndata.AnnData): Matrix to update
+            query_layers_key (t.Optional[str]): A layer name, 'raw', or None to keep X as is
+        """
+        if query_layers_key == "raw":
+            matrix.X = matrix.raw.X
+        elif query_layers_key is not None:
+            matrix.X = matrix.layers[query_layers_key].copy()
+
     def copy_annotations(
-        self, matrix: anndata.AnnData, annotated_matrix: anndata.AnnData
+        self,
+        matrix: anndata.AnnData,
+        annotated_matrix: anndata.AnnData,
+        temp_index: pandas.Index,
     ) -> None:
         """Copies annotations from one matrix to another.
 
         Args:
             matrix (anndata.AnnData): Matrix to copy to
             annotated_matrix (anndata.AnnData): Matrix to copy from
+            temp_index (pandas.Index): Temporary index generated by `create_temp_obs_index`
         """
-        matrix.obs = matrix.obs.join(annotated_matrix.obs, rsuffix="_azimuth")
+        matrix.obs = matrix.obs.merge(
+            annotated_matrix.obs,
+            how="left",
+            left_on=temp_index,
+            right_index=True,
+            suffixes=(None, "_azimuth"),
+        )
 
     def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
         """Creates a subprocess running the Azimuth annotation R script.
@@ -156,6 +203,7 @@ def _get_arg_parser():
         required=True,
         help="Path to directory with reference data",
     )
+    parser.add_argument("--query-layers-key", help="Data layer to use")
 
     return parser
 
diff --git a/containers/azimuth/options.yml b/containers/azimuth/options.yml
index d78b252..314145a 100644
--- a/containers/azimuth/options.yml
+++ b/containers/azimuth/options.yml
@@ -7,3 +7,8 @@ fields:
     label: Directory with reference data directories
     inputBinding:
       prefix: --reference-data-dir
+  queryLayersKey:
+    type: string?
+    label: Data layer to use
+    inputBinding:
+      prefix: --query-layers-key
diff --git a/containers/celltypist/context/main.py b/containers/celltypist/context/main.py
index 0498deb..3a23387 100644
--- a/containers/celltypist/context/main.py
+++ b/containers/celltypist/context/main.py
@@ -16,6 +16,7 @@ class CelltypistOrganMetadata(t.TypedDict):
 
 class CelltypistOptions(t.TypedDict):
     ensemble_lookup: Path
+    query_layers_key: t.Optional[str]
 
 
 class CelltypistAlgorithm(Algorithm[CelltypistOrganMetadata, CelltypistOptions]):
@@ -31,6 +32,7 @@ def do_run(
     ) -> RunResult:
         """Annotate data using celltypist."""
         data = scanpy.read_h5ad(matrix)
+        self.set_data_layer(data, options["query_layers_key"])
         data = self.normalize(data)
         data, var_names = self.normalize_var_names(data, options)
         data = celltypist.annotate(
@@ -40,6 +42,20 @@ def do_run(
 
         return {"data": data, "organ_level": metadata["model"].replace(".", "_")}
 
+    def set_data_layer(
+        self, matrix: scanpy.AnnData, query_layers_key: t.Optional[str]
+    ) -> None:
+        """Set the data layer to use for annotating.
+
+        Args:
+            matrix (scanpy.AnnData): Matrix to update
+            query_layers_key (t.Optional[str]): A layer name, 'raw', or None to keep X as is
+        """
+        if query_layers_key == "raw":
+            matrix.X = matrix.raw.X
+        elif query_layers_key is not None:
+            matrix.X = matrix.layers[query_layers_key].copy()
+
     def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
         """Normalizes data according to celltypist requirements.
 
@@ -115,6 +131,7 @@ def _get_arg_parser():
         default="/ensemble-lookup.csv",
         help="Ensemble id to gene name csv",
     )
+    parser.add_argument("--query-layers-key", help="Data layer to use")
 
     return parser
 
diff --git a/containers/celltypist/options.yml b/containers/celltypist/options.yml
index 6894767..dfddd40 100644
--- a/containers/celltypist/options.yml
+++ b/containers/celltypist/options.yml
@@ -1,4 +1,9 @@
 type: record
 name: options
 label: Celltypist specific options
-fields: {}
+fields:
+  queryLayersKey:
+    type: string?
+    label: Data layer to use
+    inputBinding:
+      prefix: --query-layers-key
diff --git a/containers/popv/context/main.py b/containers/popv/context/main.py
index 4acef4b..12edabb 100644
--- a/containers/popv/context/main.py
+++ b/containers/popv/context/main.py
@@ -299,9 +299,7 @@ def _get_arg_parser():
         required=True,
         help="Path to models directory",
     )
-    parser.add_argument(
-        "--query-layers-key", required=True, help="Name of layer with raw counts"
-    )
+    parser.add_argument("--query-layers-key", help="Name of layer with raw counts")
     parser.add_argument("--prediction-mode", default="fast", help="Prediction mode")
     parser.add_argument(
         "--cell-ontology-dir",
diff --git a/containers/popv/options.yml b/containers/popv/options.yml
index 6c3ab73..fef5e06 100644
--- a/containers/popv/options.yml
+++ b/containers/popv/options.yml
@@ -13,7 +13,7 @@ fields:
     inputBinding:
       prefix: --models-dir
   queryLayersKey:
-    type: string
+    type: string?
     inputBinding:
       prefix: --query-layers-key
   predictionMode: