Improve RSEM handling

complextissue · Sep 18, 2024 · cbdcf2d · cbdcf2d
1 parent 4df75a1
commit cbdcf2d
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 3 deletions.
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -306,6 +306,11 @@ def tximport(
                 }
 
                 if inferential_replicates:
+                    if transcript_data_sample["inferential_replicates"] is None:
+                        raise ValueError(
+                            f"The quantification file does not contain inferential replicates: {file_path}"
+                        )
+
                     bootstrap_count = transcript_data_sample["inferential_replicates"]["replicates"].shape[1]
                     data_vars["inferential_replicates"] = xr.DataArray(
                         data=np.zeros((len(transcript_data_sample["transcript_id"]), bootstrap_count, len(file_paths))),
@@ -386,6 +391,29 @@ def tximport(
             transcript_data, biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
         )
 
+    # Remove appended gene names after underscore for RSEM data for both transcript and gene ids
+    if (
+        data_type == "rsem"
+        and (gene_level and transcript_data.coords["gene_id"].values[0].count("_") > 0)
+        or (not gene_level and transcript_data.coords["transcript_id"].values[0].count("_") > 0)
+        and ignore_after_bar
+    ):
+        warning(
+            (
+                "RSEM seems to have been run with `--append-names`. "
+                "Removing the appended names. "
+                "Set `ignore_after_bar` to False to keep the appended names."
+            )
+        )
+        if not gene_level:
+            transcript_data.coords["transcript_id"] = [
+                transcript_id.split("_")[0] for transcript_id in transcript_data.coords["transcript_id"].values
+            ]
+        else:
+            transcript_data.coords["gene_id"] = [
+                gene_id.split("_")[0] for gene_id in transcript_data.coords["gene_id"].values
+            ]
+
     result: Union[xr.Dataset, ad.AnnData]
     if return_transcript_data:
         # Return the transcript-level expression if requested

diff --git a/pytximport/importers/_read_rsem.py b/pytximport/importers/_read_rsem.py
@@ -31,7 +31,6 @@ def read_rsem(
         file_path = Path(file_path)
 
     if file_path.is_dir():
-        # Add quant.sf to the file path
         if gene_level:
             file_paths = list(file_path.glob("*.genes.results.gz"))
             file_identifier = "genes.results.gz"
@@ -48,8 +47,8 @@ def read_rsem(
         file_path = file_paths[0]
 
     # Check that we are importing a .sf file
-    if not file_path.suffix == ".gz":
-        raise ImportError("Only .gz files are supported.")
+    if not file_path.suffix == ".gz" and not file_path.suffix == ".results":
+        raise ImportError("Only .gz and .results files are supported.")
 
     if gene_level and "transcript" in id_column:
         warning("Gene-level quantification file with transcript-level column name. Please check the column name.")