Skip to content

Commit

Permalink
Improve RSEM handling
Browse files Browse the repository at this point in the history
  • Loading branch information
maltekuehl committed Sep 18, 2024
1 parent 4df75a1 commit cbdcf2d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 3 deletions.
28 changes: 28 additions & 0 deletions pytximport/core/_tximport.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,11 @@ def tximport(
}

if inferential_replicates:
if transcript_data_sample["inferential_replicates"] is None:
raise ValueError(
f"The quantification file does not contain inferential replicates: {file_path}"
)

bootstrap_count = transcript_data_sample["inferential_replicates"]["replicates"].shape[1]
data_vars["inferential_replicates"] = xr.DataArray(
data=np.zeros((len(transcript_data_sample["transcript_id"]), bootstrap_count, len(file_paths))),
Expand Down Expand Up @@ -386,6 +391,29 @@ def tximport(
transcript_data, biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
)

# Remove appended gene names after underscore for RSEM data for both transcript and gene ids
if (
data_type == "rsem"
and (gene_level and transcript_data.coords["gene_id"].values[0].count("_") > 0)
or (not gene_level and transcript_data.coords["transcript_id"].values[0].count("_") > 0)
and ignore_after_bar
):
warning(
(
"RSEM seems to have been run with `--append-names`. "
"Removing the appended names. "
"Set `ignore_after_bar` to False to keep the appended names."
)
)
if not gene_level:
transcript_data.coords["transcript_id"] = [
transcript_id.split("_")[0] for transcript_id in transcript_data.coords["transcript_id"].values
]
else:
transcript_data.coords["gene_id"] = [
gene_id.split("_")[0] for gene_id in transcript_data.coords["gene_id"].values
]

result: Union[xr.Dataset, ad.AnnData]
if return_transcript_data:
# Return the transcript-level expression if requested
Expand Down
5 changes: 2 additions & 3 deletions pytximport/importers/_read_rsem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def read_rsem(
file_path = Path(file_path)

if file_path.is_dir():
# Add quant.sf to the file path
if gene_level:
file_paths = list(file_path.glob("*.genes.results.gz"))
file_identifier = "genes.results.gz"
Expand All @@ -48,8 +47,8 @@ def read_rsem(
file_path = file_paths[0]

# Check that we are importing a .sf file
if not file_path.suffix == ".gz":
raise ImportError("Only .gz files are supported.")
if not file_path.suffix == ".gz" and not file_path.suffix == ".results":
raise ImportError("Only .gz and .results files are supported.")

if gene_level and "transcript" in id_column:
warning("Gene-level quantification file with transcript-level column name. Please check the column name.")
Expand Down

0 comments on commit cbdcf2d

Please sign in to comment.