Skip to content

Commit

Permalink
Merge pull request #24 from complextissue/dev
Browse files Browse the repository at this point in the history
Improve test coverage, fix abundance recomputing bug after biotype fi…
  • Loading branch information
maltekuehl authored Nov 27, 2024
2 parents 6897056 + edecd98 commit 2137d76
Show file tree
Hide file tree
Showing 10 changed files with 109 additions and 40 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ markers = [

[tool.coverage.run]
source = ["pytximport"]
omit = ["*/test/*"]
omit = ["*/test/*", "**/*/_cli.py"]

[tool.coverage.report]
exclude_lines = [
Expand Down
6 changes: 3 additions & 3 deletions pytximport/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
help="Welcome to the pytximport command-line interface for importing transcript-level quantification files.",
)
@click.pass_context
def cli( # type: ignore
def cli( # type: ignore # pragma: no cover
ctx: click.Context,
):
"""Welcome to the pytximport command-line interface for importing transcript-level quantification files."""
Expand Down Expand Up @@ -160,7 +160,7 @@ def cli( # type: ignore
is_flag=True,
help="Whether the existence of the files is optional.",
)
def run( # type: ignore
def run( # type: ignore # pragma: no cover
**kwargs,
) -> None:
"""Call the tximport function via the command line."""
Expand Down Expand Up @@ -221,7 +221,7 @@ def run( # type: ignore
is_flag=True,
help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
)
def create_map( # type: ignore
def create_map( # type: ignore # pragma: no cover
**kwargs,
) -> None:
"""Create a transcript-to-gene mapping file via the command line."""
Expand Down
8 changes: 6 additions & 2 deletions pytximport/core/_tximport.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ def tximport(
biotype_filter (List[str], optional): Filter the transcripts by biotype, including only those provided. Enables
post-hoc filtering of the data based on the biotype of the transcripts. Assumes that the biotype is present
in the transcript_id of the data, bar-separated. If this is not the case, please use the `filter_by_biotype`
function from the `pytximport.utils` module instead. Defaults to None.
function from the `pytximport.utils` module instead. Please note that the abundance will NOT be recalculated
after filtering to avoid introducing bias. If you wish to recalculate the abundance, please use the
`filter_by_biotype` function from the `pytximport.utils` module instead. Defaults to None.
Returns:
Union[xr.Dataset, ad.AnnData, SummarizedExperiment, None]: The estimated gene-level or transcript-level
Expand Down Expand Up @@ -409,7 +411,9 @@ def tximport(

if biotype_filter is not None:
transcript_data = filter_by_biotype(
transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
transcript_data,
biotype_filter=biotype_filter,
id_column=("gene_id" if gene_level else "transcript_id"),
)

# Remove appended gene names after underscore for RSEM data for both transcript and gene ids
Expand Down
2 changes: 1 addition & 1 deletion pytximport/importers/_read_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def read_tsv(
usecols.append(counts_column)
dtype[counts_column] = np.float64

if abundance_column is not None:
if abundance_column is not None and abundance_column not in usecols:
usecols.append(abundance_column)
dtype[abundance_column] = np.float64

Expand Down
12 changes: 7 additions & 5 deletions pytximport/utils/_filter_by_biotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,11 @@ def filter_by_biotype(
# quantification tools include the biotype in the transcript_id
# This only works if the transcript_id contains the biotype as one of the bar-separated fields and is not the
# best way to filter the transcripts by biotype
assert any(
"|" in transcript_id for transcript_id in transcript_ids
), "The transcript_id does not contain the biotype."
assert any("|" in transcript_id for transcript_id in transcript_ids), (
"The transcript_id column does not contain the biotype. Please use the `pytximport.utils.filter_by_biotype`"
" function with the `transcript_gene_map` argument instead. This function can be called after the"
" `tximport` function using the resulting AnnData object or xarray Dataset."
)

transcript_id_fields = [transcript_id.split("|") for transcript_id in transcript_ids]

Expand Down Expand Up @@ -99,7 +101,7 @@ def filter_by_biotype(

elif isinstance(transcript_data, ad.AnnData):
# Calculate the total abundance before filtering
total_abundance = transcript_data.obsm["abundance"].sum(axis=0)
total_abundance = transcript_data.obsm["abundance"].sum(axis=1)
transcript_data = transcript_data[:, transcript_keep_boolean]

log(
Expand All @@ -112,7 +114,7 @@ def filter_by_biotype(
log(25, "Recalculating the abundance after filtering by biotype.")

if isinstance(transcript_data, ad.AnnData):
new_abundance = transcript_data.obsm["abundance"].sum(axis=0)
new_abundance = transcript_data.obsm["abundance"].sum(axis=1)
ratio = total_abundance / new_abundance
transcript_data.obsm["abundance"] = (transcript_data.obsm["abundance"].T * ratio).T
elif isinstance(transcript_data, xr.Dataset):
Expand Down
15 changes: 15 additions & 0 deletions test/data/piscem/res_oarfish_naming.quant
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
tname len num_reads
ENST00000513300.5 1924 102.328
ENST00000282507.7 2355 1592.02
ENST00000504685.5 1476 68.6528
ENST00000243108.4 1733 343.499
ENST00000303450.4 1516 664
ENST00000243082.4 2039 55
ENST00000303406.4 1524 304.189
ENST00000303460.4 1936 47
ENST00000243056.4 2423 42
ENST00000312492.2 1805 228
ENST00000040584.5 1889 4295
ENST00000430889.2 1666 623.628
ENST00000394331.3 2943 85.6842
ENST00000243103.3 3335 962
54 changes: 37 additions & 17 deletions test/test_biotype_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List

import anndata as ad
import numpy as np
import xarray as xr

from pytximport import tximport
Expand Down Expand Up @@ -31,22 +32,41 @@ def test_biotype_filter(
return_transcript_data=transcript_level,
)

id_column = "transcript_id" if transcript_level else "gene_id"
for recalculate_abundance in [True, False]:
id_column = "transcript_id" if transcript_level else "gene_id"

result_filtered = filter_by_biotype(
result,
transcript_gene_map_mouse,
biotype_filter=["protein_coding"],
id_column=id_column,
)
result_filtered = filter_by_biotype(
result,
transcript_gene_map_mouse,
biotype_filter=["protein_coding"],
id_column=id_column,
recalculate_abundance=recalculate_abundance,
)

if output_type == "anndata":
assert isinstance(result_filtered, ad.AnnData)
assert len(result_filtered.var_names) > 0, "No genes were retained."
assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained."

if recalculate_abundance:
np.testing.assert_allclose(
result_filtered.obsm["abundance"].sum(axis=1),
result.obsm["abundance"].sum(axis=1),
rtol=1e-4,
atol=1,
)

else:
assert isinstance(result_filtered, xr.Dataset)
assert len(result_filtered.coords[id_column]) > 0, "No genes were retained."
assert len(result_filtered.coords[id_column]) < len(
result.coords[id_column]
), "All genes were retained."

if output_type == "anndata":
assert isinstance(result_filtered, ad.AnnData)
assert len(result_filtered.var_names) > 0, "No genes were retained."
assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained."
else:
assert isinstance(result_filtered, xr.Dataset)
assert len(result_filtered.coords[id_column]) > 0, "No genes were retained."
assert len(result_filtered.coords[id_column]) < len(
result.coords[id_column]
), "All genes were retained."
if recalculate_abundance:
np.testing.assert_allclose(
result_filtered["abundance"].sum(axis=0),
result["abundance"].sum(axis=0),
rtol=1e-4,
atol=1,
)
2 changes: 1 addition & 1 deletion test/test_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_correctness_inferential_replicates(
ignore_transcript_version=True,
ignore_after_bar=True,
output_type="xarray",
counts_from_abundance=None, # type: ignore
counts_from_abundance=None,
)

assert isinstance(result, xr.Dataset), "The result is not an xarray Dataset."
Expand Down
21 changes: 17 additions & 4 deletions test/test_piscem.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
"""Test importing salmon quantification files."""

from pathlib import Path
from typing import List

import anndata as ad
import pandas as pd
import xarray as xr

from pytximport import tximport
from pytximport.utils import biotype_filters


def test_piscem(
Expand All @@ -28,7 +25,23 @@ def test_piscem(
output_type="anndata",
ignore_transcript_version=True,
ignore_after_bar=True,
counts_from_abundance=None, # type: ignore
counts_from_abundance=None,
)

# Check that the result is an AnnData object
assert isinstance(result, ad.AnnData)

# Check that the counts.data are all positive
assert (result.X >= 0).all()

result = tximport(
[piscem_file.parent / "res_oarfish_naming.quant"],
"oarfish",
transcript_gene_mapping_human,
output_type="anndata",
ignore_transcript_version=True,
ignore_after_bar=True,
counts_from_abundance=None,
)

# Check that the result is an AnnData object
Expand Down
27 changes: 21 additions & 6 deletions test/test_transcript_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import anndata as ad
import pandas as pd
import xarray as xr

from pytximport import tximport
from pytximport.utils import biotype_filters, replace_transcript_ids_with_names
Expand All @@ -22,7 +23,7 @@ def test_salmon_transcript_level(
transcript_name_mapping_human (pd.DataFrame): The mapping of transcript ids to transcript names.
transcript_name_mapping_human_path (Path): The path to the transcript name mapping.
"""
result = tximport(
result_ad = tximport(
[salmon_file],
"salmon",
return_transcript_data=True,
Expand All @@ -32,14 +33,14 @@ def test_salmon_transcript_level(
counts_from_abundance="scaled_tpm",
)

assert isinstance(result, ad.AnnData)
counts = result.X
assert isinstance(result_ad, ad.AnnData)
counts = result_ad.X

# Check that the counts.data are all positive
assert (counts >= 0).all()

# replace the transcript ids with the transcript names
result_df = replace_transcript_ids_with_names(result, transcript_name_mapping_human)
result_df = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human)

# Check that the result is an AnnData object
assert isinstance(result_df, ad.AnnData)
Expand All @@ -55,10 +56,24 @@ def test_salmon_transcript_level(
)

# Check that it also works with a path to the transcript name mapping
result_path = replace_transcript_ids_with_names(result, transcript_name_mapping_human_path)
result_replaced_ad = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human_path)

# Check that the results are the same
pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_path.var_names.to_frame())
pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_replaced_ad.var_names.to_frame())

# Check with an xarray Dataset
result_xr = tximport(
[salmon_file],
"salmon",
return_transcript_data=True,
output_type="xarray",
ignore_transcript_version=True,
ignore_after_bar=True,
counts_from_abundance="scaled_tpm",
)

result_replaced_xr = replace_transcript_ids_with_names(result_xr, transcript_name_mapping_human)
assert isinstance(result_replaced_xr, xr.Dataset)


def test_dtu_scaled_tpm(
Expand Down

0 comments on commit 2137d76

Please sign in to comment.