Skip to content

Commit

Permalink
Add piscem test file and improve unit test coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
maltekuehl committed Sep 14, 2024
1 parent dc7bee4 commit 17d0af7
Show file tree
Hide file tree
Showing 11 changed files with 278,370 additions and 36 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "pytximport"
description = "A python implementation of tximport to transform transcript into gene counts"
description = "A python implementation of `tximport` to transform transcript into gene counts"
requires-python = ">=3.9"
license = { file = "LICENSE" }
authors = [{ name = "Malte Kuehl", email = "[email protected]" }]
Expand All @@ -25,7 +25,7 @@ classifiers = [
]
dynamic = ["version"]
dependencies = [
"anndata>=0.9.0",
"anndata>=0.8.0",
"click>=8.0.0,<9",
"flox>=0.9.0,<0.10.0",
"h5py>=3.0.0,<4",
Expand Down
15 changes: 12 additions & 3 deletions pytximport/core/_tximport.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,18 @@ def tximport(

importer = read_tsv
elif data_type == "piscem":
id_column = "Name" if id_column is None else id_column
counts_column = "NumReads" if counts_column is None else counts_column
length_column = "EffectiveLength" if length_column is None else length_column
warning(
(
"Assuming a piscem-infer .quant file with columns: target_name, ecount, eeln, tpm. "
"This differs from the assumed columns in the original tximport package. "
"If you encounter issues, please provide the column names explicitly."
)
)

id_column = "target_name" if id_column is None else id_column
counts_column = "ecount" if counts_column is None else counts_column
length_column = "eeln" if length_column is None else length_column
abundance_column = "tpm" if abundance_column is None else abundance_column

importer = read_tsv
elif data_type == "stringtie":
Expand Down
11 changes: 0 additions & 11 deletions pytximport/importers/_read_salmon.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,6 @@ def read_salmon(
if not file_path.suffix == ".sf" and not file_path.suffix == ".gz":
raise ImportError("Only .sf and .gz files are supported.")

# Unzip the file if it is compressed
if file_path.suffix == ".gz":
try:
with gzip.open(file_path, "rt") as f:
file_content = f.read()
file_path = file_path.with_suffix(".sf")
with open(file_path, "w") as f:
f.write(file_content)
except Exception as e:
raise ImportError(f"Could not unzip the file: {file_path}") from e

transcript_data = read_tsv(
file_path,
id_column=id_column,
Expand Down
27 changes: 22 additions & 5 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def salmon_file() -> Path:
return Path(FILE_DIR) / "salmon" / "quant.sf"


@pytest.fixture(scope="session")
def salmon_file_gzip() -> Path:
    """Provide the path to the gzip-compressed salmon quantification file."""
    # Same location as the plain-text salmon fixture file, but with the
    # additional ``.gz`` suffix.
    gzip_quant_path = Path(FILE_DIR, "salmon", "quant.sf.gz")
    return gzip_quant_path


@pytest.fixture(scope="session")
def salmon_multiple_files() -> List[Path]:
"""Create multiple salmon quantification files."""
Expand All @@ -100,6 +106,12 @@ def salmon_multiple_files() -> List[Path]:
return file_paths


@pytest.fixture(scope="session")
def piscem_file() -> Path:
    """Provide the path to the piscem ``res.quant`` quantification file."""
    quant_path = Path(FILE_DIR, "piscem", "res.quant")
    return quant_path


@pytest.fixture(scope="session")
def fabry_disease_files() -> List[Path]:
"""Output the paths to the fabry disease files."""
Expand All @@ -120,15 +132,20 @@ def fabry_disease_files() -> List[Path]:
@pytest.fixture(scope="session")
def transcript_name_mapping_human() -> pd.DataFrame:
    """Provide a transcript id to transcript name mapping for human samples.

    The mapping is read from ``transcript_name_mapping_human.tsv`` below
    ``FILE_DIR``, with the first row of the file used as the column header.

    Returns:
        pd.DataFrame: The table exactly as parsed by ``pd.read_table``.
    """
    # Only the local TSV is read. The earlier Ensembl BioMart query was dropped
    # because its result was overwritten by this read before ever being used,
    # so it only added a network round-trip to the test session.
    return pd.read_table(
        Path(FILE_DIR) / "transcript_name_mapping_human.tsv",
        header=0,
    )


@pytest.fixture(scope="session")
def transcript_name_mapping_human_path() -> Path:
    """Provide the path to the transcript id to transcript name mapping for human samples."""
    mapping_path = Path(FILE_DIR, "transcript_name_mapping_human.tsv")
    return mapping_path


@pytest.fixture(scope="session")
def gtf_annotation_file() -> Path:
"""Provide the path to a GTF annotation file."""
Expand Down
15 changes: 15 additions & 0 deletions test/data/piscem/res.quant
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
target_name len eeln tpm ecount
ENST00000513300.5 1924 1746.98 11129.2 102.328
ENST00000282507.7 2355 2177.98 138884 1592.02
ENST00000504685.5 1476 1298.98 10041.8 68.6528
ENST00000243108.4 1733 1555.98 41944.9 343.499
ENST00000303450.4 1516 1338.98 94221.8 664
ENST00000243082.4 2039 1861.98 5612.36 55
ENST00000303406.4 1524 1346.98 42908.2 304.189
ENST00000303460.4 1936 1758.98 5076.85 47
ENST00000243056.4 2423 2245.98 3553.05 42
ENST00000312492.2 1805 1627.98 26609.9 228
ENST00000040584.5 1889 1711.98 476675 4295
ENST00000430889.2 1666 1488.98 79578.2 623.628
ENST00000394331.3 2943 2765.98 5885.85 85.6842
ENST00000243103.3 3335 3157.98 57879.3 962
Binary file added test/data/salmon/quant.sf.gz
Binary file not shown.
Loading

0 comments on commit 17d0af7

Please sign in to comment.