Add piscem test file and improve unit test coverage

complextissue · Sep 14, 2024 · 17d0af7 · 17d0af7
1 parent dc7bee4
commit 17d0af7
Show file tree

Hide file tree

Showing 11 changed files with 278,370 additions and 36 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "pytximport"
-description = "A python implementation of tximport to transform transcript into gene counts"
+description = "A python implementation of `tximport` to transform transcript into gene counts"
 requires-python = ">=3.9"
 license = { file = "LICENSE" }
 authors = [{ name = "Malte Kuehl", email = "[email protected]" }]
@@ -25,7 +25,7 @@ classifiers = [
 ]
 dynamic = ["version"]
 dependencies = [
-    "anndata>=0.9.0",
+    "anndata>=0.8.0",
     "click>=8.0.0,<9",
     "flox>=0.9.0,<0.10.0",
     "h5py>=3.0.0,<4",

diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -199,9 +199,18 @@ def tximport(
 
         importer = read_tsv
     elif data_type == "piscem":
-        id_column = "Name" if id_column is None else id_column
-        counts_column = "NumReads" if counts_column is None else counts_column
-        length_column = "EffectiveLength" if length_column is None else length_column
+        warning(
+            (
+                "Assuming a piscem-infer .quant file with columns: target_name, ecount, eeln, tpm. "
+                "This differs from the assumed columns in the original tximport package. "
+                "If you encounter issues, please provide the column names explicitly."
+            )
+        )
+
+        id_column = "target_name" if id_column is None else id_column
+        counts_column = "ecount" if counts_column is None else counts_column
+        length_column = "eeln" if length_column is None else length_column
+        abundance_column = "tpm" if abundance_column is None else abundance_column
 
         importer = read_tsv
     elif data_type == "stringtie":

diff --git a/pytximport/importers/_read_salmon.py b/pytximport/importers/_read_salmon.py
@@ -121,17 +121,6 @@ def read_salmon(
     if not file_path.suffix == ".sf" and not file_path.suffix == ".gz":
         raise ImportError("Only .sf and .gz files are supported.")
 
-    # Unzip the file if it is compressed
-    if file_path.suffix == ".gz":
-        try:
-            with gzip.open(file_path, "rt") as f:
-                file_content = f.read()
-            file_path = file_path.with_suffix(".sf")
-            with open(file_path, "w") as f:
-                f.write(file_content)
-        except Exception as e:
-            raise ImportError(f"Could not unzip the file: {file_path}") from e
-
     transcript_data = read_tsv(
         file_path,
         id_column=id_column,

diff --git a/test/conftest.py b/test/conftest.py
@@ -89,6 +89,12 @@ def salmon_file() -> Path:
     return Path(FILE_DIR) / "salmon" / "quant.sf"
 
 
+@pytest.fixture(scope="session")
+def salmon_file_gzip() -> Path:
+    """Provide the path to a gzip-compressed salmon quantification file."""
+    return Path(FILE_DIR) / "salmon" / "quant.sf.gz"
+
+
 @pytest.fixture(scope="session")
 def salmon_multiple_files() -> List[Path]:
     """Create multiple salmon quantification files."""
@@ -100,6 +106,12 @@ def salmon_multiple_files() -> List[Path]:
     return file_paths
 
 
+@pytest.fixture(scope="session")
+def piscem_file() -> Path:
+    """Provide the path to a piscem quantification file."""
+    return Path(FILE_DIR) / "piscem" / "res.quant"
+
+
 @pytest.fixture(scope="session")
 def fabry_disease_files() -> List[Path]:
     """Output the paths to the fabry disease files."""
@@ -120,15 +132,20 @@ def fabry_disease_files() -> List[Path]:
 @pytest.fixture(scope="session")
 def transcript_name_mapping_human() -> pd.DataFrame:
     """Provide a transcript id to transcript name mapping for human samples."""
-    from pybiomart import Dataset
-
-    dataset = Dataset(name="hsapiens_gene_ensembl", host="http://www.ensembl.org")
-    transcript_name_mapping_human = dataset.query(attributes=["ensembl_transcript_id", "external_transcript_name"])
-    transcript_name_mapping_human.columns = ["transcript_id", "transcript_name"]
+    transcript_name_mapping_human = pd.read_table(
+        Path(FILE_DIR) / "transcript_name_mapping_human.tsv",
+        header=0,
+    )
 
     return transcript_name_mapping_human
 
 
+@pytest.fixture(scope="session")
+def transcript_name_mapping_human_path() -> Path:
+    """Provide the path to the transcript id to transcript name mapping file for human samples."""
+    return Path(FILE_DIR) / "transcript_name_mapping_human.tsv"
+
+
 @pytest.fixture(scope="session")
 def gtf_annotation_file() -> Path:
     """Provide the path to a GTF annotation file."""

diff --git a/test/data/piscem/res.quant b/test/data/piscem/res.quant
@@ -0,0 +1,15 @@
+target_name	len	eeln	tpm	ecount
+ENST00000513300.5	1924	1746.98	11129.2	102.328
+ENST00000282507.7	2355	2177.98	138884	1592.02
+ENST00000504685.5	1476	1298.98	10041.8	68.6528
+ENST00000243108.4	1733	1555.98	41944.9	343.499
+ENST00000303450.4	1516	1338.98	94221.8	664
+ENST00000243082.4	2039	1861.98	5612.36	55
+ENST00000303406.4	1524	1346.98	42908.2	304.189
+ENST00000303460.4	1936	1758.98	5076.85	47
+ENST00000243056.4	2423	2245.98	3553.05	42
+ENST00000312492.2	1805	1627.98	26609.9	228
+ENST00000040584.5	1889	1711.98	476675	4295
+ENST00000430889.2	1666	1488.98	79578.2	623.628
+ENST00000394331.3	2943	2765.98	5885.85	85.6842
+ENST00000243103.3	3335	3157.98	57879.3	962
diff --git a/test/data/salmon/quant.sf.gz b/test/data/salmon/quant.sf.gz