Allow .csv files for transcript to gene maps, fix dependencies for pi…

…p, require Python 3.9
complextissue · Sep 11, 2024 · 10bb27d · 10bb27d
1 parent 09b6336
commit 10bb27d
Show file tree

Hide file tree

Showing 16 changed files with 246 additions and 85 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        python-version: [3.11.4]
+        python-version: [3.9.16]
 
     steps:
       - uses: actions/checkout@v3

diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.11.2
+3.9
diff --git a/CITATION.cff b/CITATION.cff
@@ -6,6 +6,6 @@ authors:
   - family-names: "Puelles"
     given-names: "Victor"
 title: "pytximport: Gene count estimation from transcript quantification files in Python"
-version: 0.8.0
+version: 0.9.0
 date-released: 2024-07-11
 url: "https://github.com/complextissue/pytximport"
diff --git a/INSTALL.md b/INSTALL.md
@@ -1,7 +1,7 @@
 # Dependencies
 
 To fulfill all dependencies for this project, **all** of the following steps are required.
-`pytximport` only targets support for `python` versions greater than or equal `3.8`.
+`pytximport` only targets support for `python` versions greater than or equal to `3.9`.
 
 ## Installation for `pytximport`
 

diff --git a/README.md b/README.md
@@ -15,6 +15,13 @@
 
 ## Installation
 
+The recommended way to install `pytximport` is through Bioconda:
+
+```bash
+mamba install -c bioconda pytximport
+```
+
+`pytximport` can also be installed via pip:
 ```bash
 pip install pytximport
 ```
@@ -71,7 +78,7 @@ The `tximport` package has become a main stay in the bulk RNA sequencing communi
 Please cite both the original publication as well as this Python implementation:
 
 - Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Gene count estimation from transcript quantification files in Python (Version 0.8.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Gene count estimation from transcript quantification files in Python (Version 0.9.0) [Computer software]. https://github.com/complextissue/pytximport
 
 ## License
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -18,7 +18,7 @@
 author = "Malte Kuehl"
 
 # The full version, including alpha/beta/rc tags
-release = "0.8.0"
+release = "0.9.0"
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
diff --git a/docs/source/installation.md b/docs/source/installation.md
@@ -3,11 +3,12 @@
 :::{card}
 :class-card: sd-bg-warning
 :class-body: sd-bg-text-warning
-**pytximport** only supports Python versions greater than or equal to **3.8**.
+**pytximport** only supports Python versions greater than or equal to **3.9**.
 :::
 
 :::{card} Recommendation
-Installation via `pip` is the easiest and recommended way to include `pytximport` in your projects.
+Installation via `Bioconda` is the recommended way to include `pytximport` in your projects. A `pip`-installable package
+is also provided.
 :::
 
 ## Installation Options
@@ -16,20 +17,20 @@ Choose an option to install this package.
 
 ::::{tab-set}
 
-:::{tab-item} PyPi
-Install `pytximport` package using `pip`:
+:::{tab-item} Bioconda
+Install `pytximport` from `Bioconda` using `mamba` or `conda`:
 
 ```bash
-python3 -m pip install pytximport
+mamba install -c bioconda pytximport
 ```
 
 :::
 
-:::{tab-item} Bioconda
-Install `pytximport` from `Bioconda` using `mamba` or `conda`:
+:::{tab-item} PyPi
+Install `pytximport` package using `pip`:
 
 ```bash
-mamba install -c bioconda pytximport
+python3 -m pip install pytximport
 ```
 
 :::
@@ -44,12 +45,16 @@ python3 -m pip install git+https://github.com/complextissue/pytximport.git
 :::
 
 :::{tab-item} Source
+This option is only recommended for potential contributors and installs additional development dependencies.
 Install `pytximport` from source:
 
 ```bash
 git clone https://github.com/complextissue/pytximport.git
 cd pytximport
-make install
+pyenv local 3.9
+make create-venv
+source .venv/bin/activate
+make install-dev
 ```
 
 :::

diff --git a/docs/source/start.md b/docs/source/start.md
@@ -69,7 +69,7 @@ The `tximport` package has become a main stay in the bulk RNA sequencing communi
 Please cite both the original publication as well as this Python implementation:
 
 - Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Gene count estimation from transcript quantification files in Python (Version 0.8.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Gene count estimation from transcript quantification files in Python (Version 0.9.0) [Computer software]. https://github.com/complextissue/pytximport
 
 ## License
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "flit_core.buildapi"
 [project]
 name = "pytximport"
 description = "A python implementation of tximport to transform transcript into gene counts"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = { file = "LICENSE" }
 authors = [{ name = "Malte Kuehl", email = "[email protected]" }]
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -17,7 +17,6 @@ classifiers = [
     "Natural Language :: English",
     "Typing :: Typed",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -26,16 +25,15 @@ classifiers = [
 ]
 dynamic = ["version"]
 dependencies = [
-    "anndata",
-    "click",
-    "dask",
-    "flox",
-    "h5py",
-    "numpy",
-    "pandas",
-    "pybiomart",
-    "tqdm",
-    "xarray",
+    "anndata>=0.9.0,<1",
+    "click>=8.0.0,<9",
+    "flox>=0.9.0,<0.10.0",
+    "h5py>=3.0.0,<4",
+    "numpy>=1.19.0,<3",
+    "pandas>=2.2.0,<3",
+    "pybiomart==0.2.0",
+    "tqdm>=4.0.0,<5",
+    "xarray>=2024.0.0",
 ]
 
 [project.optional-dependencies]
@@ -75,7 +73,7 @@ pytximport = "pytximport:cli"
 [project.urls]
 Home = "https://github.com/complextissue/pytximport"
 Source = "https://github.com/complextissue/pytximport"
-Documentation = "https://pytximport.readthedocs.io/en/latest/"
+Documentation = "https://pytximport.readthedocs.io/en/stable/"
 
 [tool.flit.sdist]
 exclude = ["docs/*", "test/*"]
@@ -86,7 +84,7 @@ src_paths = ["pytximport", "test"]
 
 [tool.black]
 line-length = 120
-target-version = ["py38", "py39", "py310", "py311"]
+target-version = ["py39", "py310", "py311"]
 
 [tool.mypy]
 warn_return_any = true

diff --git a/pytximport/_version.py b/pytximport/_version.py
@@ -1,4 +1,4 @@
 """Version information for the pytximport package."""
 
 # This package will follow Semantic Versioning after version 1.0.0: https://semver.org/
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -56,7 +56,7 @@ def tximport(
             of quantification files. Defaults to "salmon".
         transcript_gene_map (Optional[Union[pd.DataFrame, Union[str, Path]], optional): The mapping from transcripts to
             genes. Has to contain two columns: `transcript_id` and `gene_id`. If you provide a path to a file, it has to
-            be a tab-separated file with a header. Defaults to None.
+            be either a tab-separated (.tsv) or comma-separated (.csv) file with a header. Defaults to None.
         counts_from_abundance (Optional[Literal["scaled_tpm", "length_scaled_tpm", "dtu_scaled_tpm"]], optional):
             Whether to calculate count estimates based on the abundance. When using scaled_tpm or length_scaled_tpm the
             counts no longer correlate with the average transcript length per sample. In those cases, the length
@@ -122,13 +122,21 @@ def tximport(
 
     # read the transcript to gene mapping
     if isinstance(transcript_gene_map, str) or isinstance(transcript_gene_map, Path):
+        transcript_gene_map = Path(transcript_gene_map)
+        if not transcript_gene_map.exists():
+            raise FileNotFoundError(f"The transcript to gene mapping does not exist: {transcript_gene_map}")
+
         try:
-            transcript_gene_map = pd.read_table(transcript_gene_map, header=0)
+            if transcript_gene_map.suffix == ".csv":
+                transcript_gene_map = pd.read_csv(transcript_gene_map, header=0)
+            else:
+                transcript_gene_map = pd.read_table(transcript_gene_map, header=0)
         except Exception as exception:
             raise ValueError(f"Could not read the transcript to gene mapping: {exception}")
 
-    if isinstance(transcript_gene_map, pd.DataFrame):
+    if transcript_gene_map is not None:
         # assert that transcript_id and gene_id are present in the mapping
+        assert isinstance(transcript_gene_map, pd.DataFrame), "The mapping must be a DataFrame."
         assert "transcript_id" in transcript_gene_map.columns, "The mapping does not contain a `transcript_id` column."
         assert "gene_id" in transcript_gene_map.columns, "The mapping does not contain a `gene_id` column."
 
@@ -142,7 +150,9 @@ def tximport(
 
     # assert that return_transcript_data is True if transcript_gene_map is None
     if transcript_gene_map is None:
-        assert return_transcript_data, "A transcript to gene mapping must be provided when summarizing to genes."
+        assert (
+            return_transcript_data and not gene_level
+        ), "A transcript to gene mapping must be provided when summarizing to genes."
 
     if gene_level and data_type != "rsem":
         raise ValueError("Gene-level imports are only available for RSEM quantification files.")

diff --git a/requirements.frozen.txt b/requirements.frozen.txt
@@ -0,0 +1,68 @@
+anndata==0.10.9
+array_api_compat==1.8
+attrs==24.2.0
+build==1.2.2
+CacheControl==0.14.0
+cattrs==24.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+cleo==2.1.0
+click==8.1.7
+crashtest==0.4.1
+distlib==0.3.8
+docutils==0.21.2
+dulwich==0.21.7
+exceptiongroup==1.2.2
+fastjsonschema==2.20.0
+filelock==3.16.0
+flit==3.9.0
+flit_core==3.9.0
+flox==0.9.10
+future==1.0.0
+h5py==3.11.0
+idna==3.8
+importlib_metadata==8.4.0
+installer==0.7.0
+jaraco.classes==3.4.0
+keyring==24.3.1
+more-itertools==10.5.0
+msgpack==1.1.0
+natsort==8.4.0
+numpy==2.0.2
+numpy-groupies==0.11.2
+packaging==24.1
+pandas==2.2.2
+pexpect==4.9.0
+pkginfo==1.11.1
+platformdirs==4.3.2
+poetry==1.8.3
+poetry-core==1.9.0
+poetry-plugin-export==1.8.0
+ptyprocess==0.7.0
+pybiomart==0.2.0
+pycparser==2.22
+pyproject_hooks==1.1.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+rapidfuzz==3.9.7
+requests==2.32.3
+requests-cache==1.2.1
+requests-toolbelt==1.0.0
+scipy==1.13.1
+shellingham==1.5.4
+six==1.16.0
+tomli==2.0.1
+tomli_w==1.0.0
+tomlkit==0.13.2
+toolz==0.12.1
+tqdm==4.66.5
+trove-classifiers==2024.7.2
+typing_extensions==4.12.2
+tzdata==2024.1
+url-normalize==1.4.3
+urllib3==2.2.2
+virtualenv==20.26.4
+xarray==2024.7.0
+xattr==1.1.0
+zipp==3.20.1
diff --git a/...disease/transcript_gene_mapping_human.csv → ...disease/transcript_gene_mapping_human.tsv b/...disease/transcript_gene_mapping_human.csv → ...disease/transcript_gene_mapping_human.tsv
diff --git a/test/test_comparison.ipynb b/test/test_comparison.ipynb
@@ -228,7 +228,7 @@
     "library(tximport)\n",
     "library(readr)\n",
     "dir <- \"./data/fabry_disease\"\n",
-    "tx2gene <- read_tsv(file.path(dir, \"transcript_gene_mapping_human.csv\"))\n",
+    "tx2gene <- read_tsv(file.path(dir, \"transcript_gene_mapping_human.tsv\"))\n",
     "rowMedians <- function(x) {\n",
     "    apply(x, 1, median, na.rm = TRUE)\n",
     "}\n",
@@ -376,9 +376,9 @@
     }
    ],
    "source": [
-    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -ow -o ./data/fabry_disease/counts_pytximport_no.csv\n",
-    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -ow -o ./data/fabry_disease/counts_pytximport_scaledTPM.csv -c scaled_tpm\n",
-    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -ow -o ./data/fabry_disease/counts_pytximport_lengthScaledTPM.csv -c length_scaled_tpm"
+    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.tsv -ow -o ./data/fabry_disease/counts_pytximport_no.csv\n",
+    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.tsv -ow -o ./data/fabry_disease/counts_pytximport_scaledTPM.csv -c scaled_tpm\n",
+    "!pytximport -i ./data/fabry_disease/SRR16504309_wt/quant.sf -i ./data/fabry_disease/SRR16504310_wt/quant.sf -i ./data/fabry_disease/SRR16504311_ko/quant.sf -i ./data/fabry_disease/SRR16504312_ko/quant.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.tsv -ow -o ./data/fabry_disease/counts_pytximport_lengthScaledTPM.csv -c length_scaled_tpm"
    ]
   },
   {
@@ -453,7 +453,7 @@
     "files_protein_coding <- c(\n",
     "  file.path(dir, \"quant.sf\")\n",
     ")\n",
-    "tx2gene <- read_tsv(file.path(\"./data/fabry_disease\", \"transcript_gene_mapping_human.csv\"))\n",
+    "tx2gene <- read_tsv(file.path(\"./data/fabry_disease\", \"transcript_gene_mapping_human.tsv\"))\n",
     "countsFromAbundanceOptions <- c(\"scaledTPM\", \"dtuScaledTPM\")\n",
     "for (idx in seq_along(countsFromAbundanceOptions)) {\n",
     "    txi <- tximport(\n",
@@ -505,7 +505,7 @@
     }
    ],
    "source": [
-    "!pytximport -i ./data/salmon/quant.sf -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/salmon/counts_pytximport_dtuScaledTPM.csv -t salmon -tx -c dtu_scaled_tpm"
+    "!pytximport -i ./data/salmon/quant.sf -m ./data/fabry_disease/transcript_gene_mapping_human.tsv -o ./data/salmon/counts_pytximport_dtuScaledTPM.csv -t salmon -tx -c dtu_scaled_tpm"
    ]
   },
   {
@@ -568,7 +568,7 @@
     "files_protein_coding <- c(\n",
     "  file.path(dir, \"test.genes.results.gz\")\n",
     ")\n",
-    "tx2gene <- read_tsv(file.path(\"./data/fabry_disease\", \"transcript_gene_mapping_human.csv\"))\n",
+    "tx2gene <- read_tsv(file.path(\"./data/fabry_disease\", \"transcript_gene_mapping_human.tsv\"))\n",
     "countsFromAbundanceOptions <- c(\"no\")\n",
     "for (idx in seq_along(countsFromAbundanceOptions)) {\n",
     "    txi <- tximport(\n",

diff --git a/test/test_correctness.py b/test/test_correctness.py
@@ -24,7 +24,7 @@ def test_correctness(
         result = tximport(
             fabry_disease_files,
             "salmon",
-            fabry_directory / "transcript_gene_mapping_human.csv",
+            fabry_directory / "transcript_gene_mapping_human.tsv",
             ignore_transcript_version=True,
             ignore_after_bar=True,
             output_type="xarray",
@@ -80,7 +80,7 @@ def test_correctness_transcript_level(
         result = tximport(
             [salmon_file],
             "salmon",
-            data_directory / "fabry_disease" / "transcript_gene_mapping_human.csv",
+            data_directory / "fabry_disease" / "transcript_gene_mapping_human.tsv",
             return_transcript_data=True,
             ignore_transcript_version=True,
             ignore_after_bar=True,
@@ -133,7 +133,7 @@ def test_correctness_gene_level(
     result = tximport(
         rsem_files,
         "rsem",
-        data_directory / "fabry_disease" / "transcript_gene_mapping_human.csv",
+        data_directory / "fabry_disease" / "transcript_gene_mapping_human.tsv",
         gene_level=True,
         ignore_transcript_version=True,
         ignore_after_bar=True,
@@ -180,7 +180,7 @@ def test_correctness_inferential_replicates(
             result = tximport(
                 fabry_disease_files,
                 data_type,  # type: ignore
-                fabry_directory / "transcript_gene_mapping_human.csv",
+                fabry_directory / "transcript_gene_mapping_human.tsv",
                 return_transcript_data=return_transcript_data,
                 inferential_replicates=True,
                 inferential_replicate_transformer=lambda x: np.median(x, axis=1),