complextissue · maltekuehl · Nov 27, 2024 · Nov 22, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/.gitignore b/.gitignore
@@ -82,3 +82,4 @@ test/data/fabry_disease/counts_pytximport.csv
 test/data/salmon/quant.h5ad
 .pybiomart.sqlite
 test/data/salmon/counts_pytximport_dtuScaledTPM.csv
+docs/source/transcript_gene_map.tsv
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -6,7 +6,7 @@ build:
 
 sphinx:
    configuration: docs/source/conf.py
-   fail_on_warning: true
+   fail_on_warning: false
 
 python:
   install:

diff --git a/CITATION.cff b/CITATION.cff
@@ -64,6 +64,5 @@ keywords:
   - Python
   - scverse
 license: GPL-3.0-or-later
-commit: d9533f5
-version: 0.10.0
-date-released: '2024-11-20'
+version: 0.11.0
+date-released: '2024-11-23'
diff --git a/README.md b/README.md
@@ -8,7 +8,8 @@
 [![Documentation Status](https://readthedocs.org/projects/pytximport/badge/?version=latest)](https://pytximport.readthedocs.io/en/latest/?badge=latest)
 [![Codecov](https://codecov.io/gh/complextissue/pytximport/graph/badge.svg?token=M9JEHJVXYI)](https://codecov.io/gh/complextissue/pytximport)
 [![Install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pytximport/README.html)
-![PyPI - Downloads](https://img.shields.io/pypi/dm/pytximport)
+![Conda Downloads](https://img.shields.io/conda/d/bioconda/pytximport)
+![Pepy Total Downloads](https://img.shields.io/pepy/dt/pytximport?label=PyPI%7Cdownloads)
 [![Python Version Required](https://img.shields.io/pypi/pyversions/pytximport)](https://pypi.org/project/pytximport/)
 [![Code Style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
@@ -83,9 +84,18 @@ Common options are:
 - `-tx`: Provide this flag to return transcript-level instead of gene-summarized data. Incompatible with gene-level input and `counts_from_abundance=length_scaled_tpm`.
 - `--help`: Display all configuration options.
 
-## Development status
+Transcript-to-gene mappings can also be generated from the command line:
 
-`pytximport` is still in development and has not yet reached version 1.0.0 in the [SemVer](https://semver.org/) versioning scheme. While it should work for almost all use cases and we regularly compare outputs against the R implementation, breaking changes between minor versions may occur. If you encounter any problems, please open a GitHub issue. If you are a Python developer, we welcome pull requests implementing missing features, adding more extensive unit tests and bug fixes.
+```bash
+pytximport create-map -i ./data/annotation.gtf -o tx2gene.csv -ow
+```
+
+Command options are:
+
+- `-i`: The path to an annotation file in GTF format.
+- `-o`: The output path to save the resulting transcript-to-gene mapping to.
+- `-ow`: Provide this flag to overwrite an existing file at the output path.
+- `--help`: Display all configuration options.
 
 ## Motivation
 
@@ -116,7 +126,7 @@ Features unique to `pytximport`:
 - `SummarizedExperiment`-support to represent outputs in familiar Bioconductor data structures available through the [BiocPy](https://github.com/biocpy) ecosystem.
 - Saving outputs directly to file (use the `output_path` argument).
 - Removing transcript versions from **both** the quantification files and the transcript-to-gene map when `ignore_transcript_version` is provided.
-- Post-hoc biotype-filtering. Set `biotype_filter` to a whitelist of possible biotypes contained within the bar-separated values of your transcript ids.
+- Post-hoc biotype-filtering using `pytximport.utils.filter_by_biotype`.
 
 Features unique to `tximport`:
 - Alevin single-cell RNA-seq data support
@@ -155,6 +165,10 @@ make coverage-report
 The documentation can be build locally by navigating to the `docs` folder and running: `make html`.
 This requires that the development requirements of the package as well as the package itself have been installed in the same virtual environment and that `pandoc` has been added, e.g. by running `brew install pandoc` on macOS operating systems.
 
+## Development status
+
+`pytximport` is still in development and has not yet reached version 1.0.0 in the [SemVer](https://semver.org/) versioning scheme. While it should work for almost all use cases and we regularly compare outputs against the R implementation, breaking changes between minor versions may occur. If you encounter any problems, please open a GitHub issue. If you are a Python developer, we welcome pull requests that implement missing features, add more extensive unit tests, or fix bugs.
+
 ## Data sources
 
 The quantification files used for the unit tests are partly adopted from [tximportData](https://doi.org/doi:10.18129/B9.bioc.tximportData) which in turn used a subsample of the GEUVADIS data:

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -18,7 +18,7 @@
 author = "Malte Kuehl"
 
 # The full version, including alpha/beta/rc tags
-release = "0.10.0"
+release = "0.11.0"
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
diff --git a/docs/source/start.md b/docs/source/start.md
@@ -8,7 +8,8 @@
 [![Documentation Status](https://readthedocs.org/projects/pytximport/badge/?version=latest)](https://pytximport.readthedocs.io/en/latest/?badge=latest)
 [![Codecov](https://codecov.io/gh/complextissue/pytximport/graph/badge.svg?token=M9JEHJVXYI)](https://codecov.io/gh/complextissue/pytximport)
 [![Install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pytximport/README.html)
-![PyPI - Downloads](https://img.shields.io/pypi/dm/pytximport)
+![Conda Downloads](https://img.shields.io/conda/d/bioconda/pytximport)
+![Pepy Total Downloads](https://img.shields.io/pepy/dt/pytximport?label=PyPI%7Cdownloads)
 [![Python Version Required](https://img.shields.io/pypi/pyversions/pytximport)](https://pypi.org/project/pytximport/)
 [![Code Style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
@@ -70,9 +71,18 @@ Common options are:
 - `-tx`: Provide this flag to return transcript-level instead of gene-summarized data. Incompatible with gene-level input and `counts_from_abundance=length_scaled_tpm`.
 - `--help`: Display all configuration options.
 
-## Development status
+Transcript-to-gene mappings can also be generated from the command line:
 
-`pytximport` is still in development and has not yet reached version 1.0.0 in the [SemVer](https://semver.org/) versioning scheme. While it should work for almost all use cases and we regularly compare outputs against the R implementation, breaking changes between minor versions may occur. If you encounter any problems, please open a GitHub issue. If you are a Python developer, we welcome pull requests implementing missing features, adding more extensive unit tests and bug fixes.
+```bash
+pytximport create-map -i ./data/annotation.gtf -o tx2gene.csv -ow
+```
+
+Command options are:
+
+- `-i`: The path to an annotation file in GTF format.
+- `-o`: The output path to save the resulting transcript-to-gene mapping to.
+- `-ow`: Provide this flag to overwrite an existing file at the output path.
+- `--help`: Display all configuration options.
 
 ## Motivation
 
@@ -103,7 +113,7 @@ Features unique to `pytximport`:
 - `SummarizedExperiment`-support to represent outputs in familiar Bioconductor data structures available through the [BiocPy](https://github.com/biocpy) ecosystem.
 - Saving outputs directly to file (use the `output_path` argument).
 - Removing transcript versions from **both** the quantification files and the transcript-to-gene map when `ignore_transcript_version` is provided.
-- Post-hoc biotype-filtering. Set `biotype_filter` to a whitelist of possible biotypes contained within the bar-separated values of your transcript ids.
+- Post-hoc biotype-filtering using `pytximport.utils.filter_by_biotype`.
 
 Features unique to `tximport`:
 - Alevin single-cell RNA-seq data support
@@ -142,6 +152,10 @@ make coverage-report
 The documentation can be build locally by navigating to the `docs` folder and running: `make html`.
 This requires that the development requirements of the package as well as the package itself have been installed in the same virtual environment and that `pandoc` has been added, e.g. by running `brew install pandoc` on macOS operating systems.
 
+## Development status
+
+`pytximport` is still in development and has not yet reached version 1.0.0 in the [SemVer](https://semver.org/) versioning scheme. While it should work for almost all use cases and we regularly compare outputs against the R implementation, breaking changes between minor versions may occur. If you encounter any problems, please open a GitHub issue. If you are a Python developer, we welcome pull requests that implement missing features, add more extensive unit tests, or fix bugs.
+
 ## Data sources
 
 The quantification files used for the unit tests are partly adopted from [tximportData](https://doi.org/doi:10.18129/B9.bioc.tximportData) which in turn used a subsample of the GEUVADIS data:

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ license = { file = "LICENSE" }
 authors = [{ name = "Malte Kuehl", email = "[email protected]" }]
 readme = { file = "README.md", content-type = "text/markdown" }
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
     "Intended Audience :: Science/Research",
     "Intended Audience :: Healthcare Industry",
@@ -27,6 +27,7 @@ dynamic = ["version"]
 dependencies = [
     "anndata>=0.8.0",
     "click>=8.0.0,<9",
+    "click_default_group>=1.2.0,<2",
     "flox>=0.9.0,<0.10.0",
     "h5py>=3.0.0,<4",
     "numpy>=1.19.0,<3",

diff --git a/pytximport/_cli.py b/pytximport/_cli.py
@@ -1,14 +1,33 @@
 """Expose the tximport function as a command-line tool."""
 
-from logging import basicConfig
+from logging import basicConfig, log, warning
+from pathlib import Path
 
 import click
 import numpy as np
+from click_default_group import DefaultGroup
 
 from .core import tximport
+from .utils import create_transcript_gene_map_from_annotation
 
 
-@click.command()
+@click.group(
+    cls=DefaultGroup,
+    default="run",
+    default_if_no_args=True,
+    help="Welcome to the pytximport command-line interface for importing transcript-level quantification files.",
+)
+@click.pass_context
+def cli(  # type: ignore
+    ctx: click.Context,
+):
+    """Welcome to the pytximport command-line interface for importing transcript-level quantification files."""
+    pass
+
+
+@cli.command(
+    no_args_is_help=True,
+)
 @click.option(
     "-i",
     "--file_paths",
@@ -141,31 +160,95 @@
     is_flag=True,
     help="Whether the existence of the files is optional.",
 )
-def cli(  # type: ignore
+def run(  # type: ignore
     **kwargs,
 ) -> None:
-    """Call the tximport function via the command line.
-
-    You can view the available options by running `pytximport --help`.
-
-    .. code-block:: bash
-
-        pytximport --help
-
-    For detailed information on pytximport's functionality, please refer to the README and online documentation.
-
-    Args:
-        **kwargs: The keyword arguments to pass to the tximport function.
+    """Call the tximport function via the command line."""
+    basicConfig(level=25, format="%(asctime)s: %(message)s")
 
-    Returns:
-        None
-    """
     # Add return_data to the kwargs with a default value of False
     kwargs["return_data"] = False
     kwargs["output_type"] = "anndata"
     kwargs["inferential_replicate_transformer"] = lambda x: np.median(x, axis=1)
 
-    # Set the logging level
+    tximport(**kwargs)  # type: ignore
+
+
+@cli.command(
+    no_args_is_help=True,
+)
+@click.option(
+    "-i",
+    "--input_file",
+    "--input",
+    type=click.Path(exists=True),
+    help="The path to the annotation GTF file.",
+    required=True,
+)
+@click.option(
+    "-o",
+    "--output_file",
+    "--output",
+    type=click.Path(),
+    help="The output path to save the resulting transcript-to-gene mapping file to.",
+    required=True,
+)
+@click.option(
+    "-ow",
+    "--output_path_overwrite",
+    "--save-path-overwrite",
+    is_flag=True,
+    help="Provide this flag to overwrite an existing file at the output path.",
+)
+@click.option(
+    "--source-field",
+    "--source_field",
+    type=str,
+    help="The annotation field to use as the source in the mapping file.",
+    required=False,
+)
+@click.option(
+    "--target-field",
+    "--target_field",
+    type=str,
+    multiple=True,
+    help="The annotation field(s) to use as the target in the mapping file.",
+    required=False,
+)
+@click.option(
+    "--keep-biotype",
+    "--keep_biotype",
+    is_flag=True,
+    help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
+)
+def create_map(  # type: ignore
+    **kwargs,
+) -> None:
+    """Create a transcript-to-gene mapping file via the command line."""
     basicConfig(level=25, format="%(asctime)s: %(message)s")
+    log(25, "Creating a transcript-to-gene mapping file.")
 
-    tximport(**kwargs)  # type: ignore
+    if isinstance(kwargs["target_field"], tuple):
+        kwargs["target_field"] = list(kwargs["target_field"])
+
+    df = create_transcript_gene_map_from_annotation(
+        kwargs["input_file"],
+        source_field=kwargs["source_field"] if kwargs["source_field"] else "transcript_id",
+        target_field=kwargs["target_field"] if kwargs["target_field"] else "gene_id",
+        keep_biotype=kwargs["keep_biotype"],
+    )
+    log(25, "Created the transcript-to-gene mapping file. Saving the file...")
+
+    output_file = Path(kwargs["output_file"])
+    if not output_file.exists() or kwargs["output_path_overwrite"]:
+        df.to_csv(
+            kwargs["output_file"],
+            sep=("," if kwargs["output_file"].endswith(".csv") else "\t"),
+            index=False,
+        )
+        log(25, f"Saved the transcript-to-gene mapping file to {kwargs['output_file']}.")
+    else:
+        warning(
+            f"Could not save the transcript-to-gene mapping file. File already exists at {kwargs['output_file']}. "
+            "Use the `-ow` flag to overwrite."
+        )
diff --git a/pytximport/_version.py b/pytximport/_version.py
@@ -1,4 +1,4 @@
 """Version information for the pytximport package."""
 
 # This package will follow Semantic Versioning after version 1.0.0: https://semver.org/
-__version__ = "0.10.0"
+__version__ = "0.11.0"
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -113,7 +113,8 @@
         return_data (bool, optional): Whether to return the gene-level expression. Defaults to True.
         biotype_filter (List[str], optional): Filter the transcripts by biotype, including only those provided. Enables
             post-hoc filtering of the data based on the biotype of the transcripts. Assumes that the biotype is present
-            in the transcript_id of the data, bar-separated. Defaults to None.
+            in the transcript_id of the data, bar-separated. If this is not the case, please use the `filter_by_biotype`
+            function from the `pytximport.utils` module instead. Defaults to None.
 
     Returns:
         Union[xr.Dataset, ad.AnnData, SummarizedExperiment, None]: The estimated gene-level or transcript-level
@@ -408,7 +409,7 @@
 
     if biotype_filter is not None:
         transcript_data = filter_by_biotype(
-            transcript_data, biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
+            transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
         )
 
     # Remove appended gene names after underscore for RSEM data for both transcript and gene ids
@@ -515,6 +516,13 @@
             )
             output_format = "csv"
 
+        if output_path.suffix == ".h5ad" and output_format == "csv":
+            warning(
+                "The file extension of the `output_path` is `.h5ad` but the output format is `.csv`. "
+                "Changing the output format to `.h5ad`."
+            )
+            output_format = "h5ad"
+
         if output_format == "h5ad" and output_type != "anndata":
             warning(
                 "The output format is h5ad but the output type is not anndata. Changing the output type to anndata."
@@ -630,8 +638,6 @@
                     index=result.get_row_names(),
                     columns=result.get_column_names(),
                 )
-                df_gene_data.sort_index(inplace=True)
-                df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
             else:
                 if isinstance(result, ad.AnnData):
                     try:
@@ -649,8 +655,9 @@
                     index=(result[result_index] if output_type != "anndata" else result.var.index),
                     columns=(result.coords["file_path"].values if output_type != "anndata" else result.obs.index),
                 )
-                df_gene_data.sort_index(inplace=True)
-                df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
+
+            df_gene_data.sort_index(inplace=True)
+            df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
 
     # End the timer
     log(25, f"Finished the import in {time() - start_time:.2f} seconds.")