Update vignette

complextissue · Jul 11, 2024 · e5c535d · e5c535d
1 parent bb4f297
commit e5c535d
Show file tree

Hide file tree

Showing 7 changed files with 160 additions and 351 deletions.
diff --git a/README.md b/README.md
@@ -33,7 +33,18 @@ pip install pytximport
 
 ## Quick Start
 
-You can either use it from the command line:
+You can either import the `tximport` function in your Python files:
+
+```python
+from pytximport import tximport
+results = tximport(
+    file_paths,
+    "salmon",
+    transcript_gene_mapping,
+)
+```
+
+Or use it from the command line:
 
 ```bash
 pytximport -i ./sample_1.sf -i ./sample_2.sf -t salmon -m ./tx2gene_map.tsv -o ./output_counts.csv
@@ -50,17 +61,7 @@ Common options are:
 - `-counts`: The column name containing the transcript counts, in case it differs from the typical naming standards for the configured input file type.
 - `-length`: The column name containing the transcript lengths, in case it differs from the typical naming standards for the configured input file type.
 - `-tpm`: The column name containing the transcript abundance, in case it differs from the typical naming standards for the configured input file type.
-
-Or import the `tximport` function in your Python files:
-
-```python
-from pytximport import tximport
-results = tximport(
-    file_paths,
-    "salmon",
-    transcript_gene_mapping,
-)
-```
+- `--help`: Display all configuration options.
 
 ## Citation
 

diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
diff --git a/docs/source/start.md b/docs/source/start.md
@@ -20,7 +20,18 @@ pip install pytximport
 
 ## Quick Start
 
-You can either use it from the command line:
+You can either import the `tximport` function in your Python files:
+
+```python
+from pytximport import tximport
+results = tximport(
+    file_paths,
+    "salmon",
+    transcript_gene_mapping,
+)
+```
+
+Or use it from the command line:
 
 ```bash
 pytximport -i ./sample_1.sf -i ./sample_2.sf -t salmon -m ./tx2gene_map.tsv -o ./output_counts.csv
@@ -37,17 +48,7 @@ Common options are:
 - `-counts`: The column name containing the transcript counts, in case it differs from the typical naming standards for the configured input file type.
 - `-length`: The column name containing the transcript lengths, in case it differs from the typical naming standards for the configured input file type.
 - `-tpm`: The column name containing the transcript abundance, in case it differs from the typical naming standards for the configured input file type.
-
-Or import the `tximport` function in your Python files:
-
-```python
-from pytximport import tximport
-results = tximport(
-    file_paths,
-    "salmon",
-    transcript_gene_mapping,
-)
-```
+- `--help`: Display all configuration options.
 
 ## Motivation
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
     "h5py",
     "numpy",
     "pandas",
+    "pybiomart",
     "tqdm",
     "xarray",
 ]

diff --git a/pytximport/_cli.py b/pytximport/_cli.py
@@ -47,6 +47,13 @@
     help="The path to save the gene-level expression.",
     required=True,
 )
+@click.option(
+    "-ow",
+    "--save_path_override",
+    "--save-path-override",
+    is_flag=True,
+    help="Whether to override the save path.",
+)
 @click.option(
     "--ignore_after_bar",
     "--ignore-after-bar",
@@ -111,6 +118,7 @@
 )
 @click.option(
     "--existence_optional",
+    "--existence-optional",
     is_flag=True,
     help="Whether the existence of the files is optional.",
 )

diff --git a/pytximport/utils/_create_transcript_to_gene_map.py b/pytximport/utils/_create_transcript_to_gene_map.py
@@ -9,31 +9,31 @@
 def create_transcript_to_gene_map(
     species: Literal["human", "mouse"] = "human",
     host: str = "http://www.ensembl.org",
-    gene_id_identifier: Literal["external_gene_name", "ensembl_gene_id"] = "external_gene_name",
+    field: Literal["ensembl_gene_id", "external_gene_name", "external_transcript_name"] = "ensembl_gene_id",
 ) -> pd.DataFrame:
     """Create a mapping from transcript ids to gene ids using the Ensembl Biomart.
 
     Args:
         species (Literal["human", "mouse"], optional): The species to use. Defaults to "human".
         host (str, optional): The host to use. Defaults to "http://www.ensembl.org".
-        gene_id_identifier (Literal["external_gene_name", "gene_id"], optional): The identifier to use for the gene id.
-            Defaults to "external_gene_name".
+        field (Literal["ensembl_gene_id", "external_gene_name", "external_transcript_name"], optional): The
+            identifier to get for each transcript id. Defaults to "ensembl_gene_id".
 
     Returns:
         pd.DataFrame: The mapping from transcript ids to gene ids.
     """
-    try:
-        from pybiomart import Dataset
-    except ImportError:
-        raise ImportError("Please install the pybiomart package first.")
+    from pybiomart import Dataset
 
     if species == "human":
         dataset = Dataset(name="hsapiens_gene_ensembl", host=host)
     elif species == "mouse":
         dataset = Dataset(name="mmusculus_gene_ensembl", host=host)
 
-    transcript_gene_map = dataset.query(attributes=["ensembl_transcript_id", gene_id_identifier])
-    transcript_gene_map.columns = ["transcript_id", "gene_id"]
+    transcript_gene_map = dataset.query(attributes=["ensembl_transcript_id", field])
+    transcript_gene_map.columns = [
+        "transcript_id",
+        ("gene_id" if field != "external_transcript_name" else "transcript_name"),
+    ]
 
     transcript_gene_map.dropna(inplace=True)
     transcript_gene_map.drop_duplicates(inplace=True)
@@ -44,18 +44,18 @@ def create_transcript_to_gene_map(
 
 def create_transcript_to_gene_map_from_gtf_annotation(
     file_path: Union[str, Path],
-    gene_id_identifier: Literal["gene_id", "gene_name"] = "gene_id",
+    field: Literal["gene_id", "gene_name"] = "gene_id",
     chunk_size: int = 100000,
     keep_gene_name: bool = True,
 ) -> pd.DataFrame:
     """Create a mapping from transcript ids to gene ids using a GTF annotation file.
 
     Args:
         file_path (Union[str, Path]): The path to the GTF annotation file.
-        gene_id_identifier (Literal["gene_id", "gene_name"], optional): The identifier to use for the gene id.
+        field (Literal["gene_id", "gene_name"], optional): The identifier to get for each transcript id.
             Defaults to "gene_id".
         chunk_size (int, optional): The number of lines to read at a time. Defaults to 100000.
-        keep_gene_name (bool, optional): Whether to keep the gene_name column when gene_id_identifier is "gene_id".
+        keep_gene_name (bool, optional): Whether to keep the gene_name column when field is "gene_id".
             Defaults to True.
 
     Returns:
@@ -98,9 +98,9 @@ def create_transcript_to_gene_map_from_gtf_annotation(
         transcript_gene_map["gene_name"],
     )
 
-    if gene_id_identifier == "gene_name":
+    if field == "gene_name":
         transcript_gene_map.drop("gene_id", axis=1, inplace=True).rename(columns={"gene_name": "gene_id"}, inplace=True)
-    elif gene_id_identifier == "gene_id" and not keep_gene_name:
+    elif field == "gene_id" and not keep_gene_name:
         transcript_gene_map.drop("gene_name", axis=1, inplace=True)
 
     transcript_gene_map.replace("", np.nan, inplace=True)

diff --git a/test/test_transcriptome_to_gene_map.py b/test/test_transcriptome_to_gene_map.py
@@ -17,7 +17,7 @@ def test_transcript_to_gene_map() -> None:
     df_transcript_to_gene = create_transcript_to_gene_map(
         species="human",
         host="http://www.ensembl.org",
-        gene_id_identifier="external_gene_name",
+        field="external_gene_name",
     )
 
     assert isinstance(df_transcript_to_gene, pd.DataFrame), "The output is not a DataFrame."
@@ -33,6 +33,7 @@ def test_transcript_to_gene_map_from_gtf_annotation(
     for keep_gene_name in [True, False]:
         df_transcript_to_gene = create_transcript_to_gene_map_from_gtf_annotation(
             gtf_annotation_file,
+            field="gene_id",
             keep_gene_name=keep_gene_name,
         )