Skip to content

Commit

Permalink
Update vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
maltekuehl committed Jul 11, 2024
1 parent bb4f297 commit e5c535d
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 351 deletions.
25 changes: 13 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,18 @@ pip install pytximport

## Quick Start

You can either use it from the command line:
You can either import the `tximport` function in your Python files:

```python
from pytximport import tximport
results = tximport(
file_paths,
"salmon",
transcript_gene_mapping,
)
```

Or use it from the command line:

```bash
pytximport -i ./sample_1.sf -i ./sample_2.sf -t salmon -m ./tx2gene_map.tsv -o ./output_counts.csv
Expand All @@ -50,17 +61,7 @@ Common options are:
- `-counts`: The column name containing the transcript counts, in case it differs from the typical naming standards for the configured input file type.
- `-length`: The column name containing the transcript lengths, in case it differs from the typical naming standards for the configured input file type.
- `-tpm`: The column name containing the transcript abundance, in case it differs from the typical naming standards for the configured input file type.

Or import the `tximport` function in your Python files:

```python
from pytximport import tximport
results = tximport(
file_paths,
"salmon",
transcript_gene_mapping,
)
```
- `--help`: Display all configuration options.

## Citation

Expand Down
421 changes: 109 additions & 312 deletions docs/source/example.ipynb

Large diffs are not rendered by default.

25 changes: 13 additions & 12 deletions docs/source/start.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,18 @@ pip install pytximport

## Quick Start

You can either use it from the command line:
You can either import the `tximport` function in your Python files:

```python
from pytximport import tximport
results = tximport(
file_paths,
"salmon",
transcript_gene_mapping,
)
```

Or use it from the command line:

```bash
pytximport -i ./sample_1.sf -i ./sample_2.sf -t salmon -m ./tx2gene_map.tsv -o ./output_counts.csv
Expand All @@ -37,17 +48,7 @@ Common options are:
- `-counts`: The column name containing the transcript counts, in case it differs from the typical naming standards for the configured input file type.
- `-length`: The column name containing the transcript lengths, in case it differs from the typical naming standards for the configured input file type.
- `-tpm`: The column name containing the transcript abundance, in case it differs from the typical naming standards for the configured input file type.

Or import the `tximport` function in your Python files:

```python
from pytximport import tximport
results = tximport(
file_paths,
"salmon",
transcript_gene_mapping,
)
```
- `--help`: Display all configuration options.

## Motivation

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"h5py",
"numpy",
"pandas",
"pybiomart",
"tqdm",
"xarray",
]
Expand Down
8 changes: 8 additions & 0 deletions pytximport/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@
help="The path to save the gene-level expression.",
required=True,
)
@click.option(
"-ow",
"--save_path_override",
"--save-path-override",
is_flag=True,
help="Whether to override the save path.",
)
@click.option(
"--ignore_after_bar",
"--ignore-after-bar",
Expand Down Expand Up @@ -111,6 +118,7 @@
)
@click.option(
"--existence_optional",
"--existence-optional",
is_flag=True,
help="Whether the existence of the files is optional.",
)
Expand Down
28 changes: 14 additions & 14 deletions pytximport/utils/_create_transcript_to_gene_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,31 @@
def create_transcript_to_gene_map(
species: Literal["human", "mouse"] = "human",
host: str = "http://www.ensembl.org",
gene_id_identifier: Literal["external_gene_name", "ensembl_gene_id"] = "external_gene_name",
field: Literal["ensembl_gene_id", "external_gene_name", "external_transcript_name"] = "ensembl_gene_id",
) -> pd.DataFrame:
"""Create a mapping from transcript ids to gene ids using the Ensembl Biomart.
Args:
species (Literal["human", "mouse"], optional): The species to use. Defaults to "human".
host (str, optional): The host to use. Defaults to "http://www.ensembl.org".
gene_id_identifier (Literal["external_gene_name", "gene_id"], optional): The identifier to use for the gene id.
Defaults to "external_gene_name".
field (Literal["ensembl_gene_id", "external_gene_name", "external_transcript_name"], optional): The
identifier to get for each transcript id. Defaults to "ensembl_gene_id".
Returns:
pd.DataFrame: The mapping from transcript ids to gene ids.
"""
try:
from pybiomart import Dataset
except ImportError:
raise ImportError("Please install the pybiomart package first.")
from pybiomart import Dataset

if species == "human":
dataset = Dataset(name="hsapiens_gene_ensembl", host=host)
elif species == "mouse":
dataset = Dataset(name="mmusculus_gene_ensembl", host=host)

transcript_gene_map = dataset.query(attributes=["ensembl_transcript_id", gene_id_identifier])
transcript_gene_map.columns = ["transcript_id", "gene_id"]
transcript_gene_map = dataset.query(attributes=["ensembl_transcript_id", field])
transcript_gene_map.columns = [
"transcript_id",
("gene_id" if field != "external_transcript_name" else "transcript_name"),
]

transcript_gene_map.dropna(inplace=True)
transcript_gene_map.drop_duplicates(inplace=True)
Expand All @@ -44,18 +44,18 @@ def create_transcript_to_gene_map(

def create_transcript_to_gene_map_from_gtf_annotation(
file_path: Union[str, Path],
gene_id_identifier: Literal["gene_id", "gene_name"] = "gene_id",
field: Literal["gene_id", "gene_name"] = "gene_id",
chunk_size: int = 100000,
keep_gene_name: bool = True,
) -> pd.DataFrame:
"""Create a mapping from transcript ids to gene ids using a GTF annotation file.
Args:
file_path (Union[str, Path]): The path to the GTF annotation file.
gene_id_identifier (Literal["gene_id", "gene_name"], optional): The identifier to use for the gene id.
field (Literal["gene_id", "gene_name"], optional): The identifier to get for each transcript id.
Defaults to "gene_id".
chunk_size (int, optional): The number of lines to read at a time. Defaults to 100000.
keep_gene_name (bool, optional): Whether to keep the gene_name column when gene_id_identifier is "gene_id".
keep_gene_name (bool, optional): Whether to keep the gene_name column when field is "gene_id".
Defaults to True.
Returns:
Expand Down Expand Up @@ -98,9 +98,9 @@ def create_transcript_to_gene_map_from_gtf_annotation(
transcript_gene_map["gene_name"],
)

if gene_id_identifier == "gene_name":
if field == "gene_name":
transcript_gene_map.drop("gene_id", axis=1, inplace=True).rename(columns={"gene_name": "gene_id"}, inplace=True)
elif gene_id_identifier == "gene_id" and not keep_gene_name:
elif field == "gene_id" and not keep_gene_name:
transcript_gene_map.drop("gene_name", axis=1, inplace=True)

transcript_gene_map.replace("", np.nan, inplace=True)
Expand Down
3 changes: 2 additions & 1 deletion test/test_transcriptome_to_gene_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_transcript_to_gene_map() -> None:
df_transcript_to_gene = create_transcript_to_gene_map(
species="human",
host="http://www.ensembl.org",
gene_id_identifier="external_gene_name",
field="external_gene_name",
)

assert isinstance(df_transcript_to_gene, pd.DataFrame), "The output is not a DataFrame."
Expand All @@ -33,6 +33,7 @@ def test_transcript_to_gene_map_from_gtf_annotation(
for keep_gene_name in [True, False]:
df_transcript_to_gene = create_transcript_to_gene_map_from_gtf_annotation(
gtf_annotation_file,
field="gene_id",
keep_gene_name=keep_gene_name,
)

Expand Down

0 comments on commit e5c535d

Please sign in to comment.