diff --git a/.gitignore b/.gitignore index 4ddb12b..44f5741 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ ipython_config.py # Environments .env .venv +.python-version env/ venv/ ENV/ @@ -69,6 +70,7 @@ requirements.dev.txt manuscript.pdf /data/rpgn_example/ /rpgn/ +/benchmark/ # All .DS_Store files **/.DS_Store diff --git a/.python-version b/.python-version deleted file mode 100644 index 2c07333..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.11 diff --git a/pyproject.toml b/pyproject.toml index 61c8f11..6e94180 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,6 @@ dev = [ "pandas-stubs", "pandoc", "pre-commit", - "pybiomart", "pytest", "sphinx", "sphinx-autoapi", @@ -66,7 +65,6 @@ dev = [ "sphinx-copybutton", "sphinx-design", "sphinx-rtd-theme", - "virtualenv", ] [project.scripts] diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py index 1b0e807..8436e9d 100644 --- a/pytximport/core/_tximport.py +++ b/pytximport/core/_tximport.py @@ -141,10 +141,14 @@ def tximport( raise FileNotFoundError(f"The transcript to gene mapping does not exist: {transcript_gene_map}") try: - if transcript_gene_map.suffix == ".csv": - transcript_gene_map = pd.read_csv(transcript_gene_map, header=0, engine="pyarrow") - else: - transcript_gene_map = pd.read_table(transcript_gene_map, header=0, engine="pyarrow") + transcript_gene_map = pd.read_csv( + transcript_gene_map, + header=0, + engine="pyarrow", + sep=("," if transcript_gene_map.suffix == ".csv" else "\t"), + usecols=["transcript_id", "gene_id"], + dtype={"transcript_id": str, "gene_id": str}, + ) except Exception as exception: raise ValueError(f"Could not read the transcript to gene mapping: {exception}") @@ -445,6 +449,7 @@ def tximport( if transcript_gene_map is None: raise ValueError("A transcript to gene mapping must be provided for `dtu_scaled_tpm`.") + log(25, "Calculating median gene length over isoforms.") transcript_data = get_median_length_over_isoform( transcript_data, transcript_gene_map, diff --git a/pytximport/importers/_read_kallisto.py b/pytximport/importers/_read_kallisto.py index 4f90b53..97574f1 100644 --- a/pytximport/importers/_read_kallisto.py +++ b/pytximport/importers/_read_kallisto.py @@ -3,7 +3,6 @@ from typing import Optional, Union import numpy as np -import pandas as pd from h5py import File from ..definitions import InferentialReplicates, TranscriptData diff --git a/pytximport/importers/_read_tsv.py b/pytximport/importers/_read_tsv.py index b5c9c41..b0e6968 100644 --- a/pytximport/importers/_read_tsv.py +++ b/pytximport/importers/_read_tsv.py @@ -84,25 +84,23 @@ def read_tsv( if not file_path.exists(): raise ImportError(f"The file does not exist: {file_path}") - # Read the quantification file as a tsv, tab separated and the first line is the column names - if file_path.suffix == ".gz": - transcript_dataframe = pd.read_table(file_path, header=0, compression="gzip", sep="\t") - else: - usecols = [id_column, counts_column, length_column] - dtype = {id_column: str, counts_column: np.float64, length_column: np.float64} - - if abundance_column is not None: - usecols.append(abundance_column) - dtype[abundance_column] = np.float64 - - transcript_dataframe = pd.read_table( - file_path, - header=0, - sep="\t", - engine="pyarrow", - usecols=usecols, - dtype=dtype, - ) + # Read the quantification file as a tsv, tab separated with the first line being the column names + usecols = [id_column, counts_column, length_column] + dtype = {id_column: str, counts_column: np.float64, length_column: np.float64} + + if abundance_column is not None: + usecols.append(abundance_column) + dtype[abundance_column] = np.float64 + + transcript_dataframe = pd.read_table( + file_path, + header=0, + sep="\t", + compression=("gzip" if file_path.suffix == ".gz" else None), + engine="pyarrow", + usecols=usecols, + dtype=dtype, + ) return parse_dataframe( transcript_dataframe, diff --git a/pytximport/utils/_convert_abundance_to_counts.py b/pytximport/utils/_convert_abundance_to_counts.py index 5339a89..3390319 100644 --- a/pytximport/utils/_convert_abundance_to_counts.py +++ b/pytximport/utils/_convert_abundance_to_counts.py @@ -33,9 +33,6 @@ def convert_abundance_to_counts( raise ValueError("The count transform must be 'scaled_tpm' or 'length_scaled_tpm'.") # Scale the counts from abundance to the original sequencing depth of each sample - column_counts = counts.sum(axis=0) - new_counts = counts_transformed.sum(axis=0) - ratio = column_counts / new_counts - counts_transformed = (counts_transformed.T * ratio).T + counts_transformed = (counts_transformed.T * (counts.sum(axis=0) / counts_transformed.sum(axis=0))).T return counts_transformed diff --git a/pytximport/utils/_convert_counts_to_tpm.py b/pytximport/utils/_convert_counts_to_tpm.py index 9cde9e7..25b963d 100644 --- a/pytximport/utils/_convert_counts_to_tpm.py +++ b/pytximport/utils/_convert_counts_to_tpm.py @@ -14,5 +14,4 @@ def convert_counts_to_tpm( Returns: np.ndarray: The transcript-level expression data with the TPM. """ - normalization_factor = 1e6 / np.sum(counts / length) - return np.array(counts * normalization_factor / length) + return np.array(counts * (1e6 / np.sum(counts / length)) / length) diff --git a/pytximport/utils/_convert_transcripts_to_genes.py b/pytximport/utils/_convert_transcripts_to_genes.py index 925bf27..e54a5d8 100644 --- a/pytximport/utils/_convert_transcripts_to_genes.py +++ b/pytximport/utils/_convert_transcripts_to_genes.py @@ -35,7 +35,6 @@ def convert_transcripts_to_genes( xr.Dataset: The gene-level expression data from multiple samples. """ transcript_ids: Union[np.ndarray, List[str]] = transcript_data.coords["transcript_id"].values - transcript_ids = transcript_data.coords["transcript_id"].values if ignore_after_bar: # Ignore the part of the transcript ID after the bar @@ -72,19 +71,21 @@ def convert_transcripts_to_genes( ) # Remove the missing transcripts by only keeping the data for the transcripts present in the mapping transcript_ids_intersect = list(set(unique_transcripts).intersection(set(transcript_gene_map["transcript_id"]))) - transcript_ids_intersect_boolean = np.isin(transcript_ids, transcript_ids_intersect) transcript_data = transcript_data.isel( - transcript_id=transcript_ids_intersect_boolean, + transcript_id=np.isin(transcript_ids, transcript_ids_intersect), drop=True, ) - transcript_ids = transcript_data.coords["transcript_id"].values + # transcript_ids = transcript_data.coords["transcript_id"].values transcript_gene_map = transcript_gene_map[transcript_gene_map["transcript_id"].isin(transcript_ids_intersect)] # Add the corresponding gene to the transcript-level expression log(25, "Matching gene_ids.") - transcript_gene_dict = transcript_gene_map.set_index("transcript_id")["gene_id"].to_dict() - gene_ids_raw = transcript_data["transcript_id"].to_series().map(transcript_gene_dict).values - # gene_ids = np.repeat(gene_ids_raw, transcript_data["abundance"].shape[1]) # type: ignore + gene_ids_raw = ( + transcript_data["transcript_id"] + .to_series() + .map(transcript_gene_map.set_index("transcript_id")["gene_id"]) + .values + ) # Remove the transcript_id coordinate and rename the variable to gene_id transcript_data = ( diff --git a/pytximport/utils/_create_transcript_gene_map.py b/pytximport/utils/_create_transcript_gene_map.py index 97fa8bd..b2ad96d 100644 --- a/pytximport/utils/_create_transcript_gene_map.py +++ b/pytximport/utils/_create_transcript_gene_map.py @@ -101,7 +101,7 @@ def create_transcript_gene_map_from_annotation( # Each attribute line looks like this: # gene_id ""; transcript_id ""; gene_name ""; gene_source ""; gene_biotype ""; # transcript_name ""; transcript_source ""; - # We are only interested in the gene_id, gene_name, and transcript_id + # We are only interested in the gene_id, gene_name, transcript_id, transcript_name and gene_biotype attribute_columns = [ "transcript_id", "transcript_name", diff --git a/pytximport/utils/_get_median_length_over_isoform.py b/pytximport/utils/_get_median_length_over_isoform.py index 12f725a..b063dd9 100644 --- a/pytximport/utils/_get_median_length_over_isoform.py +++ b/pytximport/utils/_get_median_length_over_isoform.py @@ -22,31 +22,35 @@ def get_median_length_over_isoform( assert "length" in transcript_data.data_vars, "The transcript data does not contain a `length` variable." # Get the gene ids for each transcript - transcript_gene_dict = transcript_gene_map.set_index("transcript_id")["gene_id"].to_dict() - gene_ids = transcript_data["transcript_id"].to_series().map(transcript_gene_dict).values + gene_ids = ( + transcript_data["transcript_id"] + .to_series() + .map(transcript_gene_map.set_index("transcript_id")["gene_id"].to_dict()) + .values + ) # Check that no gene ids is nan assert not any(pd.isna(gene_ids)), "Not all transcript ids could be mapped to gene ids. Please check the mapping." - transcript_data_copy = transcript_data.drop_vars("transcript_id") - transcript_data_copy = transcript_data_copy.assign_coords(gene_id=gene_ids) - transcript_data_copy = transcript_data_copy.rename({"transcript_id": "gene_id"}) - # Get the row mean across samples for each transcript - average_transcript_length_across_samples = transcript_data_copy["length"].mean(axis=1) - median_gene_length = average_transcript_length_across_samples.groupby("gene_id").median().to_dataframe() - - transcript_median_gene_length = [median_gene_length.loc[gene_id, "length"] for gene_id in gene_ids] - transcript_median_gene_length_repeated = np.reshape( - np.repeat( - transcript_median_gene_length, - transcript_data["abundance"].shape[1], - ), - transcript_data["abundance"].shape, + median_gene_length = ( + transcript_data.drop_vars("transcript_id") + .assign_coords(gene_id=gene_ids) + .rename({"transcript_id": "gene_id"})["length"] + .mean(dim="file") + .groupby("gene_id") + .median() + .to_dataframe() ) transcript_data["median_isoform_length"] = xr.DataArray( - transcript_median_gene_length_repeated, + np.reshape( + np.repeat( + pd.Series(gene_ids).map(median_gene_length["length"]).to_numpy(), + transcript_data["abundance"].shape[1], + ), + transcript_data["abundance"].shape, + ), dims=("transcript_id", "file"), ) diff --git a/pytximport/utils/_replace_transcript_ids_with_names.py b/pytximport/utils/_replace_transcript_ids_with_names.py index c34756a..2215f77 100644 --- a/pytximport/utils/_replace_transcript_ids_with_names.py +++ b/pytximport/utils/_replace_transcript_ids_with_names.py @@ -24,7 +24,13 @@ def replace_transcript_ids_with_names( """ # Read the transcript to gene mapping if isinstance(transcript_name_map, str) or isinstance(transcript_name_map, Path): - transcript_name_map = pd.read_table(transcript_name_map, header=0) + transcript_name_map = pd.read_table( + transcript_name_map, + header=0, + engine="c", + usecols=["transcript_id", "transcript_name"], + dtype=str, + ) transcript_name_map = transcript_name_map.drop_duplicates() # Check that transcript_id and transcript_name are present in the mapping