Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Renaming #54

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
filters.GeneBioTypeFilter
filters.GeneNameFilter
filters.SeqNameFilter
filters.GeneRangesFilter
filters.GeneRangeFilter
filters.TxIDFilter
filters.TxBioTypeFilter
filters.ExonIDFilter
Expand Down
10 changes: 6 additions & 4 deletions docs/notebooks/basic_usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@
}
],
"source": [
"ensdb.genes(filter=gf.filters.GeneRangesFilter(\"1:10000-20000\"))"
"ensdb.genes(filter=gf.filters.GeneRangeFilter(\"1:10000-20000\"))"
]
},
{
Expand Down Expand Up @@ -855,7 +855,7 @@
"source": [
"ensdb.genes(\n",
" filter=gf.filters.GeneBioTypeFilter(\"lncRNA\")\n",
" & gf.filters.GeneRangesFilter(\"1:10000-20000\")\n",
" & gf.filters.GeneRangeFilter(\"1:10000-20000\")\n",
")"
]
},
Expand All @@ -870,7 +870,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Using the `cols` argument, you can get annotations from other tables in the database."
"Using the `columns` argument, you can get annotations from other tables in the database."
]
},
{
Expand Down Expand Up @@ -966,7 +966,9 @@
}
],
"source": [
"ensdb.genes(cols=[\"gene_id\", \"tx_id\", \"gene_name\", \"protein_id\", \"uniprot_id\"]).head()"
"ensdb.genes(\n",
" columns=[\"gene_id\", \"tx_id\", \"gene_name\", \"protein_id\", \"uniprot_id\"]\n",
").head()"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions src/genomic_features/_core/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def columns(self) -> set[str]:
return {"gene_name"}


class GeneRangesFilter(AbstractFilterRangeExpr):
class GeneRangeFilter(AbstractFilterRangeExpr):
"""
Filter features within a genomic range

Expand Down Expand Up @@ -251,7 +251,7 @@ class CanonicalTxFilter(AbstractFilterExpr):

>>> ensdb.transcripts(filter=gf.filters.CanonicalTxFilter())
>>> ensdb.exons(
... cols=["tx_id", "exon_id", "seq_name", "exon_seq_start", "exon_seq_end"],
... columns=["tx_id", "exon_id", "seq_name", "exon_seq_start", "exon_seq_end"],
... filter=gf.filters.CanonicalTxFilter(),
... )
"""
Expand Down
100 changes: 51 additions & 49 deletions src/genomic_features/ensembl/ensembldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,88 +144,90 @@ def __repr__(self) -> str:

def genes(
self,
cols: list[str] | None = None,
columns: list[str] | None = None,
filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
join_type: Literal["inner", "left"] = "inner",
) -> DataFrame:
"""Get gene annotations.

Parameters
----------
cols
columns
Which columns to retrieve from the database. Can be from other tables.
Returns all gene columns if None.
filter
Filters to apply to the query.
join_type
How to perform joins during the query (if cols or filters requires them).
How to perform joins during the query (if the requested columns or the filter require them).


Usage
-----
>>> ensdb.genes(cols=["gene_id", "gene_name", "tx_id"])
>>> ensdb.genes(columns=["gene_id", "gene_name", "tx_id"])
"""
table: Final = "gene"
if cols is None:
if columns is None:
# TODO: check why R adds entrezid
cols = self.list_columns(table) # get all columns
columns = self.list_columns(table) # get all columns

cols = cols.copy()
if "gene_id" not in cols: # genes always needs gene_id
cols.append("gene_id")
columns = columns.copy()
if "gene_id" not in columns: # genes always needs gene_id
columns.append("gene_id")

query = self._build_query(table, cols, filter, join_type)
query = self._build_query(table, columns, filter, join_type)
return self._execute_query(query)

def transcripts(
self,
cols: list[str] | None = None,
columns: list[str] | None = None,
filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
join_type: Literal["inner", "left"] = "inner",
) -> DataFrame:
"""Get transcript annotations.

Parameters
----------
cols
columns
Which columns to retrieve from the database. Can be from other tables.
Returns all transcript columns if None.
filter
Filters to apply to the query.
join_type
How to perform joins during the query (if cols or filters requires them).
How to perform joins during the query (if the requested columns or the filter require them).


Usage
-----
>>> ensdb.transcripts(cols=["tx_id", "tx_name", "gene_id"])
>>> ensdb.transcripts(columns=["tx_id", "tx_name", "gene_id"])
"""
table: Final = "tx"
if cols is None:
cols = self.list_columns(table) # get all columns
if columns is None:
columns = self.list_columns(table) # get all columns

cols = cols.copy()
columns = columns.copy()
# Require primary key in output
if "tx_id" not in cols:
cols.append("tx_id")
if "tx_id" not in columns:
columns.append("tx_id")
# seq_name is required for genomic range operations
if ("tx_seq_start" in cols or "tx_seq_end" in cols) and "seq_name" not in cols:
cols.append("seq_name")
if (
"tx_seq_start" in columns or "tx_seq_end" in columns
) and "seq_name" not in columns:
columns.append("seq_name")

query = self._build_query(table, cols, filter, join_type)
query = self._build_query(table, columns, filter, join_type)
return self._execute_query(query)

def exons(
self,
cols: list[str] | None = None,
columns: list[str] | None = None,
filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
join_type: Literal["inner", "left"] = "inner",
) -> DataFrame:
"""Get exons table.

Parameters
----------
cols
columns
Which columns to retrieve from the database. Can be from other tables.
Returns all exon columns if None.
filter
Expand All @@ -239,20 +241,20 @@ def exons(
>>> ensdb.exons()
"""
table: Final = "exon"
if cols is None:
cols = self.list_columns(table) # get all columns
if columns is None:
columns = self.list_columns(table) # get all columns

cols = cols.copy()
columns = columns.copy()
# Require primary key in output
if "exon_id" not in cols:
cols.append("exon_id")
if "exon_id" not in columns:
columns.append("exon_id")
# seq_name is required for genomic range operations
if (
"exon_seq_start" in cols or "exon_seq_end" in cols
) and "seq_name" not in cols:
cols.append("seq_name")
"exon_seq_start" in columns or "exon_seq_end" in columns
) and "seq_name" not in columns:
columns.append("seq_name")

query = self._build_query(table, cols, filter, join_type)
query = self._build_query(table, columns, filter, join_type)
return self._execute_query(query)

def _execute_query(self, query: IbisTable) -> DataFrame:
Expand All @@ -272,19 +274,19 @@ def chromosomes(self) -> DataFrame:
def _build_query(
self,
table: Literal["gene", "tx", "exon"],
cols: list[str],
columns: list[str],
filter: _filters.AbstractFilterExpr,
join_type: Literal["inner", "left"] = "inner",
) -> IbisTable:
"""Build a query for the genomic features table."""
# Finalize cols
self._clean_columns(cols)
# Finalize columns
self._clean_columns(columns)
for col in filter.columns():
if col not in cols:
cols.append(col)
if col not in columns:
columns.append(col)

# check if join is required
tables = self._get_required_tables(self._tables_for_columns(cols))
tables = self._get_required_tables(self._tables_for_columns(columns))

# Basically just to make sure exons stay in the query
if table not in tables:
Expand All @@ -295,7 +297,7 @@ def _build_query(
else:
query = self.db.table(table)
# add filter
query = query.filter(filter.convert()).select(cols)
query = query.filter(filter.convert()).select(columns)
return query

def _join_query(
Expand Down Expand Up @@ -438,26 +440,26 @@ def _clean_columns(self, columns: list[str]) -> list[str]:
columns = [columns]

valid_columns = set(self.list_columns())
cols = list(filter(lambda c: c in valid_columns, columns))
output_columns = list(filter(lambda c: c in valid_columns, columns))
invalid_columns = set(columns) - valid_columns
if invalid_columns:
raise ValueError(
f"The following columns are not found in any database: {invalid_columns}"
)
if not cols:
if not output_columns:
raise ValueError("No valid columns were found.")
return cols
return output_columns

def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list:
def _tables_for_columns(self, columns: list, start_with: str | None = None) -> list:
"""
Return a list of tables that contain the specified columns.

Parameters
----------
cols
columns
Columns that we're looking for.
"""
cols = self._clean_columns(cols)
columns = self._clean_columns(columns)
table_list = self._tables_by_degree() # list of table names

# remove start_with from table_list and add it to the beginning of the list
Expand All @@ -472,14 +474,14 @@ def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list
tables = []
for t in table_list:
# check if all columns are in one table
if set(cols).issubset(self.db.table(t).columns):
if set(columns).issubset(self.db.table(t).columns):
tables.append(t)
return tables
else:
# check if a single column is in the table
for c in cols.copy():
for c in columns.copy():
if c in self.db.table(t).columns:
if t not in tables:
tables.append(t)
cols.remove(c) # remove column from list
columns.remove(c) # remove column from list
return tables
4 changes: 2 additions & 2 deletions src/genomic_features/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
GeneBioTypeFilter,
GeneIDFilter,
GeneNameFilter,
GeneRangesFilter,
GeneRangeFilter,
SeqNameFilter,
TxBioTypeFilter,
TxIDFilter,
Expand All @@ -18,7 +18,7 @@
"CanonicalTxFilter",
"GeneIDFilter",
"GeneBioTypeFilter",
"GeneRangesFilter",
"GeneRangeFilter",
"EmptyFilter",
"ExonIDFilter",
"GeneNameFilter",
Expand Down
4 changes: 3 additions & 1 deletion tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def test_repr():

def test_invalid_join():
with pytest.raises(ValueError, match=r"Invalid join type: flarb"):
gf.ensembl.annotation("Hsapiens", 108).genes(cols=["tx_id"], join_type="flarb")
gf.ensembl.annotation("Hsapiens", 108).genes(
columns=["tx_id"], join_type="flarb"
)


def test_exons():
Expand Down
15 changes: 8 additions & 7 deletions tests/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def test_tables_by_degree(hsapiens108):
]
result = hsapiens108._tables_by_degree(tab=["protein", "exon"])
assert result == ["exon", "protein"]
result = hsapiens108._tables_by_degree(tab=["protein", "invalid_table"])
with pytest.warns(UserWarning, match="not in the database: invalid_table"):
result = hsapiens108._tables_by_degree(tab=["protein", "invalid_table"])
assert result == ["protein"]


Expand Down Expand Up @@ -60,7 +61,7 @@ def test_required_tables(hsapiens108):

# Test simple subsetting to columns in one table gene
def test_simple_subsetting(hsapiens108):
result = hsapiens108.genes(cols=["gene_id", "gene_name"])
result = hsapiens108.genes(columns=["gene_id", "gene_name"])
assert result.shape == (70616, 2)
assert result.columns.tolist() == ["gene_id", "gene_name"]

Expand All @@ -69,15 +70,15 @@ def test_simple_subsetting(hsapiens108):
def test_multiple_table_subsetting(hsapiens108):
# table genes and transcripts
result = hsapiens108.genes(
cols=["gene_id", "gene_name", "tx_id"],
columns=["gene_id", "gene_name", "tx_id"],
join_type="inner",
)
assert result.shape == (275721, 3)
assert list(result.columns) == ["gene_id", "gene_name", "tx_id"]

# table genes and transcripts with filter
result = hsapiens108.genes(
cols=["gene_id", "gene_name", "tx_id"],
columns=["gene_id", "gene_name", "tx_id"],
join_type="inner",
filter=gf.filters.GeneBioTypeFilter(["protein_coding"]),
)
Expand All @@ -86,7 +87,7 @@ def test_multiple_table_subsetting(hsapiens108):

# table genes, transcripts and exons and filter
result = hsapiens108.genes(
cols=["gene_id", "gene_name", "exon_id"],
columns=["gene_id", "gene_name", "exon_id"],
join_type="inner",
filter=gf.filters.GeneIDFilter(["ENSG00000139618"]),
)
Expand All @@ -97,7 +98,7 @@ def test_multiple_table_subsetting(hsapiens108):
# test left join
# table genes and transcripts
result = hsapiens108.genes(
cols=["gene_id", "gene_name", "protein_id"],
columns=["gene_id", "gene_name", "protein_id"],
join_type="left",
filter=gf.filters.GeneBioTypeFilter(["protein_coding"]),
)
Expand All @@ -115,7 +116,7 @@ def test_multiple_table_subsetting(hsapiens108):

def test_chromosome_columns(hsapiens108):
# https://github.com/scverse/genomic-features/pull/44/files#r1196331705
result = hsapiens108.genes(cols=["gene_id", "seq_name", "seq_length"])
result = hsapiens108.genes(columns=["gene_id", "seq_name", "seq_length"])
assert result.shape[0] == hsapiens108.db.table("gene").count().execute()

chroms = hsapiens108.chromosomes()
Expand Down
Loading
Loading