From 2e4cd1b9b09c9a86c3979f7aea84e5416e4b7005 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 20 Sep 2023 16:39:30 +0200 Subject: [PATCH 1/3] cols -> columns --- docs/notebooks/basic_usage.ipynb | 4 +- src/genomic_features/_core/filters.py | 2 +- src/genomic_features/ensembl/ensembldb.py | 100 +++++++++++----------- tests/test_basic.py | 4 +- tests/test_columns.py | 12 +-- tests/test_filters.py | 4 +- 6 files changed, 65 insertions(+), 61 deletions(-) diff --git a/docs/notebooks/basic_usage.ipynb b/docs/notebooks/basic_usage.ipynb index 0a90d2f..279c254 100644 --- a/docs/notebooks/basic_usage.ipynb +++ b/docs/notebooks/basic_usage.ipynb @@ -869,7 +869,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the `cols` argument, you can get annotations from other tables in the database." + "Using the `columns` argument, you can get annotations from other tables in the database." ] }, { @@ -965,7 +965,7 @@ } ], "source": [ - "ensdb.genes(cols=[\"gene_id\", \"tx_id\", \"gene_name\", \"protein_id\", \"uniprot_id\"]).head()" + "ensdb.genes(columns=[\"gene_id\", \"tx_id\", \"gene_name\", \"protein_id\", \"uniprot_id\"]).head()" ] }, { diff --git a/src/genomic_features/_core/filters.py b/src/genomic_features/_core/filters.py index 0c74429..5062a05 100644 --- a/src/genomic_features/_core/filters.py +++ b/src/genomic_features/_core/filters.py @@ -251,7 +251,7 @@ class CanonicalTxFilter(AbstractFilterExpr): >>> ensdb.transcripts(filter=gf.filters.CanonicalTxFilter()) >>> ensdb.exons( - ... cols=["tx_id", "exon_id", "seq_name", "exon_seq_start", "exon_seq_end"], + ... columns=["tx_id", "exon_id", "seq_name", "exon_seq_start", "exon_seq_end"], ... filter=gf.filters.CanonicalTxFilter() ... 
) """ diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index a731470..4c4f80e 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -144,7 +144,7 @@ def __repr__(self) -> str: def genes( self, - cols: list[str] | None = None, + columns: list[str] | None = None, filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", ) -> DataFrame: @@ -152,34 +152,34 @@ def genes( Parameters ---------- - cols + columns Which columns to retrieve from the database. Can be from other tables. Returns all gene columns if None. filters Filters to apply to the query. join_type - How to perform joins during the query (if cols or filters requires them). + How to perform joins during the query (if columns or filters requires them). Usage ----- - >>> ensdb.genes(cols=["gene_id", "gene_name", "tx_id"]) + >>> ensdb.genes(columns=["gene_id", "gene_name", "tx_id"]) """ table: Final = "gene" - if cols is None: + if columns is None: # TODO: check why R adds entrezid - cols = self.list_columns(table) # get all columns + columns = self.list_columns(table) # get all columns - cols = cols.copy() - if "gene_id" not in cols: # genes always needs gene_id - cols.append("gene_id") + columns = columns.copy() + if "gene_id" not in columns: # genes always needs gene_id + columns.append("gene_id") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, columns, filter, join_type) return self._execute_query(query) def transcripts( self, - cols: list[str] | None = None, + columns: list[str] | None = None, filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", ) -> DataFrame: @@ -187,37 +187,39 @@ def transcripts( Parameters ---------- - cols + columns Which columns to retrieve from the database. Can be from other tables. Returns all transcript columns if None. 
filters Filters to apply to the query. join_type - How to perform joins during the query (if cols or filters requires them). + How to perform joins during the query (if columns or filters requires them). Usage ----- - >>> ensdb.transcripts(cols=["tx_id", "tx_name", "gene_id"]) + >>> ensdb.transcripts(columns=["tx_id", "tx_name", "gene_id"]) """ table: Final = "tx" - if cols is None: - cols = self.list_columns(table) # get all columns + if columns is None: + columns = self.list_columns(table) # get all columns - cols = cols.copy() + columns = columns.copy() # Require primary key in output - if "tx_id" not in cols: - cols.append("tx_id") + if "tx_id" not in columns: + columns.append("tx_id") # seq_name is required for genomic range operations - if ("tx_seq_start" in cols or "tx_seq_end" in cols) and "seq_name" not in cols: - cols.append("seq_name") + if ( + "tx_seq_start" in columns or "tx_seq_end" in columns + ) and "seq_name" not in columns: + columns.append("seq_name") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, columns, filter, join_type) return self._execute_query(query) def exons( self, - cols: list[str] | None = None, + columns: list[str] | None = None, filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", ) -> DataFrame: @@ -225,7 +227,7 @@ def exons( Parameters ---------- - cols + columns Which columns to retrieve from the database. Can be from other tables. Returns all exon columns if None. 
filter @@ -239,20 +241,20 @@ def exons( >>> ensdb.exons() """ table: Final = "exon" - if cols is None: - cols = self.list_columns(table) # get all columns + if columns is None: + columns = self.list_columns(table) # get all columns - cols = cols.copy() + columns = columns.copy() # Require primary key in output - if "exon_id" not in cols: - cols.append("exon_id") + if "exon_id" not in columns: + columns.append("exon_id") # seq_name is required for genomic range operations if ( - "exon_seq_start" in cols or "exon_seq_end" in cols - ) and "seq_name" not in cols: - cols.append("seq_name") + "exon_seq_start" in columns or "exon_seq_end" in columns + ) and "seq_name" not in columns: + columns.append("seq_name") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, columns, filter, join_type) return self._execute_query(query) def _execute_query(self, query: IbisTable) -> DataFrame: @@ -272,19 +274,19 @@ def chromosomes(self) -> DataFrame: def _build_query( self, table: Literal["gene", "tx", "exon"], - cols: list[str], + columns: list[str], filter: _filters.AbstractFilterExpr, join_type: Literal["inner", "left"] = "inner", ) -> IbisTable: """Build a query for the genomic features table.""" - # Finalize cols - self._clean_columns(cols) + # Finalize columns + self._clean_columns(columns) for col in filter.columns(): - if col not in cols: - cols.append(col) + if col not in columns: + columns.append(col) # check if join is required - tables = self._get_required_tables(self._tables_for_columns(cols)) + tables = self._get_required_tables(self._tables_for_columns(columns)) # Basically just to make sure exons stay in the query if table not in tables: @@ -295,7 +297,7 @@ def _build_query( else: query = self.db.table(table) # add filter - query = query.filter(filter.convert()).select(cols) + query = query.filter(filter.convert()).select(columns) return query def _join_query( @@ -438,26 +440,26 @@ def _clean_columns(self, columns: list[str]) 
-> list[str]: columns = [columns] valid_columns = set(self.list_columns()) - cols = list(filter(lambda c: c in valid_columns, columns)) + output_columns = list(filter(lambda c: c in valid_columns, columns)) invalid_columns = set(columns) - valid_columns if invalid_columns: raise ValueError( f"The following columns are not found in any database: {invalid_columns}" ) - if not cols: + if not output_columns: raise ValueError("No valid columns were found.") - return cols + return output_columns - def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list: + def _tables_for_columns(self, columns: list, start_with: str | None = None) -> list: """ Return a list of tables that contain the specified columns. Parameters ---------- - cols + columns Columns that we're looking for. """ - cols = self._clean_columns(cols) + columns = self._clean_columns(columns) table_list = self._tables_by_degree() # list of table names # remove start_with from table_list and add it to the beginning of the list @@ -472,14 +474,14 @@ def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list tables = [] for t in table_list: # check if all columns are in one table - if set(cols).issubset(self.db.table(t).columns): + if set(columns).issubset(self.db.table(t).columns): tables.append(t) return tables else: # check if a single column is in the table - for c in cols.copy(): + for c in columns.copy(): if c in self.db.table(t).columns: if t not in tables: tables.append(t) - cols.remove(c) # remove column from list + columns.remove(c) # remove column from list return tables diff --git a/tests/test_basic.py b/tests/test_basic.py index 3e90722..ad10a9d 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -27,7 +27,9 @@ def test_repr(): def test_invalid_join(): with pytest.raises(ValueError, match=r"Invalid join type: flarb"): - gf.ensembl.annotation("Hsapiens", 108).genes(cols=["tx_id"], join_type="flarb") + gf.ensembl.annotation("Hsapiens", 108).genes( + 
columns=["tx_id"], join_type="flarb" + ) def test_exons(): diff --git a/tests/test_columns.py b/tests/test_columns.py index 29bb2e4..5e99c93 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -60,7 +60,7 @@ def test_required_tables(hsapiens108): # Test simple subsetting to columns in one table gene def test_simple_subsetting(hsapiens108): - result = hsapiens108.genes(cols=["gene_id", "gene_name"]) + result = hsapiens108.genes(columns=["gene_id", "gene_name"]) assert result.shape == (70616, 2) assert result.columns.tolist() == ["gene_id", "gene_name"] @@ -69,7 +69,7 @@ def test_simple_subsetting(hsapiens108): def test_multiple_table_subsetting(hsapiens108): # table genes and transcripts result = hsapiens108.genes( - cols=["gene_id", "gene_name", "tx_id"], + columns=["gene_id", "gene_name", "tx_id"], join_type="inner", ) assert result.shape == (275721, 3) @@ -77,7 +77,7 @@ def test_multiple_table_subsetting(hsapiens108): # table genes and transcripts with filter result = hsapiens108.genes( - cols=["gene_id", "gene_name", "tx_id"], + columns=["gene_id", "gene_name", "tx_id"], join_type="inner", filter=gf.filters.GeneBioTypeFilter(["protein_coding"]), ) @@ -86,7 +86,7 @@ def test_multiple_table_subsetting(hsapiens108): # table genes, transcripts and exons and filter result = hsapiens108.genes( - cols=["gene_id", "gene_name", "exon_id"], + columns=["gene_id", "gene_name", "exon_id"], join_type="inner", filter=gf.filters.GeneIDFilter(["ENSG00000139618"]), ) @@ -97,7 +97,7 @@ def test_multiple_table_subsetting(hsapiens108): # test left join # table genes and transcripts result = hsapiens108.genes( - cols=["gene_id", "gene_name", "protein_id"], + columns=["gene_id", "gene_name", "protein_id"], join_type="left", filter=gf.filters.GeneBioTypeFilter(["protein_coding"]), ) @@ -115,7 +115,7 @@ def test_multiple_table_subsetting(hsapiens108): def test_chromosome_columns(hsapiens108): # https://github.com/scverse/genomic-features/pull/44/files#r1196331705 - result 
= hsapiens108.genes(cols=["gene_id", "seq_name", "seq_length"]) + result = hsapiens108.genes(columns=["gene_id", "seq_name", "seq_length"]) assert result.shape[0] == hsapiens108.db.table("gene").count().execute() chroms = hsapiens108.chromosomes() diff --git a/tests/test_filters.py b/tests/test_filters.py index 33d5756..9ed3a07 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -77,7 +77,7 @@ def test_equality_filter_list(hsapiens108, filt, table_method): def test_canonical(hsapiens108, table_method): func = table_method(hsapiens108) result = func( - cols=["tx_id", "canonical_transcript"], filter=filters.CanonicalTxFilter() + columns=["tx_id", "canonical_transcript"], filter=filters.CanonicalTxFilter() ) assert result["tx_is_canonical"].sum() == result.shape[0] @@ -86,7 +86,7 @@ def test_canonical(hsapiens108, table_method): ) result_non_canonical = func( - cols=["tx_id", "canonical_transcript"], filter=~filters.CanonicalTxFilter() + columns=["tx_id", "canonical_transcript"], filter=~filters.CanonicalTxFilter() ) assert result_non_canonical["tx_is_canonical"].sum() == 0 From 62af0ad9742d8e5cd078039982a4f823d0eaed62 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 20 Sep 2023 16:58:45 +0200 Subject: [PATCH 2/3] GeneRangesFilter -> GeneRangeFilter --- docs/api.md | 2 +- docs/notebooks/basic_usage.ipynb | 4 ++-- src/genomic_features/_core/filters.py | 2 +- src/genomic_features/filters.py | 4 ++-- tests/test_filters.py | 10 +++++----- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/api.md b/docs/api.md index a0d853c..0970c94 100644 --- a/docs/api.md +++ b/docs/api.md @@ -27,7 +27,7 @@ filters.GeneBioTypeFilter filters.GeneNameFilter filters.SeqNameFilter - filters.GeneRangesFilter + filters.GeneRangeFilter filters.TxIDFilter filters.TxBioTypeFilter filters.ExonIDFilter diff --git a/docs/notebooks/basic_usage.ipynb b/docs/notebooks/basic_usage.ipynb index 279c254..7fdc4d4 100644 --- a/docs/notebooks/basic_usage.ipynb +++
b/docs/notebooks/basic_usage.ipynb @@ -765,7 +765,7 @@ } ], "source": [ - "ensdb.genes(filter=gf.filters.GeneRangesFilter(\"1:10000-20000\"))" + "ensdb.genes(filter=gf.filters.GeneRangeFilter(\"1:10000-20000\"))" ] }, { @@ -854,7 +854,7 @@ ], "source": [ "ensdb.genes(\n", - " filter=gf.filters.GeneBioTypeFilter(\"lncRNA\") & gf.filters.GeneRangesFilter(\"1:10000-20000\")\n", + " filter=gf.filters.GeneBioTypeFilter(\"lncRNA\") & gf.filters.GeneRangeFilter(\"1:10000-20000\")\n", ")" ] }, diff --git a/src/genomic_features/_core/filters.py b/src/genomic_features/_core/filters.py index 5062a05..2f7736d 100644 --- a/src/genomic_features/_core/filters.py +++ b/src/genomic_features/_core/filters.py @@ -198,7 +198,7 @@ def columns(self) -> set[str]: return {"gene_name"} -class GeneRangesFilter(AbstractFilterRangeExpr): +class GeneRangeFilter(AbstractFilterRangeExpr): """ Filter features within a genomic range diff --git a/src/genomic_features/filters.py b/src/genomic_features/filters.py index 66a549b..799f997 100644 --- a/src/genomic_features/filters.py +++ b/src/genomic_features/filters.py @@ -5,7 +5,7 @@ GeneBioTypeFilter, GeneIDFilter, GeneNameFilter, - GeneRangesFilter, + GeneRangeFilter, SeqNameFilter, TxBioTypeFilter, TxIDFilter, @@ -18,7 +18,7 @@ "CanonicalTxFilter", "GeneIDFilter", "GeneBioTypeFilter", - "GeneRangesFilter", + "GeneRangeFilter", "EmptyFilter", "ExonIDFilter", "GeneNameFilter", diff --git a/tests/test_filters.py b/tests/test_filters.py index 9ed3a07..bfd6805 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -147,10 +147,10 @@ def test_or_filter(hsapiens108): def test_range_filter(hsapiens108): any_overlap_filter = hsapiens108.genes( - filter=filters.GeneRangesFilter("1:77000000-78000000") + filter=filters.GeneRangeFilter("1:77000000-78000000") ) within_overlap_filter = hsapiens108.genes( - filter=filters.GeneRangesFilter("1:77000000-78000000", type="within") + filter=filters.GeneRangeFilter("1:77000000-78000000", type="within") ) 
assert all(within_overlap_filter.seq_name == "1") & all( any_overlap_filter.seq_name == "1" @@ -164,14 +164,14 @@ def test_range_filter(hsapiens108): ) # Test input with pytest.raises(ValueError): - hsapiens108.genes(filter=filters.GeneRangesFilter("1_77000000_78000000")) + hsapiens108.genes(filter=filters.GeneRangeFilter("1_77000000_78000000")) with pytest.raises(ValueError): - hsapiens108.genes(filter=filters.GeneRangesFilter("1-77000000-78000000")) + hsapiens108.genes(filter=filters.GeneRangeFilter("1-77000000-78000000")) with pytest.raises(ValueError): hsapiens108.genes( - filter=filters.GeneRangesFilter("1:77000000-78000000", type="start") + filter=filters.GeneRangeFilter("1:77000000-78000000", type="start") ) From 699910c357c62b3a499aada45b6190deb6378581 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 20 Sep 2023 17:00:04 +0200 Subject: [PATCH 3/3] Fix userwarning in test --- tests/test_columns.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_columns.py b/tests/test_columns.py index 5e99c93..bb2e95a 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -25,7 +25,8 @@ def test_tables_by_degree(hsapiens108): ] result = hsapiens108._tables_by_degree(tab=["protein", "exon"]) assert result == ["exon", "protein"] - result = hsapiens108._tables_by_degree(tab=["protein", "invalid_table"]) + with pytest.warns(UserWarning, match="not in the database: invalid_table"): + result = hsapiens108._tables_by_degree(tab=["protein", "invalid_table"]) assert result == ["protein"]