diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 5a554ad..025367d 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -310,12 +310,10 @@ async def get_genome_features_by_ids( ) parents, ( WITH attrs_tmp AS ( - SELECT gfak.attr_key AS attr_key, array_agg(gfav.attr_val) attr_vals - FROM genome_feature_attributes gfa - JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id - JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id - WHERE gfa.feature = gf.id - GROUP BY gfak.attr_key + SELECT gfav.attr_key, array_agg(attr_val) attr_vals + FROM genome_feature_attributes_view gfav + WHERE gfav.feature = gf.id + GROUP BY gfav.attr_key ) SELECT jsonb_object_agg(attrs_tmp.attr_key, attrs_tmp.attr_vals) FROM attrs_tmp ) attributes @@ -337,8 +335,9 @@ async def query_genome_features( g_id: str, /, q: str | None = None, + q_fzy: bool = False, name: str | None = None, - name_q: str | None = None, + name_fzy: bool = False, position: str | None = None, start: int | None = None, end: int | None = None, @@ -348,7 +347,9 @@ async def query_genome_features( ) -> tuple[list[GenomeFeature], dict]: # list of genome features + pagination dict object # TODO: refactor to use standard Bento search in the future, when Bento search makes more sense + gf_select_items: list[str] = [] gf_where_items: list[str] = [] + gf_order_items: list[str] = [] gfe_where_items: list[str] = [] q_params: list[str | int] = [] @@ -358,6 +359,7 @@ def _q_param(pv: str | int) -> str: if q: query_param = _q_param(q) + q_op = "%" if q_fzy else "~" gf_where_items.append( f""" gf.feature_id IN ( @@ -365,35 +367,33 @@ def _q_param(pv: str | int) -> str: SELECT feature_id, feature_name, - feature_type, - ({self._feature_inner_entries_query(None, "gf_tmp_1")}) entries, - ( - SELECT array_agg(gfav.attr_val) - FROM genome_feature_attributes gfa - JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id - JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id - WHERE gfa.feature = gf_tmp_1.id AND gfav.attr_val ~ {query_param} - ) attributes + feature_type FROM genome_features gf_tmp_1 WHERE - gf_tmp_1.genome_id = $1 + gf_tmp_1.genome_id = $1 AND ( + gf_tmp_1.feature_id {q_op} {query_param} + OR gf_tmp_1.feature_name {q_op} {query_param} + OR EXISTS ( + SELECT attr_val FROM genome_feature_attributes_view gfav + WHERE gfav.feature = gf_tmp_1.id AND gfav.attr_val {q_op} {query_param} + ) + ) ) gf_tmp_2 - WHERE - array_length(gf_tmp_2.attributes, 1) > 0 - OR gf_tmp_2.feature_id ~ {query_param} - OR gf_tmp_2.feature_name ~ {query_param} ) """ ) if name: - gf_where_items.append(f"gf.feature_name = {_q_param(name)}") - - if name_q: - gf_where_items.append(f"gf.feature_name ~ {_q_param(name_q)}") + param = _q_param(name) + if name_fzy: + gf_select_items.append(f"similarity(gf.feature_name, {param}) gf_fn_sml") + gf_where_items.append(f"gf.feature_name % {param}") + gf_order_items.append("gf_fn_sml DESC") + else: + gf_where_items.append(f"gf.feature_name = {param}") if position: - gfe_where_items.append(f"gfe.position_text ~ {_q_param(position)}") + gfe_where_items.append(f"gfe.position_text ILIKE {_q_param(position + '%')}") if start is not None: gfe_where_items.append(f"gfe.start_pos >= {_q_param(start)}") @@ -411,7 +411,7 @@ def _q_param(pv: str | int) -> str: gfe_where_clause = " AND ".join(gfe_where_items) if gfe_where_items else None id_query = f""" - SELECT feature_id FROM ( + SELECT feature_id {", " + ", ".join(gf_select_items) if gf_select_items else ""} FROM ( SELECT feature_id, feature_name, @@ -424,6 +424,7 @@ def _q_param(pv: str | int) -> str: WHERE {"jsonb_array_length(gf.entries) > 0 AND" if gfe_where_clause else ""} {where_clause} + {"ORDER BY " + ", ".join(gf_order_items) if gf_order_items else ""} OFFSET $2 LIMIT $3 """ diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 8e9f278..a51aff5 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -162,8 +162,9 @@ async def genomes_detail_features( db: DatabaseDependency, genome_id: str, q: str | None = None, + q_fzy: bool = False, name: str | None = None, - name_q: str | None = None, + name_fzy: bool = False, position: str | None = None, start: int | None = None, end: int | None = None, @@ -174,7 +175,7 @@ async def genomes_detail_features( st = datetime.now() results, pagination = await db.query_genome_features( - genome_id, q, name, name_q, position, start, end, feature_type, offset, limit + genome_id, q, q_fzy, name, name_fzy, position, start, end, feature_type, offset, limit ) return { diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index b305874..8a2d2e6 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -152,6 +152,12 @@ CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_key_idx CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_val_idx ON genome_feature_attributes (feature, attr_val); +CREATE OR REPLACE VIEW genome_feature_attributes_view AS + SELECT gfa.feature feature, gfak.attr_key attr_key, gfav.attr_val attr_val + FROM genome_feature_attributes gfa + JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id + JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id; + DO $$ BEGIN CREATE TYPE task_kind AS ENUM ('ingest_features'); diff --git a/tests/conftest.py b/tests/conftest.py index 8af7ae0..d15ea08 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,6 +41,8 @@ async def db_cleanup(db: Database): DROP TYPE IF EXISTS task_kind; DROP TYPE IF EXISTS task_status; + DROP VIEW genome_feature_attributes_view; + DROP INDEX IF EXISTS genome_features_feature_id_trgm_gin; DROP INDEX IF EXISTS genome_features_feature_name_trgm_gin; DROP INDEX IF EXISTS genome_feature_entries_position_text_trgm_gin; diff --git a/tests/test_db.py b/tests/test_db.py index 27f688d..8742858 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -112,11 +112,13 @@ async def test_genome_features_summary(db: Database, db_cleanup): [ # SARS-CoV-2 (SARS_COV_2_GENOME_ID, dict(name="ORF1ab"), 3), # should get back 2 genes and 1 transcript - (SARS_COV_2_GENOME_ID, dict(name_q="ORF1"), 6), # ORF1ab, ORF1a, ORF10 + # ORF1ab, ORF1a, ORF10 should be top 6 results, but we get more back since it's fuzzy + # (ORF3a, ORF6, ORF7[a|b], ORF8): + (SARS_COV_2_GENOME_ID, dict(name="ORF1", name_fzy=True, limit=100), 16), (SARS_COV_2_GENOME_ID, dict(start=1, end=1000), 9), # region + 8 related to ORF1ab (SARS_COV_2_GENOME_ID, dict(q="ORF1ab"), 3), (SARS_COV_2_GENOME_ID, dict(q="ENSSASG00005000002"), 1), - (SARS_COV_2_GENOME_ID, dict(q="protein_coding", limit=100), 24), + (SARS_COV_2_GENOME_ID, dict(q="protein_coding", q_fzy=True, limit=100), 24), # hg38 subset (HG38_CHR1_F100K_GENOME_ID, dict(position="chr1:11869-"), 3), (HG38_CHR1_F100K_GENOME_ID, dict(start=12000), 10), diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index cba98e0..a1dc078 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -266,25 +266,37 @@ async def test_genome_feature_endpoints(test_client: TestClient, aioresponse: ai # Test we can query genome features sr = test_client.get(f"/genomes/{genome.id}/feature_types") + assert sr.status_code == 200 srd = sr.json() assert sum(srd.values()) == expected_features # Test we can query genome features + + # - regular expression sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003"}) + assert sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 assert isinstance(srd.get("time"), float) assert srd["time"] < 0.2 # this is a very basic operation on a small dataset and should be fast. + # - fuzzy search + sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003", "q_fzy": "true"}) + assert sr.status_code == 200 + srd = sr.json() + assert len(srd["results"]) == 10 # fuzzy search yields many results + # Test we can filter genome features (ID used as name) sr = test_client.get(f"/genomes/{genome.id}/features", params={"name": "CDS:ENSSASP00005000003"}) + assert sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 # Test we can list genome features - we get back the first 10 sr = test_client.get(f"/genomes/{genome.id}/features") + assert sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 10 assert srd["pagination"]["offset"] == 0