Skip to content

Commit

Permalink
feat: fuzzy search params for feature search args: q/name
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed May 17, 2024
1 parent 1425399 commit b73f11a
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 31 deletions.
55 changes: 28 additions & 27 deletions bento_reference_service/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,12 +310,10 @@ async def get_genome_features_by_ids(
) parents,
(
WITH attrs_tmp AS (
SELECT gfak.attr_key AS attr_key, array_agg(gfav.attr_val) attr_vals
FROM genome_feature_attributes gfa
JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id
JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id
WHERE gfa.feature = gf.id
GROUP BY gfak.attr_key
SELECT gfav.attr_key, array_agg(attr_val) attr_vals
FROM genome_feature_attributes_view gfav
WHERE gfav.feature = gf.id
GROUP BY gfav.attr_key
)
SELECT jsonb_object_agg(attrs_tmp.attr_key, attrs_tmp.attr_vals) FROM attrs_tmp
) attributes
Expand All @@ -337,8 +335,9 @@ async def query_genome_features(
g_id: str,
/,
q: str | None = None,
q_fzy: bool = False,
name: str | None = None,
name_q: str | None = None,
name_fzy: bool = False,
position: str | None = None,
start: int | None = None,
end: int | None = None,
Expand All @@ -348,7 +347,9 @@ async def query_genome_features(
) -> tuple[list[GenomeFeature], dict]: # list of genome features + pagination dict object
# TODO: refactor to use standard Bento search in the future, when Bento search makes more sense

gf_select_items: list[str] = []
gf_where_items: list[str] = []
gf_order_items: list[str] = []
gfe_where_items: list[str] = []
q_params: list[str | int] = []

Expand All @@ -358,42 +359,41 @@ def _q_param(pv: str | int) -> str:

if q:
query_param = _q_param(q)
q_op = "%" if q_fzy else "~"
gf_where_items.append(
f"""
gf.feature_id IN (
SELECT feature_id FROM (
SELECT
feature_id,
feature_name,
feature_type,
({self._feature_inner_entries_query(None, "gf_tmp_1")}) entries,
(
SELECT array_agg(gfav.attr_val)
FROM genome_feature_attributes gfa
JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id
JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id
WHERE gfa.feature = gf_tmp_1.id AND gfav.attr_val ~ {query_param}
) attributes
feature_type
FROM genome_features gf_tmp_1
WHERE
gf_tmp_1.genome_id = $1
gf_tmp_1.genome_id = $1 AND (
gf_tmp_1.feature_id {q_op} {query_param}
OR gf_tmp_1.feature_name {q_op} {query_param}
OR EXISTS (
SELECT attr_val FROM genome_feature_attributes_view gfav
WHERE gfav.feature = gf_tmp_1.id AND gfav.attr_val {q_op} {query_param}
)
)
) gf_tmp_2
WHERE
array_length(gf_tmp_2.attributes, 1) > 0
OR gf_tmp_2.feature_id ~ {query_param}
OR gf_tmp_2.feature_name ~ {query_param}
)
"""
)

if name:
gf_where_items.append(f"gf.feature_name = {_q_param(name)}")

if name_q:
gf_where_items.append(f"gf.feature_name ~ {_q_param(name_q)}")
param = _q_param(name)
if name_fzy:
gf_select_items.append(f"similarity(gf.feature_name, {param}) gf_fn_sml")
gf_where_items.append(f"gf.feature_name % {param}")
gf_order_items.append("gf_fn_sml DESC")
else:
gf_where_items.append(f"gf.feature_name = {param}")

if position:
gfe_where_items.append(f"gfe.position_text ~ {_q_param(position)}")
gfe_where_items.append(f"gfe.position_text ILIKE {_q_param(position + '%')}")

if start is not None:
gfe_where_items.append(f"gfe.start_pos >= {_q_param(start)}")
Expand All @@ -411,7 +411,7 @@ def _q_param(pv: str | int) -> str:
gfe_where_clause = " AND ".join(gfe_where_items) if gfe_where_items else None

id_query = f"""
SELECT feature_id FROM (
SELECT feature_id {", " + ", ".join(gf_select_items) if gf_select_items else ""} FROM (
SELECT
feature_id,
feature_name,
Expand All @@ -424,6 +424,7 @@ def _q_param(pv: str | int) -> str:
WHERE
{"jsonb_array_length(gf.entries) > 0 AND" if gfe_where_clause else ""}
{where_clause}
{"ORDER BY " + ", ".join(gf_order_items) if gf_order_items else ""}
OFFSET $2
LIMIT $3
"""
Expand Down
5 changes: 3 additions & 2 deletions bento_reference_service/routers/genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,9 @@ async def genomes_detail_features(
db: DatabaseDependency,
genome_id: str,
q: str | None = None,
q_fzy: bool = False,
name: str | None = None,
name_q: str | None = None,
name_fzy: bool = False,
position: str | None = None,
start: int | None = None,
end: int | None = None,
Expand All @@ -174,7 +175,7 @@ async def genomes_detail_features(
st = datetime.now()

results, pagination = await db.query_genome_features(
genome_id, q, name, name_q, position, start, end, feature_type, offset, limit
genome_id, q, q_fzy, name, name_fzy, position, start, end, feature_type, offset, limit
)

return {
Expand Down
6 changes: 6 additions & 0 deletions bento_reference_service/sql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_key_idx
CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_val_idx
ON genome_feature_attributes (feature, attr_val);

-- Flattened view over feature attributes.
-- Each genome_feature_attributes row references its key and value by ID; this view joins
-- both lookup tables (genome_feature_attribute_keys, genome_feature_attribute_values) so
-- callers can select/filter on (feature, attr_key, attr_val) without repeating the joins.
-- Inner joins: only attribute rows with a matching key row AND value row appear.
CREATE OR REPLACE VIEW genome_feature_attributes_view AS
SELECT gfa.feature feature, gfak.attr_key attr_key, gfav.attr_val attr_val
FROM genome_feature_attributes gfa
JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id
JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id;


DO $$ BEGIN
CREATE TYPE task_kind AS ENUM ('ingest_features');
Expand Down
2 changes: 2 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ async def db_cleanup(db: Database):
DROP TYPE IF EXISTS task_kind;
DROP TYPE IF EXISTS task_status;
DROP VIEW genome_feature_attributes_view;
DROP INDEX IF EXISTS genome_features_feature_id_trgm_gin;
DROP INDEX IF EXISTS genome_features_feature_name_trgm_gin;
DROP INDEX IF EXISTS genome_feature_entries_position_text_trgm_gin;
Expand Down
6 changes: 4 additions & 2 deletions tests/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,13 @@ async def test_genome_features_summary(db: Database, db_cleanup):
[
# SARS-CoV-2
(SARS_COV_2_GENOME_ID, dict(name="ORF1ab"), 3), # should get back 2 genes and 1 transcript
(SARS_COV_2_GENOME_ID, dict(name_q="ORF1"), 6), # ORF1ab, ORF1a, ORF10
# ORF1ab, ORF1a, ORF10 should be top 6 results, but we get more back since it's fuzzy
# (ORF3a, ORF6, ORF7[a|b], ORF8):
(SARS_COV_2_GENOME_ID, dict(name="ORF1", name_fzy=True, limit=100), 16),
(SARS_COV_2_GENOME_ID, dict(start=1, end=1000), 9), # region + 8 related to ORF1ab
(SARS_COV_2_GENOME_ID, dict(q="ORF1ab"), 3),
(SARS_COV_2_GENOME_ID, dict(q="ENSSASG00005000002"), 1),
(SARS_COV_2_GENOME_ID, dict(q="protein_coding", limit=100), 24),
(SARS_COV_2_GENOME_ID, dict(q="protein_coding", q_fzy=True, limit=100), 24),
# hg38 subset
(HG38_CHR1_F100K_GENOME_ID, dict(position="chr1:11869-"), 3),
(HG38_CHR1_F100K_GENOME_ID, dict(start=12000), 10),
Expand Down
12 changes: 12 additions & 0 deletions tests/test_genome_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,25 +266,37 @@ async def test_genome_feature_endpoints(test_client: TestClient, aioresponse: ai

# Test we can query genome feature types (the per-type counts should sum to the expected feature total)
sr = test_client.get(f"/genomes/{genome.id}/feature_types")
assert sr.status_code == 200
srd = sr.json()
assert sum(srd.values()) == expected_features

# Test we can query genome features

# - regular expression
sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003"})
assert sr.status_code == 200
srd = sr.json()
assert len(srd["results"]) == 1
assert srd["pagination"]["total"] == 1
assert isinstance(srd.get("time"), float)
assert srd["time"] < 0.2 # this is a very basic operation on a small dataset and should be fast.

# - fuzzy search
sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003", "q_fzy": "true"})
assert sr.status_code == 200
srd = sr.json()
assert len(srd["results"]) == 10 # fuzzy search yields many results

# Test we can filter genome features (ID used as name)
sr = test_client.get(f"/genomes/{genome.id}/features", params={"name": "CDS:ENSSASP00005000003"})
assert sr.status_code == 200
srd = sr.json()
assert len(srd["results"]) == 1
assert srd["pagination"]["total"] == 1

# Test we can list genome features - we get back the first 10
sr = test_client.get(f"/genomes/{genome.id}/features")
assert sr.status_code == 200
srd = sr.json()
assert len(srd["results"]) == 10
assert srd["pagination"]["offset"] == 0
Expand Down

0 comments on commit b73f11a

Please sign in to comment.