Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CURIE ID query support #173

Merged
merged 12 commits into from
Apr 19, 2024
116 changes: 103 additions & 13 deletions src/config_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
https://mychem.info/
Chemical and Drug Annotation as a Service.
"""

import copy
import re

Expand All @@ -15,31 +16,120 @@
ES_INDICES = {
"chem": "mychem_current",
"drug": "mychem_current",
"compound": "mychem_current"
"compound": "mychem_current",
}
ES_SCROLL_TIME = '10m'
ES_SCROLL_TIME = "10m"

# *****************************************************************************
# Endpoint Specifics
# *****************************************************************************

# *** NOTE ***
# The CHEBI prefix must have a regex_term_pattern without a named <term> grouping.
# example query: CHEBI:57966:
# code snippet location: <biothings.api/web/query/builder.py>
# With a named term grouping of <term>, we produce the following which will fail
# named_groups = match.groupdict() -> {"term": 57966}
# q = named_groups.get(self.gpname.term) or q -> "57966"
# Without a named term grouping of <term> we produce the following which will pass
# named_groups = match.groupdict() -> {}
# q = named_groups.get(self.gpname.term) or q -> "CHEBI:57966"

# Maps each supported BioLink Model CURIE prefix to how it is resolved:
#   "type"               - the biothings type the prefix belongs to
#   "field"              - document field(s) the extracted term is searched in
#                          (optional; an empty list is used when absent)
#   "regex_term_pattern" - regex for the part after "<prefix>:"
#                          (optional; a generic "anything but a colon" pattern
#                          with a named <term> group is used when absent)
BIOLINK_MODEL_PREFIX_BIOTHINGS_CHEM_MAPPING = {
    "INCHIKEY": {"type": "chem"},
    "CHEMBL.COMPOUND": {
        "type": "chem",
        "field": "chembl.molecule_chembl_id",
        "regex_term_pattern": "(?P<term>chembl[0-9]+)",
        # "converter": lambda x: x.replace("CHEMBL.COMPOUND:", "CHEMBL"),
    },
    "PUBCHEM.COMPOUND": {
        "type": "chem",
        "field": "pubchem.cid",
        "regex_term_pattern": "(?P<term>[0-9]+)",
    },
    "CHEBI": {
        "type": "chem",
        "field": ["chebi.id", "chebi.secondary_chebi_id"],
        "regex_term_pattern": "(?P<term>CHEBI:[0-9]+)",
    },
    "UNII": {
        "type": "chem",
        "field": "unii.unii",
        "regex_term_pattern": "(?P<term>[A-Z0-9]{10})",
    },
}

# CURIE ID support based on BioLink Model
# Each entry is a (compiled case-insensitive regex, field(s)) pair of the form
# "(<prefix>):<term pattern>".
biolink_curie_regex_list = []
for biolink_prefix, mapping in BIOLINK_MODEL_PREFIX_BIOTHINGS_CHEM_MAPPING.items():
    field_match = mapping.get("field", [])
    term_pattern = mapping.get("regex_term_pattern", None)
    if term_pattern is None:
        term_pattern = "(?P<term>[^:]+)"

    # The prefix is literal text: escape it so that the "." in prefixes such as
    # "CHEMBL.COMPOUND" only matches a dot instead of any character.
    raw_expression = rf"({re.escape(biolink_prefix)}):{term_pattern}"
    compiled_expression = re.compile(raw_expression, re.I)

    pattern = (compiled_expression, field_match)
    biolink_curie_regex_list.append(pattern)

# Custom prefix handling for chem specific identifiers
# Custom prefix handling for chem specific identifiers
# Each entry is a (compiled case-insensitive regex, field(s)) pair. Most
# patterns accept either "<prefix>:<identifier>" (identifier captured in the
# named <term> group) or the bare identifier on its own.
# Dots inside literal prefixes are escaped so they only match a "." character.
chem_prefix_handling = [
    (
        re.compile(r"((chembl\:(?P<term>chembl[0-9]+))|(chembl[0-9]+))", re.I),
        "chembl.molecule_chembl_id",
    ),
    # Deliberately has no named <term> group, so the whole matched text
    # (e.g. "CHEBI:57966", prefix included) is kept as the query term.
    (re.compile(r"chebi\:[0-9]+", re.I), ["chebi.id", "chebi.secondary_chebi_id"]),
    (re.compile(r"((unii\:(?P<term>[A-Z0-9]{10}))|([A-Z0-9]{10}))", re.I), "unii.unii"),
    (
        re.compile(r"((drugbank\:(?P<term>db[0-9]+))|(db[0-9]+))", re.I),
        [
            "unichem.drugbank",
            "chebi.xrefs.drugbank",
            "drugcentral.xrefs.drugbank_id",
            "pharmgkb.xrefs.drugbank",
        ],
    ),
    (
        re.compile(r"((pharmgkb\.drug\:(?P<term>pa[0-9]+))|(pa[0-9]+))", re.I),
        "pharmgkb.id",
    ),
    (
        re.compile(
            r"((((pubchem\.compound\:)|(cid\:))(?P<term>[0-9]+))|([0-9]+))", re.I
        ),
        ["pubchem.cid"],
    ),
    (
        re.compile(
            r"((((sid\:)|(pubchem\.substance\:))(?P<term>[0-9]+))|([0-9]+))", re.I
        ),
        ["fda_orphan_drug.pubchem_sid"],
    ),
]

# Catch-all "<scope>:<term>" pattern paired with an empty field tuple.
default_chem_fields = ()
default_chem_regex = re.compile(
    r"(?P<scope>[^:]+)"  # everything up to the first colon
    r":"
    r"(?P<term>[\W\w]+)"  # one or more of any character, newlines included
)
default_chem_regex_pattern = (default_chem_regex, default_chem_fields)


ANNOTATION_ID_REGEX_LIST = [
(re.compile(r'chembl[0-9]+', re.I), 'chembl.molecule_chembl_id'),
(re.compile(r'chebi\:[0-9]+', re.I), ['chebi.id', 'chebi.secondary_chebi_id']),
(re.compile(r'[A-Z0-9]{10}'), 'unii.unii'),
(re.compile(r'db[0-9]+', re.I), ['unichem.drugbank', 'chebi.xrefs.drugbank', 'drugcentral.xrefs.drugbank_id', 'pharmgkb.xrefs.drugbank']),
(re.compile(r'pa[0-9]+', re.I), 'pharmgkb.id'),
(re.compile(r'((cid\:(?P<term>[0-9]+))|([0-9]+))', re.I), ['pubchem.cid', 'fda_orphan_drug.pubchem_sid'])
*biolink_curie_regex_list,
*chem_prefix_handling,
default_chem_regex_pattern,
]


STATUS_CHECK = {
'id': 'USNINKBPBVKHHZ-CYUUQNCZSA-L', # penicillin
'index': 'mychem_current',
"id": "USNINKBPBVKHHZ-CYUUQNCZSA-L", # penicillin
"index": "mychem_current",
}

_extra_kwargs = {
"list_filter": {"type": str, "default": None}
}
_extra_kwargs = {"list_filter": {"type": str, "default": None}}
ANNOTATION_KWARGS = copy.deepcopy(ANNOTATION_KWARGS)
ANNOTATION_KWARGS["*"].update(_extra_kwargs)
QUERY_KWARGS = copy.deepcopy(QUERY_KWARGS)
Expand Down
189 changes: 189 additions & 0 deletions src/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import logging

import pytest
import requests

from biothings.tests.web import BiothingsDataTest


# Module-level logger, set to DEBUG so it does not filter out any messages
# emitted by the tests in this module.
logger = logging.getLogger(name=__name__)
logger.setLevel(level=logging.DEBUG)


class TestMyChemCurieIdParsing(BiothingsDataTest):
    """
    Live-host tests for BioLink CURIE ID support on the chem endpoints.

    Every query group below lists different spellings of one identifier
    (bare id, CURIE prefix in several letter cases, explicit field scope).
    Each test checks that all spellings in a group return the same document.
    """

    # The class name, the "chem" endpoint and the xfail reasons all target
    # MyChem, so the requests must be sent to the MyChem host.
    host = "mychem.info"
    prefix = "v1"

    # Shared by the GET and POST tests. The first entry of every group is the
    # query the POST test uses to fetch the reference document.
    _CURIE_ID_QUERY_GROUPS = (
        (
            "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
            "CHEMBL297569",
            "CHEMBL.COMPOUND:CHEMBL297569",
            "chembl.compound:CHEMBL297569",
            "cHEmbl.ComPOUND:CHEMBL297569",
            "chembl.molecule_chembl_id:CHEMBL297569",
        ),
        (
            "AKUPVPKIFATOBM-UHFFFAOYSA-N",
            "120933777",
            120933777,
            "PUBCHEM.COMPOUND:120933777",
            "pubchem.compound:120933777",
            "PuBcHEm.COMPound:120933777",
            "pubchem.cid:120933777",
        ),
        (
            "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
            "CHEBI:CHEBI:57966",
            "chebi:CHEBI:57966",
            "CheBi:CHEBI:57966",
            "chebi.id:CHEBI:57966",
        ),
        (
            "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
            "11P2JDE17B",
            "UNII:11P2JDE17B",
            "unii:11P2JDE17B",
            "uNIi:11P2JDE17B",
            "unii.unii:11P2JDE17B",
        ),
        (
            "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
            "dB03107",
            "DRUGBANK:dB03107",
            "drugbank:dB03107",
            "DrugBaNK:dB03107",
            "drugbank.id:dB03107",
        ),
    )

    @pytest.mark.xfail(
        reason="CURIE ID SUPPORT NOT CURRENTLY ENABLED ON MYCHEM.INFO HOST",
        run=True,
        strict=True,
    )
    def test_001_curie_id_annotation_endpoint_GET(self):
        """
        Tests the annotation endpoint support for the biolink CURIE ID.

        If support is enabled then we should retrieve the exact same document
        for all the provided queries

        A mirror copy of the tests we have in the biothings_client
        package (chem.py)
        """
        aggregation_query_groups = []
        endpoint = "chem"
        for query_collection in self._CURIE_ID_QUERY_GROUPS:
            query_result_storage = []
            for similar_query in query_collection:
                path = f"{endpoint}/{similar_query}"
                # One request per query; the response is checked and stored.
                query_result = self.request(path, expect=200)
                assert isinstance(query_result, requests.models.Response)
                assert query_result.url == self.get_url(path=path)
                query_result_storage.append(query_result.json())

            # Every spelling must return the same document as the first one.
            group_matches = all(
                document == query_result_storage[0]
                for document in query_result_storage[1:]
            )

            if group_matches:
                logger.info("Query group %s succeeded", query_collection)
            else:
                logger.info("Query group %s failed", query_collection)

            aggregation_query_groups.append(group_matches)
        assert all(aggregation_query_groups)

    @pytest.mark.xfail(
        reason="CURIE ID SUPPORT NOT CURRENTLY ENABLED ON MYCHEM.INFO HOST",
        run=True,
        strict=True,
    )
    def test_002_curie_id_annotation_endpoint_POST(self):
        """
        Tests the annotations endpoint support for the biolink CURIE ID.

        Batch query testing against the POST endpoint to verify that the
        CURIE ID can work with multiple ids submitted in a single request

        If support is enabled then we should retrieve the exact same document
        for all the provided queries

        A mirror copy of the tests we have in the biothings_client
        package (chem.py)
        """
        results_aggregation = []
        endpoint = "chem"
        delimiter = ","
        for query_collection in self._CURIE_ID_QUERY_GROUPS:
            # Reference document: a GET on the first query of the group.
            base_result = self.request(f"{endpoint}/{query_collection[0]}", expect=200)
            base_document = base_result.json()

            data_mapping = {
                "ids": delimiter.join(f'"{query}"' for query in query_collection)
            }

            query_results = self.request(
                endpoint, method="POST", data=data_mapping
            ).json()
            assert len(query_results) == len(query_collection)

            batch_result = []
            for query_result, query_entry in zip(query_results, query_collection):
                # Remove the echoed "query" field before comparing the rest of
                # the hit against the reference document.
                return_query_field = query_result.pop("query")
                assert return_query_field == str(query_entry)
                batch_result.append(base_document == query_result)

            # Aggregate this group's own comparisons. Using the still-empty
            # cross-group list here would make the check vacuously true.
            aggregate_result = all(batch_result)

            if aggregate_result:
                logger.info("Query group %s succeeded", query_collection)
            else:
                logger.info("Query group %s failed", query_collection)

            results_aggregation.append(aggregate_result)
        assert all(results_aggregation)
Loading
Loading