biothings · ctrl-schaff · Apr 19, 2024 · Apr 20, 2022 · Feb 26, 2024 · Feb 26, 2024
diff --git a/src/config_web.py b/src/config_web.py
@@ -3,6 +3,7 @@
     https://mychem.info/
     Chemical and Drug Annotation as a Service.
 """
+
 import copy
 import re
 
@@ -15,31 +16,120 @@
 ES_INDICES = {
     "chem": "mychem_current",
     "drug": "mychem_current",
-    "compound": "mychem_current"
+    "compound": "mychem_current",
 }
-ES_SCROLL_TIME = '10m'
+ES_SCROLL_TIME = "10m"
 
 # *****************************************************************************
 # Endpoint Specifics
 # *****************************************************************************
 
+# *** NOTE ***
+# The CHEBI prefix must have a regex_term_pattern without a named <term> grouping.
+# example query: CHEBI:57966:
+# code snippet location: <biothings.api/web/query/builder.py>
+# With a named term grouping of <term>, we produce the following which will fail
+#     named_groups = match.groupdict() -> {"term": 57966}
+#     q = named_groups.get(self.gpname.term) or q -> "57966"
+# Without a named term grouping of <term> we produce the following which will pass
+#     named_groups = match.groupdict() -> {}
+#     q = named_groups.get(self.gpname.term) or q -> "CHEBI:57966"
+
+BIOLINK_MODEL_PREFIX_BIOTHINGS_CHEM_MAPPING = {  # BioLink prefix -> {"type", optional "field", optional "regex_term_pattern"}
+    "INCHIKEY": {"type": "chem"},  # no "field"/"regex_term_pattern": the consuming loop uses [] and "(?P<term>[^:]+)"
+    "CHEMBL.COMPOUND": {
+        "type": "chem",
+        "field": "chembl.molecule_chembl_id",
+        "regex_term_pattern": "(?P<term>chembl[0-9]+)",
+        # "converter": lambda x: x.replace("CHEMBL.COMPOUND:", "CHEMBL"),
+    },
+    "PUBCHEM.COMPOUND": {
+        "type": "chem",
+        "field": "pubchem.cid",
+        "regex_term_pattern": "(?P<term>[0-9]+)",
+    },
+    "CHEBI": {
+        "type": "chem",
+        "field": ["chebi.id", "chebi.secondary_chebi_id"],  # a list: more than one field is searched for this prefix
+        "regex_term_pattern": "(?P<term>CHEBI:[0-9]+)",  # the term itself carries "CHEBI:", i.e. queries look like CHEBI:CHEBI:57966
+    },
+    "UNII": {
+        "type": "chem",
+        "field": "unii.unii",
+        "regex_term_pattern": "(?P<term>[A-Z0-9]{10})",
+    },
+}
+
+# CURIE ID support based on BioLink Model
+# Each mapping entry becomes a (compiled "<prefix>:<term>" regex, field(s)) pair.
+biolink_curie_regex_list = []
+for biolink_prefix, mapping in BIOLINK_MODEL_PREFIX_BIOTHINGS_CHEM_MAPPING.items():
+    field_match = mapping.get("field", [])
+    term_pattern = mapping.get("regex_term_pattern", None)
+    if term_pattern is None:
+        # No explicit pattern: accept any colon-free term after the prefix.
+        term_pattern = "(?P<term>[^:]+)"
+
+    # Escape the prefix so the "." in e.g. CHEMBL.COMPOUND only matches a literal
+    # dot instead of acting as a regex wildcard.
+    raw_expression = rf"({re.escape(biolink_prefix)}):{term_pattern}"
+    compiled_expression = re.compile(raw_expression, re.I)
+
+    biolink_curie_regex_list.append((compiled_expression, field_match))
+
+# Custom prefix handling for chem specific identifiers; "." in a prefix is escaped so it only matches a literal dot
+chem_prefix_handling = [
+    (
+        re.compile(r"((chembl\:(?P<term>chembl[0-9]+))|(chembl[0-9]+))", re.I),
+        "chembl.molecule_chembl_id",
+    ),
+    (re.compile(r"chebi\:[0-9]+", re.I), ["chebi.id", "chebi.secondary_chebi_id"]),
+    (re.compile(r"((unii\:(?P<term>[A-Z0-9]{10}))|([A-Z0-9]{10}))", re.I), "unii.unii"),
+    (
+        re.compile(r"((drugbank\:(?P<term>db[0-9]+))|(db[0-9]+))", re.I),
+        [
+            "unichem.drugbank",
+            "chebi.xrefs.drugbank",
+            "drugcentral.xrefs.drugbank_id",
+            "pharmgkb.xrefs.drugbank",
+        ],
+    ),
+    (
+        re.compile(r"((pharmgkb\.drug\:(?P<term>pa[0-9]+))|(pa[0-9]+))", re.I),
+        "pharmgkb.id",
+    ),
+    (
+        re.compile(
+            r"((((pubchem\.compound\:)|(cid\:))(?P<term>[0-9]+))|([0-9]+))", re.I
+        ),
+        ["pubchem.cid"],
+    ),
+    (
+        re.compile(
+            r"((((sid\:)|(pubchem\.substance\:))(?P<term>[0-9]+))|([0-9]+))", re.I
+        ),
+        ["fda_orphan_drug.pubchem_sid"],
+    ),
+]
+
+default_chem_regex = re.compile(r"(?P<scope>[^:]+):(?P<term>[\W\w]+)")  # generic "<scope>:<term>" catch-all, case-sensitive
+default_chem_fields = ()  # no fixed fields; presumably the captured <scope> picks the field -- confirm in biothings
+default_chem_regex_pattern = (default_chem_regex, default_chem_fields)  # same (regex, fields) shape as the entries above
+
+
 ANNOTATION_ID_REGEX_LIST = [
-    (re.compile(r'chembl[0-9]+', re.I), 'chembl.molecule_chembl_id'),
-    (re.compile(r'chebi\:[0-9]+', re.I), ['chebi.id', 'chebi.secondary_chebi_id']),
-    (re.compile(r'[A-Z0-9]{10}'), 'unii.unii'),
-    (re.compile(r'db[0-9]+', re.I), ['unichem.drugbank', 'chebi.xrefs.drugbank', 'drugcentral.xrefs.drugbank_id', 'pharmgkb.xrefs.drugbank']),
-    (re.compile(r'pa[0-9]+', re.I), 'pharmgkb.id'),
-    (re.compile(r'((cid\:(?P<term>[0-9]+))|([0-9]+))', re.I), ['pubchem.cid', 'fda_orphan_drug.pubchem_sid'])
+    *biolink_curie_regex_list,
+    *chem_prefix_handling,
+    default_chem_regex_pattern,
 ]
 
+
 STATUS_CHECK = {
-    'id': 'USNINKBPBVKHHZ-CYUUQNCZSA-L',  # penicillin
-    'index': 'mychem_current',
+    "id": "USNINKBPBVKHHZ-CYUUQNCZSA-L",  # penicillin
+    "index": "mychem_current",
 }
 
-_extra_kwargs = {
-    "list_filter": {"type": str, "default": None}
-}
+_extra_kwargs = {"list_filter": {"type": str, "default": None}}
 ANNOTATION_KWARGS = copy.deepcopy(ANNOTATION_KWARGS)
 ANNOTATION_KWARGS["*"].update(_extra_kwargs)
 QUERY_KWARGS = copy.deepcopy(QUERY_KWARGS)

diff --git a/src/tests/test_data.py b/src/tests/test_data.py
@@ -0,0 +1,189 @@
+import logging
+
+import pytest
+import requests
+
+from biothings.tests.web import BiothingsDataTest
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+
+class TestMyChemCurieIdParsing(BiothingsDataTest):
+    """Live-host checks that equivalent chem identifiers resolve to the same document."""
+    host = "mychem.info"
+    prefix = "v1"
+
+    @pytest.mark.xfail(
+        reason="CURIE ID SUPPORT NOT CURRENTLY ENABLED ON MYCHEM.INFO HOST",
+        run=True,
+        strict=True,
+    )
+    def test_001_curie_id_annotation_endpoint_GET(self):
+        """
+        Tests the annotation endpoint support for the biolink CURIE ID.
+
+        If support is enabled then we should retrieve the exact same document
+        for all the provided queries
+
+        A mirror copy of the tests we have in the biothings_client
+        package (chem.py)
+        """
+        curie_id_testing_collection = [
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "CHEMBL297569",
+                "CHEMBL.COMPOUND:CHEMBL297569",
+                "chembl.compound:CHEMBL297569",
+                "cHEmbl.ComPOUND:CHEMBL297569",
+                "chembl.molecule_chembl_id:CHEMBL297569",
+            ),
+            (
+                "AKUPVPKIFATOBM-UHFFFAOYSA-N",
+                "120933777",
+                120933777,
+                "PUBCHEM.COMPOUND:120933777",
+                "pubchem.compound:120933777",
+                "PuBcHEm.COMPound:120933777",
+                "pubchem.cid:120933777",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "CHEBI:CHEBI:57966",
+                "chebi:CHEBI:57966",
+                "CheBi:CHEBI:57966",
+                "chebi.id:CHEBI:57966",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "11P2JDE17B",
+                "UNII:11P2JDE17B",
+                "unii:11P2JDE17B",
+                "uNIi:11P2JDE17B",
+                "unii.unii:11P2JDE17B",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "dB03107",
+                "DRUGBANK:dB03107",
+                "drugbank:dB03107",
+                "DrugBaNK:dB03107",
+                "drugbank.id:dB03107",
+            ),
+        ]
+        aggregation_query_groups = []
+        endpoint = "chem"
+        for query_collection in curie_id_testing_collection:
+            query_result_storage = []
+            for similar_query in query_collection:
+                # Fetch each id exactly once and reuse the path for the URL check.
+                request_path = f"{endpoint}/{similar_query}"
+                query_result = self.request(request_path, expect=200)
+                assert isinstance(query_result, requests.models.Response)
+                assert query_result.url == self.get_url(path=request_path)
+                query_result_storage.append(query_result.json())
+
+            results_aggregation = [
+                query == query_result_storage[0] for query in query_result_storage[1:]
+            ]
+
+            if all(results_aggregation):
+                logger.info(f"Query group {query_collection} succeeded")
+            else:
+                logger.info(f"Query group {query_collection} failed")
+
+            aggregation_query_groups.append(all(results_aggregation))
+        assert all(aggregation_query_groups)
+
+    @pytest.mark.xfail(
+        reason="CURIE ID SUPPORT NOT CURRENTLY ENABLED ON MYCHEM.INFO HOST",
+        run=True,
+        strict=True,
+    )
+    def test_002_curie_id_annotation_endpoint_POST(self):
+        """
+        Tests the annotations endpoint support for the biolink CURIE ID.
+
+        Batch query testing against the POST endpoint to verify that the CURIE ID can work with
+        multiple ids submitted in a single request
+
+        If support is enabled then we should retrieve the exact same document for all the provided
+        queries
+
+        A mirror copy of the tests we have in the biothings_client
+        package (chem.py)
+        """
+        curie_id_testing_collection = [
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "CHEMBL297569",
+                "CHEMBL.COMPOUND:CHEMBL297569",
+                "chembl.compound:CHEMBL297569",
+                "cHEmbl.ComPOUND:CHEMBL297569",
+                "chembl.molecule_chembl_id:CHEMBL297569",
+            ),
+            (
+                "AKUPVPKIFATOBM-UHFFFAOYSA-N",
+                "120933777",
+                120933777,
+                "PUBCHEM.COMPOUND:120933777",
+                "pubchem.compound:120933777",
+                "PuBcHEm.COMPound:120933777",
+                "pubchem.cid:120933777",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "CHEBI:CHEBI:57966",
+                "chebi:CHEBI:57966",
+                "CheBi:CHEBI:57966",
+                "chebi.id:CHEBI:57966",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "11P2JDE17B",
+                "UNII:11P2JDE17B",
+                "unii:11P2JDE17B",
+                "uNIi:11P2JDE17B",
+                "unii.unii:11P2JDE17B",
+            ),
+            (
+                "UCMIRNVEIXFBKS-UHFFFAOYSA-N",
+                "dB03107",
+                "DRUGBANK:dB03107",
+                "drugbank:dB03107",
+                "DrugBaNK:dB03107",
+                "drugbank.id:dB03107",
+            ),
+        ]
+
+        results_aggregation = []
+        endpoint = "chem"
+        for query_collection in curie_id_testing_collection:
+            base_result = self.request(f"{endpoint}/{query_collection[0]}", expect=200)
+
+            delimiter = ","
+            data_mapping = {
+                "ids": delimiter.join([f'"{query}"' for query in query_collection])
+            }
+
+            query_results = self.request(
+                endpoint, method="POST", data=data_mapping
+            ).json()
+            assert len(query_results) == len(query_collection)
+
+            batch_result = []
+            for query_result, query_entry in zip(query_results, query_collection):
+                return_query_field = query_result.pop("query")
+                assert return_query_field == str(query_entry)
+                batch_result.append(base_result.json() == query_result)
+            # Judge the group on the per-id comparisons gathered in batch_result.
+            aggregate_result = all(batch_result)
+
+            if aggregate_result:
+                logger.info(f"Query group {query_collection} succeeded")
+            else:
+                logger.info(f"Query group {query_collection} failed")
+
+            results_aggregation.append(aggregate_result)
+        assert all(results_aggregation)