Bring back oak and phenio (text annotator) (#497)
Closes #234

---------

Co-authored-by: Kevin Schaper <[email protected]>
Co-authored-by: madanucd <[email protected]>
Co-authored-by: glass-ships <[email protected]>
4 people authored Dec 15, 2023
1 parent 8c90aaf commit 7c0ce2b
Showing 22 changed files with 2,976 additions and 3,562 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -131,7 +131,7 @@ data:
 .PHONY: dev-frontend
 dev-frontend: frontend/src/api/model.ts
 	cd frontend && \
-	yarn dev
+	VITE_API=local yarn dev


 .PHONY: dev-api
1,255 changes: 624 additions & 631 deletions backend/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/pyproject.toml
@@ -19,7 +19,7 @@ packages = [
 python = "^3.9"
 pydantic = "^1.10.2"
 curies = "<1"
-linkml = "^1.6.2"
+linkml = ">=1.6.3"
 prefixmaps = "^0.1.7"

 requests = "^2.28.1"
4 changes: 4 additions & 0 deletions backend/src/monarch_py/api/additional_models.py
@@ -32,3 +32,7 @@ class SemsimSearchRequest(BaseModel):
     termset: List[str] = Field(..., title="Termset to search")
     category: SemsimSearchCategory = Field(..., title="Category to search for")
     limit: Optional[int] = Field(10, title="Limit the number of results", ge=1, le=50)
+
+
+class TextAnnotationRequest(BaseModel):
+    content: str = Field(..., title="The text content to annotate")
13 changes: 13 additions & 0 deletions backend/src/monarch_py/api/config.py
@@ -6,6 +6,7 @@
 from pydantic import BaseSettings

 from monarch_py.implementations.solr.solr_implementation import SolrImplementation
+from monarch_py.implementations.oak.oak_implementation import OakImplementation
 from monarch_py.datamodels.model import TermSetPairwiseSimilarity, SemsimSearchResult


@@ -43,6 +44,11 @@ def convert_nans(input_dict, to_value=None):
 class SemsimianHTTPRequester:
     """A class that makes HTTP requests to the semsimian_server."""

+    def compare(self, subjects, objects):
+        host = f"http://{settings.semsimian_server_host}:{settings.semsimian_server_port}"
+        path = f"/compare/{','.join(subjects)}/{','.join(objects)}"
+        url = f"{host}/{path}"
+
     def convert_tsps_data(self, data):
         """Convert to a format that can be coerced into a TermSetPairwiseSimilarity model
@@ -104,3 +110,10 @@ def search(self, termset: List[str], prefix: str, limit: int):
 @lru_cache(maxsize=1)
 def semsimian():
     return SemsimianHTTPRequester()
+
+
+@lru_cache(maxsize=1)
+def oak():
+    oak_implementation = OakImplementation()
+    oak_implementation.init_phenio_adapter(force_update=False)
+    return oak_implementation
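
For reference: because oak() is wrapped in lru_cache(maxsize=1), the phenio download and warmup run once (at startup, via initialize_app), and every later call returns the same OakImplementation instance. A minimal sketch of that behavior:

from monarch_py.api.config import oak

adapter_a = oak()  # first call: loads phenio.db and warms up the adapter
adapter_b = oak()  # cached: same OakImplementation instance, no re-initialization
assert adapter_a is adapter_b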
7 changes: 4 additions & 3 deletions backend/src/monarch_py/api/main.py
@@ -2,8 +2,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import RedirectResponse
-from monarch_py.api import association, entity, histopheno, search, semsim
-from monarch_py.api.config import semsimian
+from monarch_py.api import association, entity, histopheno, search, semsim, text_annotation
+from monarch_py.api.config import semsimian, oak
 from monarch_py.api.middleware.logging_middleware import LoggingMiddleware
 from monarch_py.service.curie_service import CurieService

@@ -18,7 +18,7 @@
 @app.on_event("startup")
 async def initialize_app():
     semsimian()
-    # Let the curie service singleton initialize itself
+    oak()
     CurieService()


@@ -27,6 +27,7 @@ async def initialize_app():
 app.include_router(histopheno.router, prefix=f"{PREFIX}/histopheno")
 app.include_router(search.router, prefix=PREFIX)
 app.include_router(semsim.router, prefix=f"{PREFIX}/semsim")
+app.include_router(text_annotation.router, prefix=PREFIX)

 # Allow CORS
 app.add_middleware(
19 changes: 19 additions & 0 deletions backend/src/monarch_py/api/text_annotation.py
@@ -0,0 +1,19 @@
from fastapi import APIRouter, Path  # , Depends, HTTPException, Query

from monarch_py.api.additional_models import TextAnnotationRequest
from monarch_py.api.config import oak

router = APIRouter(tags=["text_annotation"], responses={404: {"description": "Not Found"}})


@router.get("/annotate/{content}")
def _annotate(content: str = Path(title="The text content to annotate")):
    print(f"\n\nRunning oak annotate (GET):\n{content}\n")
    return oak().annotate_text(content)


@router.post("/annotate")
def _post_annotate(request: TextAnnotationRequest):
    print(f"\n\nRunning oak annotate (POST):\n{request.content}\n")
    # print(request.content.split("\n"))
    return oak().annotate_text(request.content)
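
A sketch of exercising the two endpoints with requests; the base URL below assumes a local dev server on port 8000 with PREFIX resolving to /v3/api (adjust both to your deployment):

import requests

BASE = "http://localhost:8000/v3/api"  # assumed local deployment; adjust as needed

# GET variant: the text travels in the URL path (requests percent-encodes it)
r = requests.get(f"{BASE}/annotate/Nystagmus, strabismus and fundus.")
print(r.json())

# POST variant: the text travels in a TextAnnotationRequest body
r = requests.post(f"{BASE}/annotate", json={"content": "Nystagmus, strabismus and fundus."})
print(r.json())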
75 changes: 75 additions & 0 deletions backend/src/monarch_py/api/utils/get_text_annotations.py
@@ -0,0 +1,75 @@
from monarch_py.api.config import oak
import re

phenio_adapter = oak().phenio_adapter


def annotate_text(text):
    # Split the text into sentences on ". " / "? ", skipping common abbreviation patterns
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    result = ""
    for sentence in sentences:
        entities = []
        for ann in phenio_adapter.annotate_text(sentence):  # type: ignore
            if len(ann.object_label) >= 4:
                element = [ann.subject_start, ann.subject_end, str(ann.object_label) + "," + str(ann.object_id)]
                if (get_word_length(sentence, ann.subject_start - 1) - len(ann.object_label)) < 2:
                    entities.append(element)
        try:
            # concatenate_ngram_entities raises IndexError when no entities were found
            entities.sort()
            entities = concatenate_same_entities(entities)
            entities = concatenate_ngram_entities(entities)
            replaced_text = replace_entities(sentence, entities)
            result += replaced_text + " "
        except IndexError as error:
            # No entities in this sentence; keep it unchanged
            result += sentence + " "
            print("Error occurred:", error)
    return result
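
The sentence splitter breaks on whitespace that follows "." or "?", while the two negative lookbehinds skip abbreviation shapes such as "e.g." and "Dr.". A quick check of the pattern:

import re

pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
text = "Patient has nystagmus. Strabismus suspected? See Dr. Smith, e.g. for fundoscopy."
print(re.split(pattern, text))
# ['Patient has nystagmus.', 'Strabismus suspected?', 'See Dr. Smith, e.g. for fundoscopy.']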


def get_word_length(text, start):
    word = ""
    index = start
    while index < len(text) and text[index].isalpha():
        word += text[index]
        index += 1
    return len(word)


def concatenate_same_entities(lst):
    result = {}
    for elem in lst:
        key = (elem[0], elem[1])
        if key in result:
            result[key] += "|" + elem[2]
        else:
            result[key] = elem[2]
    concatenated_list = [[key[0], key[1], value] for key, value in result.items()]
    return concatenated_list


def concatenate_ngram_entities(lst):
    merged_list = []
    start, end, text = lst[0]
    for element in lst[1:]:
        if element[0] <= end:  # Check if range overlaps
            end = max(end, element[1])  # Merge the range
            text += "|" + element[2]  # Concatenate the texts
        else:
            merged_list.append([start, end, text])  # Add the merged element to the result
            start, end, text = element  # Move to the next element
    merged_list.append([start, end, text])  # Add the last merged element
    return merged_list


def replace_entities(text, entities):
    replaced_text = text
    # Sort the entities in descending order of start character indices
    entities = sorted(entities, key=lambda x: x[0], reverse=True)
    for entity in entities:
        start, end = entity[0] - 1, entity[1]  # subject_start is 1-based; convert to 0-based slice
        entity_value = f'<span class="sciCrunchAnnotation" data-sciGraph="{entity[2]}">{text[start:end]}</span>'
        replaced_text = replaced_text[:start] + entity_value + replaced_text[end:]
    return replaced_text
89 changes: 89 additions & 0 deletions backend/src/monarch_py/implementations/oak/annotation_utils.py
@@ -0,0 +1,89 @@
"""
Utility functions for annotating text with OAK.
"""

import re
import json
from typing import List


def get_word_length(text, start):
    word = ""
    index = start
    while index < len(text) and text[index].isalpha():
        word += text[index]
        index += 1
    return len(word)


def concatenate_same_entities(lst):
    result = {}
    for elem in lst:
        key = (elem[0], elem[1])
        if key in result:
            result[key] += "|" + elem[2]
        else:
            result[key] = elem[2]
    concatenated_list = [[key[0], key[1], value] for key, value in result.items()]
    return concatenated_list
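
concatenate_same_entities collapses annotations that cover an identical character span, joining their "label,id" payloads with "|". A worked example (annotation values are illustrative):

entities = [
    [1, 9, "Nystagmus,HP:0000639"],
    [1, 9, "nystagmus,MP:0001516"],
    [15, 24, "strabismus,HP:0000486"],
]
print(concatenate_same_entities(entities))
# [[1, 9, 'Nystagmus,HP:0000639|nystagmus,MP:0001516'], [15, 24, 'strabismus,HP:0000486']]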


def concatenate_ngram_entities(lst):
    merged_list = []
    start, end, text = lst[0]
    for element in lst[1:]:
        if element[0] <= end:  # Check if range overlaps
            end = max(end, element[1])  # Merge the range
            text += "|" + element[2]  # Concatenate the texts
        else:
            merged_list.append([start, end, text])  # Add the merged element to the result
            start, end, text = element  # Move to the next element
    merged_list.append([start, end, text])  # Add the last merged element
    return merged_list
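
concatenate_ngram_entities expects a list sorted by start position and merges entries whose character ranges overlap, so overlapping n-gram matches collapse into one span; note that lst[0] raises the IndexError the callers catch when no entities were found. For example (illustrative values):

entities = [
    [1, 6, "label A,X:1"],
    [4, 10, "label B,X:2"],
    [12, 16, "label C,X:3"],
]
print(concatenate_ngram_entities(entities))
# [[1, 10, 'label A,X:1|label B,X:2'], [12, 16, 'label C,X:3']]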


def replace_entities(text, entities):
    replaced_text = text
    # Sort the entities in descending order of start character indices
    entities = sorted(entities, key=lambda x: x[0], reverse=True)
    for entity in entities:
        start, end = entity[0] - 1, entity[1]  # subject_start is 1-based; convert to 0-based slice
        entity_value = f'<span class="sciCrunchAnnotation" data-sciGraph="{entity[2]}">{text[start:end]}</span>'
        replaced_text = replaced_text[:start] + entity_value + replaced_text[end:]
    return replaced_text
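
replace_entities substitutes right-to-left (descending start offsets) so earlier replacements cannot shift the offsets of later ones. For example:

text = "Nystagmus and strabismus."
entities = [[1, 9, "Nystagmus,HP:0000639"]]
print(replace_entities(text, entities))
# <span class="sciCrunchAnnotation" data-sciGraph="Nystagmus,HP:0000639">Nystagmus</span> and strabismus.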


def convert_to_json(paragraphs: List[str]):
    result = []
    span_pattern = re.compile(r'<span class="sciCrunchAnnotation" data-sciGraph="([^"]+)">([^<]+)</span>')

    for paragraph in paragraphs:
        start_index = 0
        for match in span_pattern.finditer(paragraph):
            span_data = match.group(1)
            span_text = match.group(2)

            if start_index < match.start():
                non_span_text = paragraph[start_index : match.start()]
                result.append({"text": non_span_text})

            tokens = []
            for token_data in span_data.split("|"):
                token_parts = token_data.split(",")
                tokens.append({"id": token_parts[1], "name": token_parts[0]})

            result.append({"text": span_text, "tokens": tokens})
            start_index = match.end()

        if start_index < len(paragraph):
            non_span_text = paragraph[start_index:]
            result.append({"text": non_span_text})

        result.append({"text": "\n"})

    # Round-trip through JSON to ensure the structure is serializable
    api_response = json.dumps(result)
    data = json.loads(api_response)

    return data
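
convert_to_json splits each annotated paragraph around the span markup, emitting plain segments as {"text": ...} and annotated segments with a tokens list parsed from the data-sciGraph payload. For example:

annotated = ['<span class="sciCrunchAnnotation" data-sciGraph="Nystagmus,HP:0000639">Nystagmus</span> and more.']
print(convert_to_json(annotated))
# [{'text': 'Nystagmus', 'tokens': [{'id': 'HP:0000639', 'name': 'Nystagmus'}]},
#  {'text': ' and more.'},
#  {'text': '\n'}]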
75 changes: 71 additions & 4 deletions backend/src/monarch_py/implementations/oak/oak_implementation.py
@@ -1,15 +1,23 @@
+import re
 import time
 from dataclasses import dataclass
 from typing import List

+from loguru import logger
+
-from monarch_py.datamodels.model import TermSetPairwiseSimilarity
 from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
 from oaklib.selector import get_adapter
 from linkml_runtime.dumpers.json_dumper import JSONDumper
-from loguru import logger
 import pystow

+from monarch_py.datamodels.model import TermSetPairwiseSimilarity
+from monarch_py.implementations.oak.annotation_utils import (
+    get_word_length,
+    concatenate_same_entities,
+    concatenate_ngram_entities,
+    replace_entities,
+    convert_to_json,
+)


 @dataclass
 class OakImplementation(SemanticSimilarityInterface):
@@ -20,6 +28,36 @@ class OakImplementation(SemanticSimilarityInterface):
     default_predicates = ["rdfs:subClassOf", "BFO:0000050", "UPHENO:0000001"]

     default_phenio_db_url = "https://data.monarchinitiative.org/monarch-kg-dev/latest/phenio.db.gz"
+    phenio_adapter = None
+
+    def init_phenio_adapter(self, phenio_path: str = None, force_update: bool = False):
+        if self.phenio_adapter is None:
+            logger.info("Warming up phenio adapter")
+            start = time.time()
+            # self.phenio_adapter = get_adapter(f"sqlite:obo:phenio")
+
+            if phenio_path:
+                logger.debug(f"Creating phenio adapter using phenio_path at {phenio_path}")
+                self.phenio_adapter = get_adapter(f"sqlite:{phenio_path}")
+            else:
+                monarchstow = pystow.module("monarch")
+
+                with monarchstow.ensure_gunzip(
+                    "phenio", url=self.default_phenio_db_url, force=force_update
+                ) as stowed_phenio_path:
+                    logger.debug(f"Creating phenio adapter using pystow at {stowed_phenio_path}")
+                    self.phenio_adapter = get_adapter(f"sqlite:{stowed_phenio_path}")
+
+            # Run a query to get the adapter to initialize properly
+            logger.debug("Running query to initialize adapter")
+
+            # Run a little text annotation to get oak warmed up
+            print("Oaklib adapter warmup")
+            for ann in self.phenio_adapter.annotate_text("Nystagmus, strabismus and fundus."):
+                print(ann.subject_start, ann.subject_end, ann.object_id, ann.object_label)
+
+            logger.info(f"Phenio adapter ready, warmup time: {time.time() - start} sec")
+        return self

     def init_semsim(self, phenio_path: str = None, force_update: bool = False):
         if self.semsim is None:
@@ -57,7 +95,7 @@ def compare(
         predicates = predicates or self.default_predicates
         logger.debug(f"Comparing {subjects} to {objects} using {predicates}")
         compare_time = time.time()
-        response = self.semsim.termset_pairwise_similarity(
+        response = self.semsim.termset_pairwise_similarity(  # type: ignore
             subjects=subjects,
             objects=objects,
             predicates=predicates,
@@ -67,3 +105,32 @@

         response_dict = self.json_dumper.to_dict(response)
         return TermSetPairwiseSimilarity(**response_dict)
+
+    def annotate_text(self, text):
+        """Annotate text using OAK"""
+        paragraphs = text.split("\n")
+        paragraphs_annotated = []
+        for paragraph in paragraphs:
+            result = ""
+            sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", paragraph)
+            for sentence in sentences:
+                entities = []
+                for ann in self.phenio_adapter.annotate_text(sentence):  # type: ignore
+                    if len(ann.object_label) >= 4:
+                        element = [ann.subject_start, ann.subject_end, str(ann.object_label) + "," + str(ann.object_id)]
+                        if (get_word_length(sentence, ann.subject_start - 1) - len(ann.object_label)) < 2:
+                            entities.append(element)
+                try:
+                    entities.sort()
+                    entities = concatenate_same_entities(entities)
+                    entities = concatenate_ngram_entities(entities)
+                    replaced_text = replace_entities(sentence, entities)
+                    result += replaced_text + " "
+                except IndexError as error:
+                    # concatenate_ngram_entities raises IndexError when no entities were found
+                    result += sentence + " "
+                    print("Error occurred:", error)
+
+            paragraphs_annotated.append(result)
+        result = convert_to_json(paragraphs_annotated)
+        return result
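
A sketch of driving the implementation directly, outside the API; the first call downloads phenio.db through pystow, so warmup can take a while:

from monarch_py.implementations.oak.oak_implementation import OakImplementation

oak_impl = OakImplementation().init_phenio_adapter(force_update=False)  # returns self
annotations = oak_impl.annotate_text("Nystagmus, strabismus and fundus.")
for segment in annotations:
    print(segment)  # {'text': ..., 'tokens': [...]} for annotated spans, {'text': ...} otherwise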
2 changes: 1 addition & 1 deletion frontend/fixtures/index.ts
@@ -33,7 +33,7 @@ export const handlers = [
   http.get("*/autocomplete", () => HttpResponse.json(autocomplete)),

   /** text annotator */
-  http.post("*/nlp/annotate", () => HttpResponse.json(textAnnotator)),
+  http.post("*/annotate", () => HttpResponse.json(textAnnotator)),

   /** phenotype explorer */
   http.get("*/sim/search", () => HttpResponse.json(phenotypeExplorerSearch)),
