Bring back oak and phenio (text annotator) (#497)
Closes #234

---------

Co-authored-by: Kevin Schaper <[email protected]>
Co-authored-by: madanucd <[email protected]>
Co-authored-by: glass-ships <[email protected]>
4 people authored Dec 15, 2023
1 parent 8c90aaf commit 7c0ce2b
Showing 22 changed files with 2,976 additions and 3,562 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -131,7 +131,7 @@ data:
 .PHONY: dev-frontend
 dev-frontend: frontend/src/api/model.ts
 	cd frontend && \
-	yarn dev
+	VITE_API=local yarn dev


 .PHONY: dev-api
1,255 changes: 624 additions & 631 deletions backend/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/pyproject.toml
@@ -19,7 +19,7 @@ packages = [
 python = "^3.9"
 pydantic = "^1.10.2"
 curies = "<1"
-linkml = "^1.6.2"
+linkml = ">=1.6.3"
 prefixmaps = "^0.1.7"

 requests = "^2.28.1"
4 changes: 4 additions & 0 deletions backend/src/monarch_py/api/additional_models.py
@@ -32,3 +32,7 @@ class SemsimSearchRequest(BaseModel):
     termset: List[str] = Field(..., title="Termset to search")
     category: SemsimSearchCategory = Field(..., title="Category to search for")
     limit: Optional[int] = Field(10, title="Limit the number of results", ge=1, le=50)
+
+
+class TextAnnotationRequest(BaseModel):
+    content: str = Field(..., title="The text content to annotate")
13 changes: 13 additions & 0 deletions backend/src/monarch_py/api/config.py
@@ -6,6 +6,7 @@
 from pydantic import BaseSettings

 from monarch_py.implementations.solr.solr_implementation import SolrImplementation
+from monarch_py.implementations.oak.oak_implementation import OakImplementation
 from monarch_py.datamodels.model import TermSetPairwiseSimilarity, SemsimSearchResult


@@ -43,6 +44,11 @@ def convert_nans(input_dict, to_value=None):
 class SemsimianHTTPRequester:
     """A class that makes HTTP requests to the semsimian_server."""

+    def compare(self, subjects, objects):
+        host = f"http://{settings.semsimian_server_host}:{settings.semsimian_server_port}"
+        path = f"/compare/{','.join(subjects)}/{','.join(objects)}"
+        url = f"{host}/{path}"
+
     def convert_tsps_data(self, data):
         """Convert to a format that can be coerced into a TermSetPairwiseSimilarity model
@@ -104,3 +110,10 @@ def search(self, termset: List[str], prefix: str, limit: int):
 @lru_cache(maxsize=1)
 def semsimian():
     return SemsimianHTTPRequester()
+
+
+@lru_cache(maxsize=1)
+def oak():
+    oak_implementation = OakImplementation()
+    oak_implementation.init_phenio_adapter(force_update=False)
+    return oak_implementation
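
For reference: because oak() is wrapped in lru_cache(maxsize=1), the phenio download and warmup run once (at startup, via initialize_app), and every later call returns the same OakImplementation instance. A minimal sketch of that behavior:

from monarch_py.api.config import oak

adapter_a = oak()  # first call: loads phenio.db and warms up the adapter
adapter_b = oak()  # cached: same OakImplementation instance, no re-initialization
assert adapter_a is adapter_b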
7 changes: 4 additions & 3 deletions backend/src/monarch_py/api/main.py
@@ -2,8 +2,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import RedirectResponse
-from monarch_py.api import association, entity, histopheno, search, semsim
-from monarch_py.api.config import semsimian
+from monarch_py.api import association, entity, histopheno, search, semsim, text_annotation
+from monarch_py.api.config import semsimian, oak
 from monarch_py.api.middleware.logging_middleware import LoggingMiddleware
 from monarch_py.service.curie_service import CurieService

@@ -18,7 +18,7 @@
 @app.on_event("startup")
 async def initialize_app():
     semsimian()
-    # Let the curie service singleton initialize itself
+    oak()
     CurieService()


@@ -27,6 +27,7 @@ async def initialize_app():
 app.include_router(histopheno.router, prefix=f"{PREFIX}/histopheno")
 app.include_router(search.router, prefix=PREFIX)
 app.include_router(semsim.router, prefix=f"{PREFIX}/semsim")
+app.include_router(text_annotation.router, prefix=PREFIX)

 # Allow CORS
 app.add_middleware(
19 changes: 19 additions & 0 deletions backend/src/monarch_py/api/text_annotation.py
@@ -0,0 +1,19 @@
from fastapi import APIRouter, Path  # , Depends, HTTPException, Query

from monarch_py.api.additional_models import TextAnnotationRequest
from monarch_py.api.config import oak

router = APIRouter(tags=["text_annotation"], responses={404: {"description": "Not Found"}})


@router.get("/annotate/{content}")
def _annotate(content: str = Path(title="The text content to annotate")):
    print(f"\n\nRunning oak annotate (GET):\n{content}\n")
    return oak().annotate_text(content)


@router.post("/annotate")
def _post_annotate(request: TextAnnotationRequest):
    print(f"\n\nRunning oak annotate (POST):\n{request.content}\n")
    # print(request.content.split("\n"))
    return oak().annotate_text(request.content)
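
A sketch of exercising the two endpoints with requests; the base URL below assumes a local dev server on port 8000 with PREFIX resolving to /v3/api (adjust both to your deployment):

import requests

BASE = "http://localhost:8000/v3/api"  # assumed local deployment; adjust as needed

# GET variant: the text travels in the URL path (requests percent-encodes it)
r = requests.get(f"{BASE}/annotate/Nystagmus, strabismus and fundus.")
print(r.json())

# POST variant: the text travels in a TextAnnotationRequest body
r = requests.post(f"{BASE}/annotate", json={"content": "Nystagmus, strabismus and fundus."})
print(r.json())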
75 changes: 75 additions & 0 deletions backend/src/monarch_py/api/utils/get_text_annotations.py
@@ -0,0 +1,75 @@
from monarch_py.api.config import oak
import re

phenio_adapter = oak().phenio_adapter


def annotate_text(text):
    # Split the text into sentences on ". " / "? ", skipping common abbreviation patterns
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    result = ""
    for sentence in sentences:
        entities = []
        for ann in phenio_adapter.annotate_text(sentence):  # type: ignore
            if len(ann.object_label) >= 4:
                element = [ann.subject_start, ann.subject_end, str(ann.object_label) + "," + str(ann.object_id)]
                if (get_word_length(sentence, ann.subject_start - 1) - len(ann.object_label)) < 2:
                    entities.append(element)
        try:
            # concatenate_ngram_entities raises IndexError when no entities were found
            entities.sort()
            entities = concatenate_same_entities(entities)
            entities = concatenate_ngram_entities(entities)
            replaced_text = replace_entities(sentence, entities)
            result += replaced_text + " "
        except IndexError as error:
            # No entities in this sentence; keep it unchanged
            result += sentence + " "
            print("Error occurred:", error)
    return result
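
The sentence splitter breaks on whitespace that follows "." or "?", while the two negative lookbehinds skip abbreviation shapes such as "e.g." and "Dr.". A quick check of the pattern:

import re

pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
text = "Patient has nystagmus. Strabismus suspected? See Dr. Smith, e.g. for fundoscopy."
print(re.split(pattern, text))
# ['Patient has nystagmus.', 'Strabismus suspected?', 'See Dr. Smith, e.g. for fundoscopy.']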


def get_word_length(text, start):
    word = ""
    index = start
    while index < len(text) and text[index].isalpha():
        word += text[index]
        index += 1
    return len(word)


def concatenate_same_entities(lst):
    result = {}
    for elem in lst:
        key = (elem[0], elem[1])
        if key in result:
            result[key] += "|" + elem[2]
        else:
            result[key] = elem[2]
    concatenated_list = [[key[0], key[1], value] for key, value in result.items()]
    return concatenated_list


def concatenate_ngram_entities(lst):
    merged_list = []
    start, end, text = lst[0]
    for element in lst[1:]:
        if element[0] <= end:  # Check if range overlaps
            end = max(end, element[1])  # Merge the range
            text += "|" + element[2]  # Concatenate the texts
        else:
            merged_list.append([start, end, text])  # Add the merged element to the result
            start, end, text = element  # Move to the next element
    merged_list.append([start, end, text])  # Add the last merged element
    return merged_list


def replace_entities(text, entities):
    replaced_text = text
    # Sort the entities in descending order of start character indices
    entities = sorted(entities, key=lambda x: x[0], reverse=True)
    for entity in entities:
        start, end = entity[0] - 1, entity[1]  # subject_start is 1-based; convert to 0-based slice
        entity_value = f'<span class="sciCrunchAnnotation" data-sciGraph="{entity[2]}">{text[start:end]}</span>'
        replaced_text = replaced_text[:start] + entity_value + replaced_text[end:]
    return replaced_text
89 changes: 89 additions & 0 deletions backend/src/monarch_py/implementations/oak/annotation_utils.py
@@ -0,0 +1,89 @@
"""
Utility functions for annotating text with OAK.
"""

import re
import json
from typing import List


def get_word_length(text, start):
    word = ""
    index = start
    while index < len(text) and text[index].isalpha():
        word += text[index]
        index += 1
    return len(word)


def concatenate_same_entities(lst):
    result = {}
    for elem in lst:
        key = (elem[0], elem[1])
        if key in result:
            result[key] += "|" + elem[2]
        else:
            result[key] = elem[2]
    concatenated_list = [[key[0], key[1], value] for key, value in result.items()]
    return concatenated_list
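
concatenate_same_entities collapses annotations that cover an identical character span, joining their "label,id" payloads with "|". A worked example (annotation values are illustrative):

entities = [
    [1, 9, "Nystagmus,HP:0000639"],
    [1, 9, "nystagmus,MP:0001516"],
    [15, 24, "strabismus,HP:0000486"],
]
print(concatenate_same_entities(entities))
# [[1, 9, 'Nystagmus,HP:0000639|nystagmus,MP:0001516'], [15, 24, 'strabismus,HP:0000486']]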


def concatenate_ngram_entities(lst):
    merged_list = []
    start, end, text = lst[0]
    for element in lst[1:]:
        if element[0] <= end:  # Check if range overlaps
            end = max(end, element[1])  # Merge the range
            text += "|" + element[2]  # Concatenate the texts
        else:
            merged_list.append([start, end, text])  # Add the merged element to the result
            start, end, text = element  # Move to the next element
    merged_list.append([start, end, text])  # Add the last merged element
    return merged_list
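
concatenate_ngram_entities expects a list sorted by start position and merges entries whose character ranges overlap, so overlapping n-gram matches collapse into one span; note that lst[0] raises the IndexError the callers catch when no entities were found. For example (illustrative values):

entities = [
    [1, 6, "label A,X:1"],
    [4, 10, "label B,X:2"],
    [12, 16, "label C,X:3"],
]
print(concatenate_ngram_entities(entities))
# [[1, 10, 'label A,X:1|label B,X:2'], [12, 16, 'label C,X:3']]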


def replace_entities(text, entities):
    replaced_text = text
    # Sort the entities in descending order of start character indices
    entities = sorted(entities, key=lambda x: x[0], reverse=True)
    for entity in entities:
        start, end = entity[0] - 1, entity[1]  # subject_start is 1-based; convert to 0-based slice
        entity_value = f'<span class="sciCrunchAnnotation" data-sciGraph="{entity[2]}">{text[start:end]}</span>'
        replaced_text = replaced_text[:start] + entity_value + replaced_text[end:]
    return replaced_text
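
replace_entities substitutes right-to-left (descending start offsets) so earlier replacements cannot shift the offsets of later ones. For example:

text = "Nystagmus and strabismus."
entities = [[1, 9, "Nystagmus,HP:0000639"]]
print(replace_entities(text, entities))
# <span class="sciCrunchAnnotation" data-sciGraph="Nystagmus,HP:0000639">Nystagmus</span> and strabismus.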


def convert_to_json(paragraphs: List[str]):
    result = []
    span_pattern = re.compile(r'<span class="sciCrunchAnnotation" data-sciGraph="([^"]+)">([^<]+)</span>')

    for paragraph in paragraphs:
        start_index = 0
        for match in span_pattern.finditer(paragraph):
            span_data = match.group(1)
            span_text = match.group(2)

            if start_index < match.start():
                non_span_text = paragraph[start_index : match.start()]
                result.append({"text": non_span_text})

            tokens = []
            for token_data in span_data.split("|"):
                token_parts = token_data.split(",")
                tokens.append({"id": token_parts[1], "name": token_parts[0]})

            result.append({"text": span_text, "tokens": tokens})
            start_index = match.end()

        if start_index < len(paragraph):
            non_span_text = paragraph[start_index:]
            result.append({"text": non_span_text})

        result.append({"text": "\n"})

    # Round-trip through JSON to ensure the structure is serializable
    api_response = json.dumps(result)
    data = json.loads(api_response)

    return data
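
convert_to_json splits each annotated paragraph around the span markup, emitting plain segments as {"text": ...} and annotated segments with a tokens list parsed from the data-sciGraph payload. For example:

annotated = ['<span class="sciCrunchAnnotation" data-sciGraph="Nystagmus,HP:0000639">Nystagmus</span> and more.']
print(convert_to_json(annotated))
# [{'text': 'Nystagmus', 'tokens': [{'id': 'HP:0000639', 'name': 'Nystagmus'}]},
#  {'text': ' and more.'},
#  {'text': '\n'}]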
75 changes: 71 additions & 4 deletions backend/src/monarch_py/implementations/oak/oak_implementation.py
@@ -1,15 +1,23 @@
+import re
 import time
 from dataclasses import dataclass
 from typing import List

+from loguru import logger
+
-from monarch_py.datamodels.model import TermSetPairwiseSimilarity
 from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface
 from oaklib.selector import get_adapter
 from linkml_runtime.dumpers.json_dumper import JSONDumper
-from loguru import logger
 import pystow

+from monarch_py.datamodels.model import TermSetPairwiseSimilarity
+from monarch_py.implementations.oak.annotation_utils import (
+    get_word_length,
+    concatenate_same_entities,
+    concatenate_ngram_entities,
+    replace_entities,
+    convert_to_json,
+)


 @dataclass
 class OakImplementation(SemanticSimilarityInterface):
@@ -20,6 +28,36 @@ class OakImplementation(SemanticSimilarityInterface):
     default_predicates = ["rdfs:subClassOf", "BFO:0000050", "UPHENO:0000001"]

     default_phenio_db_url = "https://data.monarchinitiative.org/monarch-kg-dev/latest/phenio.db.gz"
+    phenio_adapter = None
+
+    def init_phenio_adapter(self, phenio_path: str = None, force_update: bool = False):
+        if self.phenio_adapter is None:
+            logger.info("Warming up phenio adapter")
+            start = time.time()
+            # self.phenio_adapter = get_adapter(f"sqlite:obo:phenio")
+
+            if phenio_path:
+                logger.debug(f"Creating phenio adapter using phenio_path at {phenio_path}")
+                self.phenio_adapter = get_adapter(f"sqlite:{phenio_path}")
+            else:
+                monarchstow = pystow.module("monarch")
+
+                with monarchstow.ensure_gunzip(
+                    "phenio", url=self.default_phenio_db_url, force=force_update
+                ) as stowed_phenio_path:
+                    logger.debug(f"Creating phenio adapter using pystow at {stowed_phenio_path}")
+                    self.phenio_adapter = get_adapter(f"sqlite:{stowed_phenio_path}")
+
+            # Run a query to get the adapter to initialize properly
+            logger.debug("Running query to initialize adapter")
+
+            # Run a little text annotation to get oak warmed up
+            print("Oaklib adapter warmup")
+            for ann in self.phenio_adapter.annotate_text("Nystagmus, strabismus and fundus."):
+                print(ann.subject_start, ann.subject_end, ann.object_id, ann.object_label)
+
+            logger.info(f"Phenio adapter ready, warmup time: {time.time() - start} sec")
+        return self

     def init_semsim(self, phenio_path: str = None, force_update: bool = False):
         if self.semsim is None:
@@ -57,7 +95,7 @@ def compare(
         predicates = predicates or self.default_predicates
         logger.debug(f"Comparing {subjects} to {objects} using {predicates}")
         compare_time = time.time()
-        response = self.semsim.termset_pairwise_similarity(
+        response = self.semsim.termset_pairwise_similarity(  # type: ignore
             subjects=subjects,
             objects=objects,
             predicates=predicates,
@@ -67,3 +105,32 @@

         response_dict = self.json_dumper.to_dict(response)
         return TermSetPairwiseSimilarity(**response_dict)
+
+    def annotate_text(self, text):
+        """Annotate text using OAK"""
+        paragraphs = text.split("\n")
+        paragraphs_annotated = []
+        for paragraph in paragraphs:
+            result = ""
+            sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", paragraph)
+            for sentence in sentences:
+                entities = []
+                for ann in self.phenio_adapter.annotate_text(sentence):  # type: ignore
+                    if len(ann.object_label) >= 4:
+                        element = [ann.subject_start, ann.subject_end, str(ann.object_label) + "," + str(ann.object_id)]
+                        if (get_word_length(sentence, ann.subject_start - 1) - len(ann.object_label)) < 2:
+                            entities.append(element)
+                try:
+                    entities.sort()
+                    entities = concatenate_same_entities(entities)
+                    entities = concatenate_ngram_entities(entities)
+                    replaced_text = replace_entities(sentence, entities)
+                    result += replaced_text + " "
+                except IndexError as error:
+                    # concatenate_ngram_entities raises IndexError when no entities were found
+                    result += sentence + " "
+                    print("Error occurred:", error)
+
+            paragraphs_annotated.append(result)
+        result = convert_to_json(paragraphs_annotated)
+        return result
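
A sketch of driving the implementation directly, outside the API; the first call downloads phenio.db through pystow, so warmup can take a while:

from monarch_py.implementations.oak.oak_implementation import OakImplementation

oak_impl = OakImplementation().init_phenio_adapter(force_update=False)  # returns self
annotations = oak_impl.annotate_text("Nystagmus, strabismus and fundus.")
for segment in annotations:
    print(segment)  # {'text': ..., 'tokens': [...]} for annotated spans, {'text': ...} otherwise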
2 changes: 1 addition & 1 deletion frontend/fixtures/index.ts
@@ -33,7 +33,7 @@ export const handlers = [
   http.get("*/autocomplete", () => HttpResponse.json(autocomplete)),

   /** text annotator */
-  http.post("*/nlp/annotate", () => HttpResponse.json(textAnnotator)),
+  http.post("*/annotate", () => HttpResponse.json(textAnnotator)),

   /** phenotype explorer */
   http.get("*/sim/search", () => HttpResponse.json(phenotypeExplorerSearch)),
