-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Bring back oak and phenio (text annotator) (#497)
Closes #234 --------- Co-authored-by: Kevin Schaper <[email protected]> Co-authored-by: madanucd <[email protected]> Co-authored-by: glass-ships <[email protected]>
- Loading branch information
1 parent
8c90aaf
commit 7c0ce2b
Showing
22 changed files
with
2,976 additions
and
3,562 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from fastapi import APIRouter, Path # , Depends, HTTPException, Query | ||
|
||
from monarch_py.api.additional_models import TextAnnotationRequest | ||
from monarch_py.api.config import oak | ||
|
||
# Router collecting the text-annotation endpoints defined below; it is tagged
# "text_annotation" and documents a 404 "Not Found" response for its routes.
router = APIRouter(tags=["text_annotation"], responses={404: {"description": "Not Found"}})
|
||
|
||
@router.get("/annotate/{content}")
def _annotate(content: str = Path(title="The text content to annotate")):
    """Annotate text passed as the ``content`` path parameter.

    The incoming text is echoed to stdout and then handed to the
    ``annotate_text`` method of the object returned by ``oak()``; whatever
    that method returns is sent back as the response.
    """
    print(f"\n\nRunning oak annotate (GET):\n{content}\n")
    annotator = oak()
    return annotator.annotate_text(content)
|
||
|
||
@router.post("/annotate")
def _post_annotate(request: TextAnnotationRequest):
    """Annotate the ``content`` field of a POSTed ``TextAnnotationRequest``.

    The text is echoed to stdout and then handed to the ``annotate_text``
    method of the object returned by ``oak()``; its result is returned
    unchanged.
    """
    content = request.content
    print(f"\n\nRunning oak annotate (POST):\n{content}\n")
    annotator = oak()
    return annotator.annotate_text(content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from monarch_py.api.config import oak | ||
import re | ||
|
||
# Adapter used by annotate_text() below. It is looked up once, when this
# module is imported, so importing the module calls oak().
phenio_adapter = oak().phenio_adapter
|
||
|
||
def annotate_text(text):
    """Annotate ``text`` sentence by sentence and wrap matches in HTML spans.

    The text is split at whitespace that follows a ``.`` or ``?``, except
    where one of the negative look-behinds matches (tokens shaped like
    ``e.g.`` or ``Dr.``).  Each sentence is passed to
    ``phenio_adapter.annotate_text``; annotations are kept when their label
    has at least four characters and the word found at the annotation start
    is less than two characters longer than the label.  Kept annotations are
    sorted, merged (identical spans first, then overlapping spans) and
    substituted into the sentence by ``replace_entities``.

    Args:
        text: The raw text to annotate.

    Returns:
        The processed sentences concatenated in order, each followed by a
        single space.  Sentences without any kept annotation are emitted
        unchanged.
    """
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    annotated = []
    for sentence in sentences:
        entities = []
        for ann in phenio_adapter.annotate_text(sentence):  # type: ignore
            label = ann.object_label
            # Guard against annotations that carry no label at all, then
            # drop very short labels.
            if label is None or len(label) < 4:
                continue
            # subject_start is shifted by one before indexing the sentence,
            # mirroring the shift applied in replace_entities().
            matched_word_length = get_word_length(sentence, ann.subject_start - 1)
            if matched_word_length - len(label) < 2:
                entities.append([ann.subject_start, ann.subject_end, str(label) + "," + str(ann.object_id)])

        if not entities:
            # Nothing to substitute: keep the sentence verbatim instead of
            # relying on an IndexError raised by the merge helpers.
            annotated.append(sentence)
            continue

        entities.sort()
        entities = concatenate_same_entities(entities)
        entities = concatenate_ngram_entities(entities)
        annotated.append(replace_entities(sentence, entities))

    # Every sentence, annotated or not, is followed by exactly one space.
    return "".join(piece + " " for piece in annotated)
|
||
|
||
def get_word_length(text, start):
    """Return the number of consecutive alphabetic characters from ``text[start]``.

    Scanning stops at the first non-alphabetic character or at the end of
    ``text``; the result is 0 when ``start`` is at or past the end.
    """
    position = start
    limit = len(text)
    while position < limit and text[position].isalpha():
        position += 1
    return position - start
|
||
|
||
def concatenate_same_entities(lst):
    """Combine entries that share the same ``(start, end)`` span.

    Each entry is indexed as ``[start, end, label]``.  Labels of entries with
    an identical span are joined with ``"|"`` in encounter order, and spans
    appear in the output in the order they were first seen.
    """
    labels_by_span = {}
    for entry in lst:
        span = (entry[0], entry[1])
        if span not in labels_by_span:
            labels_by_span[span] = entry[2]
        else:
            labels_by_span[span] = labels_by_span[span] + "|" + entry[2]
    return [[begin, stop, label] for (begin, stop), label in labels_by_span.items()]
|
||
|
||
def concatenate_ngram_entities(lst):
    """Merge entries whose spans overlap or touch into a single entry.

    Args:
        lst: Entries of the form ``[start, end, label]``.  The merge only
            compares each entry with the running span, so the entries are
            expected to be ordered by start position already.

    Returns:
        A new list of ``[start, end, label]`` entries.  An entry whose start
        is less than or equal to the running end is folded into it: the end
        becomes the larger of the two and the labels are joined with ``"|"``.
        An empty input yields an empty list.
    """
    if not lst:
        # Previously this raised IndexError on lst[0]; nothing to merge.
        return []

    merged_list = []
    current_start, current_end, current_label = lst[0]
    for element in lst[1:]:
        if element[0] <= current_end:
            # Overlapping or adjacent: extend the running span.
            current_end = max(current_end, element[1])
            current_label += "|" + element[2]
        else:
            merged_list.append([current_start, current_end, current_label])
            current_start, current_end, current_label = element
    merged_list.append([current_start, current_end, current_label])
    return merged_list
|
||
|
||
def replace_entities(text, entities):
    """Wrap each entity's span of ``text`` in a ``sciCrunchAnnotation`` span tag.

    Each entity is indexed as ``[start, end, data]``; one is subtracted from
    ``start`` before slicing and ``end`` is used as the exclusive slice end.
    Entities are applied from the highest start position to the lowest so
    that earlier offsets stay valid while the string grows.
    """
    by_start_desc = list(entities)
    by_start_desc.sort(key=lambda item: item[0], reverse=True)

    output = text
    for item in by_start_desc:
        begin = item[0] - 1
        stop = item[1]
        wrapped = f'<span class="sciCrunchAnnotation" data-sciGraph="{item[2]}">{text[begin:stop]}</span>'
        output = output[:begin] + wrapped + output[stop:]
    return output
89 changes: 89 additions & 0 deletions
89
backend/src/monarch_py/implementations/oak/annotation_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
""" | ||
Utility functions for annotating text with OAK. | ||
""" | ||
|
||
import re | ||
import json | ||
from typing import List | ||
|
||
|
||
def get_word_length(text, start):
    """Count the alphabetic characters that run uninterrupted from ``text[start]``.

    The count ends at the first non-alphabetic character or at the end of
    ``text``, and is 0 when ``start`` is at or beyond the end.
    """
    count = 0
    for index in range(start, len(text)):
        if not text[index].isalpha():
            break
        count += 1
    return count
|
||
|
||
def concatenate_same_entities(lst):
    """Collapse entries with an identical ``(start, end)`` span into one.

    Entries are indexed as ``[start, end, label]``.  When a span repeats, its
    label is appended to the existing one after a ``"|"`` separator.  The
    output keeps spans in first-seen order.
    """
    combined = {}
    for entry in lst:
        span = (entry[0], entry[1])
        try:
            combined[span] += "|" + entry[2]
        except KeyError:
            # First time this span is seen.
            combined[span] = entry[2]
    return [[span[0], span[1], label] for span, label in combined.items()]
|
||
|
||
def concatenate_ngram_entities(lst):
    """Merge entries whose spans overlap or touch into a single entry.

    Args:
        lst: Entries of the form ``[start, end, label]``.  Each entry is only
            compared with the most recently emitted span, so the entries are
            expected to be ordered by start position already.

    Returns:
        A new list of ``[start, end, label]`` entries.  An entry whose start
        is less than or equal to the previous span's end is folded into it:
        the end becomes the larger of the two and the labels are joined with
        ``"|"``.  An empty input yields an empty list instead of raising
        ``IndexError``.
    """
    merged_list = []
    for start, end, label in lst:
        if merged_list and start <= merged_list[-1][1]:
            # Overlapping or adjacent: extend the span emitted last.
            previous = merged_list[-1]
            previous[1] = max(previous[1], end)
            previous[2] += "|" + label
        else:
            # Fresh list, so the caller's entries are never mutated.
            merged_list.append([start, end, label])
    return merged_list
|
||
|
||
def replace_entities(text, entities):
    """Return ``text`` with every entity span wrapped in an annotation ``<span>``.

    An entity is indexed as ``[start, end, data]``.  The slice replaced is
    ``text[start - 1:end]`` and ``data`` is written into the tag's
    ``data-sciGraph`` attribute.  Processing runs from the highest start to
    the lowest so that offsets of entities not yet processed remain valid.
    """
    result = text
    for entity in sorted(entities, key=lambda candidate: candidate[0], reverse=True):
        lower, upper = entity[0] - 1, entity[1]
        prefix = result[:lower]
        suffix = result[upper:]
        markup = f'<span class="sciCrunchAnnotation" data-sciGraph="{entity[2]}">{text[lower:upper]}</span>'
        result = prefix + markup + suffix
    return result
|
||
|
||
def convert_to_json(paragraphs: List[str]):
    """Convert span-annotated paragraphs into a flat list of JSON-ready dicts.

    Each paragraph is scanned for ``<span class="sciCrunchAnnotation"
    data-sciGraph="...">...</span>`` elements.  Text outside the spans
    becomes ``{"text": ...}`` entries; each span becomes
    ``{"text": <span text>, "tokens": [{"id": ..., "name": ...}, ...]}``,
    where the ``data-sciGraph`` value is split on ``"|"`` into tokens and each
    token is split into name and id at its last comma.  A ``{"text": "\\n"}``
    entry is appended after every paragraph.

    Args:
        paragraphs: Paragraph strings that may contain annotation spans.

    Returns:
        A list of dicts containing only strings and lists, in document order.

    Raises:
        IndexError: If a token inside ``data-sciGraph`` contains no comma.
    """
    result = []
    span_pattern = re.compile(r'<span class="sciCrunchAnnotation" data-sciGraph="([^"]+)">([^<]+)</span>')

    for paragraph in paragraphs:
        cursor = 0
        for match in span_pattern.finditer(paragraph):
            if cursor < match.start():
                # Plain text between the previous span (or start) and this one.
                result.append({"text": paragraph[cursor : match.start()]})

            tokens = []
            for token_data in match.group(1).split("|"):
                # Tokens are written as "<label>,<id>"; split on the LAST
                # comma so that labels containing commas stay intact.
                token_parts = token_data.rsplit(",", 1)
                tokens.append({"id": token_parts[1], "name": token_parts[0]})

            result.append({"text": match.group(2), "tokens": tokens})
            cursor = match.end()

        if cursor < len(paragraph):
            # Trailing plain text after the last span.
            result.append({"text": paragraph[cursor:]})

        result.append({"text": "\n"})

    # The structure already consists solely of str/list/dict values, so the
    # former json.dumps()/json.loads() round-trip was a no-op and is omitted.
    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.