From 01c40996df1b1ee07d711a48c600c1445b812311 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 21 May 2024 13:38:25 -0400 Subject: [PATCH 1/2] Remove unused DATAMODELS constant --- src/ontogpt/engines/knowledge_engine.py | 26 ++++++------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/ontogpt/engines/knowledge_engine.py b/src/ontogpt/engines/knowledge_engine.py index 70c1633c5..a7feebc2b 100644 --- a/src/ontogpt/engines/knowledge_engine.py +++ b/src/ontogpt/engines/knowledge_engine.py @@ -1,4 +1,5 @@ """Main Knowledge Extractor class.""" + import logging import re from abc import ABC @@ -41,11 +42,13 @@ # if it's not installed try: from ontogpt.clients import OpenAIClient, GPT4AllClient + CLIENT_TYPES = Union[OpenAIClient, GPT4AllClient] except ImportError: logger.warning("GPT4All client not available. GPT4All support will be disabled.") from ontogpt.clients import OpenAIClient - CLIENT_TYPES = OpenAIClient # type: ignore + + CLIENT_TYPES = OpenAIClient # type: ignore # annotation metamodel ANNOTATION_KEY_PROMPT = "prompt" @@ -54,24 +57,6 @@ ANNOTATION_KEY_RECURSE = "ner.recurse" ANNOTATION_KEY_EXAMPLES = "prompt.examples" -# TODO: introspect -# TODO: move this to its own module -DATAMODELS = [ - "biological_process.BiologicalProcess", - "biotic_interaction.BioticInteraction", - "cell_type.CellTypeDocument", - "ctd.ChemicalToDiseaseDocument", - "diagnostic_procedure.DiagnosticProceduretoPhenotypeAssociation", - "drug.DrugMechanism", - "environmental_sample.Study", - "gocam.GoCamAnnotations", - "mendelian_disease.MendelianDisease", - "phenotype.Trait", - "reaction.Reaction", - "recipe.Recipe", - "treatment.DiseaseTreatmentSummary", -] - def chunk_text(text: str, window_size=3) -> Iterator[str]: """Chunk text into windows of sentences.""" @@ -152,7 +137,8 @@ class KnowledgeEngine(ABC): """Min proportion of overlap in characters between text and grounding. 
TODO: use tokenization""" named_entities: List[NamedEntity] = field(default_factory=list) - """Cache of all named entities""" + """Cache of all named entities. This is not written to output directly as each input + has its own corresponding named entities.""" auto_prefix: str = "" """If set then non-normalized named entities will be mapped to this prefix""" From f4ad7b9245288805eb47ffdbd637d0c7424455bd Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 21 May 2024 14:05:03 -0400 Subject: [PATCH 2/2] Add extracted named entities buffer --- src/ontogpt/engines/knowledge_engine.py | 12 ++++++++++-- src/ontogpt/engines/spires_engine.py | 7 ++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/ontogpt/engines/knowledge_engine.py b/src/ontogpt/engines/knowledge_engine.py index a7feebc2b..d9d571d10 100644 --- a/src/ontogpt/engines/knowledge_engine.py +++ b/src/ontogpt/engines/knowledge_engine.py @@ -140,6 +140,9 @@ class KnowledgeEngine(ABC): """Cache of all named entities. 
This is not written to output directly as each input has its own corresponding named entities.""" + extracted_named_entities: List[NamedEntity] = field(default_factory=list) + """Temporary cache of named entities, to be cleared between extractions.""" + auto_prefix: str = "" """If set then non-normalized named entities will be mapped to this prefix""" @@ -335,14 +338,18 @@ def normalize_named_entity(self, text: str, range: ElementName) -> str: logger.info(f"Grounding {text} to {obj_id}; next step is to normalize") for normalized_id in self.normalize_identifier(obj_id, cls): if not any(e for e in self.named_entities if e.id == normalized_id): - self.named_entities.append(NamedEntity(id=normalized_id, label=text)) + ne = NamedEntity(id=normalized_id, label=text) + self.named_entities.append(ne) + self.extracted_named_entities.append(ne) logger.info(f"Normalized {text} with {obj_id} to {normalized_id}") return normalized_id logger.info(f"Could not ground and normalize {text} to {cls.name}") if self.auto_prefix: obj_id = f"{self.auto_prefix}:{quote(text)}" if not any(e for e in self.named_entities if e.id == obj_id): - self.named_entities.append(NamedEntity(id=obj_id, label=text)) + ne = NamedEntity(id=obj_id, label=text) + self.named_entities.append(ne) + self.extracted_named_entities.append(ne) else: obj_id = text if ANNOTATION_KEY_RECURSE in cls.annotations: @@ -356,6 +363,7 @@ def normalize_named_entity(self, text: str, range: ElementName) -> str: except ValueError as e: logger.error(f"No id for {obj} {e}") self.named_entities.append(obj) + self.extracted_named_entities.append(obj) return obj_id def is_valid_identifier(self, input_id: str, cls: ClassDefinition) -> bool: diff --git a/src/ontogpt/engines/spires_engine.py b/src/ontogpt/engines/spires_engine.py index eec3875e1..fa0eb620e 100644 --- a/src/ontogpt/engines/spires_engine.py +++ b/src/ontogpt/engines/spires_engine.py @@ -69,6 +69,8 @@ def extract_from_text( :param object: optional stub object 
:return: """ + self.extracted_named_entities = [] # Clear the named entity buffer + if self.sentences_per_window: chunks = chunk_text(text, self.sentences_per_window) extracted_object = None @@ -95,12 +97,15 @@ def extract_from_text( extracted_object = self.parse_completion_payload( raw_text, cls, object=object # type: ignore ) + return ExtractionResult( input_text=text, raw_completion_output=raw_text, prompt=self.last_prompt, extracted_object=extracted_object, - named_entities=self.named_entities, + named_entities=self.extracted_named_entities, + # Note these are the named entities from the last extraction, + # not the full list of all named entities across all extractions ) def _extract_from_text_to_dict(self, text: str, cls: ClassDefinition = None) -> RESPONSE_DICT: