Refactor relation extraction logic and handle isolated entities

AuvaLab · Jul 14, 2024 · 346815b · 346815b
1 parent 97869fc
commit 346815b
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -14,16 +14,22 @@ pip install itext2kg
 
 The ```iText2KG``` package consists of four main modules that work together to construct and visualize knowledge graphs from unstructured text. The overall architecture is outlined below:
 
-1. **Document Distiller**: This module processes raw documents and reformulates them into semantic blocks based on a user-defined schema. It improves the signal-to-noise ratio by focusing on relevant information and structuring it in a predefined format.
+1. **Document Distiller**: This module processes raw documents and reformulates them into semantic blocks based on a user-defined schema. It improves the signal-to-noise ratio by focusing on relevant information and structuring it in a predefined format. 
 
 2. **Incremental Entity Extractor**: This module extracts unique entities from the semantic blocks and resolves ambiguities to ensure each entity is clearly defined. It uses cosine similarity measures to match local entities with global entities.
 
-3. **Incremental Relation Extractor**: This module identifies relationships between the extracted entities. It can operate in two modes: using global entities to enrich the graph with potential information or using local entities for more precise relationships.
+
+
+3. **Incremental Relation Extractor**: This module identifies relationships between the extracted entities. It can operate in two modes: using global entities to enrich the graph with potential information or using local entities for more precise relationships. 
 
 4. **Graph Integrator and Visualization**: This module integrates the extracted entities and relationships into a Neo4j database, providing a visual representation of the knowledge graph. It allows for interactive exploration and analysis of the structured data.
 
 ![itext2kg](./docs/itext2kg.png)
 
+The LLM is prompted to extract entities that each represent one unique concept, which avoids semantically mixed entities. The following figure presents the entity and relation extraction prompts, built with the LangChain JSON Parser. They are categorized as follows: Blue - prompts automatically formatted by LangChain; Regular - prompts we have designed; and Italic - prompts specifically designed for entity and relation extraction. Panel (a) shows the prompts for relation extraction, and panel (b) shows the prompts for entity extraction.
+
+![prompts](./docs/prompts.jpg)
+
 ## Modules and Examples
 
 The Document Distiller module reformulates raw documents into predefined and semantic blocks using LLMs. It utilizes a schema to guide the extraction of specific information from each document.
@@ -81,6 +87,7 @@ class Article(BaseModel):
 
 ```
 
+
 ### The ```iText2KG```
 The iText2KG module is the core component of the package, responsible for integrating various functionalities to construct the knowledge graph. It uses the distilled semantic sections from documents to extract entities and relationships, and then builds the knowledge graph incrementally. 
 

diff --git a/itext2kg/graph_integration/itext2kg.py b/itext2kg/graph_integration/itext2kg.py
@@ -42,19 +42,36 @@ def extract_relations_for_all_sections(self, sections:List[str], entities, rel_t
 
         relations_with_isolated_entities = self.data_handler.find_relations_with_isolated_entities(global_entities=entities, relations=global_relationships)
         if relations_with_isolated_entities:
-            corrected_relations = self.irelations_extractor.extract_relations_for_isolated_entities(context=sections[0], entities=entities, relations_with_isolated_entities=relations_with_isolated_entities)
+            corrected_relations = self.irelations_extractor.correct_relations_for_isolated_entities(context=sections[0], entities=entities, relations_with_isolated_entities=relations_with_isolated_entities)
             global_relationships = [rel for rel in global_relationships if rel not in relations_with_isolated_entities] + [corrected_relations]
+
+
+        isolated_entities = self.data_handler.find_isolated_entities(global_entities=entities, relations=global_relationships)
+        if isolated_entities:
+            corrected_relations = self.irelations_extractor.extract_relations_for_isolated_entities(context=sections[0], isolated_entities=isolated_entities)
+            global_relationships.extend(corrected_relations)
 
+
         for i in range(1, len(sections)):
             print("[INFO] Extracting Relations from the Document", i+1)
             entities = self.irelations_extractor.extract_relations(context= sections[i], entities=entities)
             processed_relationships, global_relationships_ = self.matcher.process_lists(list1 = entities, list2=global_relationships, for_entity_or_relation="relation", threshold = rel_threshold)
 
+            print("proce", processed_relationships)
+
             relations_with_isolated_entities = self.data_handler.find_relations_with_isolated_entities(global_entities=entities, relations=processed_relationships)
             if relations_with_isolated_entities:
-                corrected_relations = self.irelations_extractor.extract_relations_for_isolated_entities(context=sections[i], entities=entities, relations_with_isolated_entities=relations_with_isolated_entities)
+                corrected_relations = self.irelations_extractor.correct_relations_for_isolated_entities(context=sections[i], entities=entities, relations_with_isolated_entities=relations_with_isolated_entities)
                 processed_relationships = [rel for rel in processed_relationships if rel not in relations_with_isolated_entities] + [corrected_relations]
-
+
+                print("first case corrected ...", corrected_relations)
+
+            isolated_entities = self.data_handler.find_isolated_entities(global_entities=entities, relations=processed_relationships)
+            if isolated_entities:
+                corrected_relations = self.irelations_extractor.extract_relations_for_isolated_entities(context=sections[i], isolated_entities=isolated_entities)
+                print("second case corrected ...", corrected_relations)
+                processed_relationships.extend(corrected_relations)
+
             global_relationships.extend(processed_relationships)
         #return self.data_handler.handle_data(global_relationships, data_type="relation")
         return global_relationships
@@ -76,9 +93,9 @@ def build_graph(self, sections:List[str], ent_threshold:float = 0.7, rel_thresho
             #relationships = relationship_extraction(context= sections[i], entities=list(map(lambda w:w["name"], processed_entities)))
             print("[INFO] Extracting Relations from the Document", i+1)
             relationships = self.irelations_extractor.extract_relations(context= sections, entities=list(map(lambda w:w["name"], processed_entities)))
-            processed_relationships, global_relationships = self.matcher.process_lists(list1 = relationships, list2=global_relationships, for_entity_or_relation="relation", threshold=rel_threshold)
+            processed_relationships, _ = self.matcher.process_lists(list1 = relationships, list2=global_relationships, for_entity_or_relation="relation", threshold=rel_threshold)
 
-            #global_relationships.extend(processed_relationships)
+            global_relationships.extend(processed_relationships)
 
         return self.data_handler.handle_data(global_entities, data_type="entity"), self.data_handler.handle_data(global_relationships, data_type="relation")