Skip to content

Commit

Permalink
adding extraction mock
Browse files Browse the repository at this point in the history
  • Loading branch information
aembryonic committed Dec 3, 2024
1 parent 3c20d76 commit be6c21f
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 0 deletions.
117 changes: 117 additions & 0 deletions analysis_module/mock_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1804,3 +1804,120 @@
}
]
}

# Canned LLM entry-extraction response used by the mock server.
# Derived from a real model prediction for framework_id 2691,
# structured_text task 007d8b24-92c7-407d-8ec1-eb9c1eb7bcdf
# (s3://nlp-tasks-processed-results-prod-20230602041457655100000005/textextraction/structured/007d8b24-92c7-407d-8ec1-eb9c1eb7bcdf/extracted_text.json),
# project_id 4677 (DRC Lebanon Protection Monitoring Framework).
# The classification keys are opaque widget/tag ids from that framework;
# geolocation entities/coordinates are intentionally random mock values.
MOCK_ENTRY_EXTRACTION_LLM = {
    "client_id": "entry-classification-llm-client-6000",
    "metadata": {
        "total_pages": 10,
        "total_words_count": 5876,
    },
    "blocks": [
        {
            "type": "text",
            "text": (
                "UNHCR’s partners continue to facilitate cloth mask production by "
                "refugee and host community women. So far, 115,000 cloth masks have "
                "been produced, of which close to 72,000 have been distributed, "
                "including 29,000 to the elderly population."
            ),
            "page": 5,
            "textOrder": 7,
            "relevant": True,
            "prediction_status": True,
            "geolocations": [],
            "classification": {
                "aers0cn11qld0nfv": {
                    "ru816f687bjnfos5": {
                        "edgb9tmi3nizdwji": {"0f9k0w9nd7eewqov": []},
                        "epzm3h0tal56xcul": {"0f9k0w9nd7eewqov": []},
                        "0tuqi4jholiq0qzl": {"0f9k0w9nd7eewqov": []},
                    },
                },
                "aofu2gc37ga68oyr": {
                    "1mihrcvim7h9j5nq": {"dxhur2cuu63ukfqx": True},
                },
                "vq1y3nsp3svrfb3w": {
                    "06j0a9eec1dwthbm": {
                        "kphdesz6b5uzzxki": True,
                        "b8qbfc5t89ftroft": True,
                    },
                },
            },
        },
        {
            "type": "text",
            "text": (
                "An additional 2,000 Taka (about US$ 24) will be distributed to each "
                "of the same households next month. The cash payments are an "
                "additional support to many families due to livelihoods having been "
                "affected by lockdowns and general economic pressures due to the "
                "COVID-19 impact on economies globally and in Bangladesh."
            ),
            "page": 5,
            "textOrder": 8,
            "relevant": True,
            "prediction_status": True,
            "geolocations": [
                {
                    "entity": "Somalia",
                    "meta": {
                        "offset_start": 88,
                        "offset_end": 94,
                        "latitude": -10,
                        "longitude": -55,
                    },
                },
                {
                    "entity": "Portugal",
                    "meta": {
                        "offset_start": 183,
                        "offset_end": 191,
                        "latitude": 39.6945,
                        "longitude": -8.13057,
                    },
                },
            ],
            "classification": {
                "aers0cn11qld0nfv": {
                    "ru816f687bjnfos5": {
                        "edgb9tmi3nizdwji": {"0f9k0w9nd7eewqov": []},
                        "0tuqi4jholiq0qzl": {"0f9k0w9nd7eewqov": []},
                    },
                },
                "aofu2gc37ga68oyr": {
                    "1mihrcvim7h9j5nq": {"dxhur2cuu63ukfqx": True},
                },
                "l44malh7rjlhq312": {
                    "357ps7h8r3ati9jz": {"kthspg6najppllz7": True},
                    "6qz2zednb92k36rz": {
                        "53bwesslq9emavfk": True,
                        "rph8vhyl6sc4zlxo": True,
                    },
                },
                "vq1y3nsp3svrfb3w": {
                    "djc1q3rh59eig0jp": {"6pxate9tivbmynh8": True},
                },
            },
        },
    ],
}
49 changes: 49 additions & 0 deletions analysis_module/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .mock_templates import (MOCK_ENTRY_CLASSIFICATION,
MOCK_ENTRY_CLASSIFICATION_LLM,
MOCK_ENTRY_CLASSIFICATION_FORMATTED,
MOCK_ENTRY_EXTRACTION_LLM,
MOCK_GEOLOCATION) # noqa
from .utils import send_callback_url_request

Expand Down Expand Up @@ -499,6 +500,53 @@ def process_entry_extraction_mock(body) -> Any:
except Exception:
logger.error("Could not send data to callback url", exc_info=True)

@shared_task
def process_entry_extraction_llm_mock(body) -> Any:
    """Mock the LLM-based entry-extraction pipeline.

    For every document in ``body["documents"]`` this stores a canned
    extraction payload (based on ``MOCK_ENTRY_EXTRACTION_LLM``) locally and
    POSTs the resulting file path to ``body["callback_url"]``.

    Expected ``body`` shape:
        documents: list of dicts with ``client_id`` and ``text_extraction_id``
        callback_url: URL to notify per document

    Returns nothing; callback failures are logged, never raised.
    """
    documents = body.get("documents") or []
    callback_url = body.get("callback_url")
    # Nothing to do without documents or a place to report results.
    if not documents or not callback_url:
        return

    for document in documents:
        client_id = document["client_id"]
        text_extraction_id = document["text_extraction_id"]
        # Copy the module-level template instead of aliasing it: the original
        # code did `= MOCK_ENTRY_EXTRACTION_LLM` and then mutated it with
        # update(), leaking per-document fields into the shared mock across
        # calls. update() only writes top-level keys, so a shallow copy is
        # sufficient here.
        random_entry_extraction_classification = dict(MOCK_ENTRY_EXTRACTION_LLM)
        random_entry_extraction_classification.update({
            "classification_model_info": {
                "name": "llm_model",
                "version": "1.0.0"
            },
            "client_id": client_id,
            "entry_extraction_id": "73f9ca13-deb2-4f39-8e86-a856490bfc0d",  # random
            "text_extraction_id": text_extraction_id
        })
        filepath = save_data_local_and_get_url(
            "entry_extraction", client_id, random_entry_extraction_classification
        )

        # NOTE: the text_extraction_id is not easy to retrieve when the request
        # is made with a "url": in both cases (url or text_extraction_id) the
        # text was already extracted, and the id cannot easily be recovered
        # from the presigned url. Only a request made with the id lets us look
        # up the right document.
        callback_data = {
            "client_id": client_id,
            "entry_extraction_classification_path": filepath,
            "text_extraction_id": text_extraction_id,
            "status": 1
        }
        try:
            requests.post(
                callback_url,
                json=callback_data,
                timeout=30,
            )
            logger.info("Successfully send data on callback url for entry extraction.")
        except Exception:
            logger.error("Could not send data to callback url", exc_info=True)

def entry_classification_mock(body) -> Any:
process_entry_classification_mock.apply_async(
Expand Down Expand Up @@ -568,6 +616,7 @@ def process_entry_classification_llm_mock(body) -> Any:
"geolocation": geolocation_mock_model,
"text-extraction": text_extraction_mock,
"entry-extraction-classification": entry_extraction_mock,
"entry-extraction-classification-llm": entry_classification_llm_mock,
"entry-classification": entry_classification_mock,
"entry-classification-llm": entry_classification_llm_mock
}
Expand Down
2 changes: 2 additions & 0 deletions analysis_module/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ def get_ecs_id_param_name(request_type: NLPRequest.FeaturesType):
NLPRequest.FeaturesType.TOPICMODEL: "topicmodel_id",
NLPRequest.FeaturesType.GEOLOCATION: "geolocation_id",
NLPRequest.FeaturesType.ENTRY_EXTRACTION: "entryextraction_id",
NLPRequest.FeaturesType.ENTRY_CLASSIFICATION_LLM: "entryextraction_llm_id",
NLPRequest.FeaturesType.TEXT_EXTRACTION: "textextraction_id",
NLPRequest.FeaturesType.SUMMARIZATION_V3: "summarization_id"
}
Expand All @@ -298,6 +299,7 @@ def get_ecs_url(request_type: NLPRequest.FeaturesType):
NLPRequest.FeaturesType.TOPICMODEL: urljoin(TOPICMODEL_ECS_ENDPOINT, "/get_excerpt_clusters"),
NLPRequest.FeaturesType.GEOLOCATION: urljoin(GEOLOCATION_ECS_ENDPOINT, "/get_geolocations"),
NLPRequest.FeaturesType.ENTRY_EXTRACTION: urljoin(ENTRYEXTRACTION_ECS_ENDPOINT, "/extract_entries"),
NLPRequest.FeaturesType.ENTRY_CLASSIFICATION_LLM: urljoin(ENTRYEXTRACTION_ECS_ENDPOINT, "/extract_entries_llm"),
NLPRequest.FeaturesType.TEXT_EXTRACTION: urljoin(TEXT_EXTRACTION_ECS_ENDPOINT, "/extract_document"),
NLPRequest.FeaturesType.SUMMARIZATION_V3: urljoin(SUMMARIZATION_V3_ECS_ENDPOINT, "/generate_report")
}
Expand Down

0 comments on commit be6c21f

Please sign in to comment.