Skip to content

Commit

Permalink
adding extraction mock
Browse files Browse the repository at this point in the history
  • Loading branch information
aembryonic committed Dec 3, 2024
1 parent 3c20d76 commit be6c21f
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 0 deletions.
117 changes: 117 additions & 0 deletions analysis_module/mock_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1804,3 +1804,120 @@
}
]
}

# Canned LLM entry-extraction response used by the mock server.
# Derived from a real model prediction for framework_id 2691,
# structured_text task 007d8b24-92c7-407d-8ec1-eb9c1eb7bcdf
# (s3://nlp-tasks-processed-results-prod-20230602041457655100000005/textextraction/structured/007d8b24-92c7-407d-8ec1-eb9c1eb7bcdf/extracted_text.json),
# project_id 4677 (DRC Lebanon Protection Monitoring Framework).
# The classification keys are opaque widget/tag ids from that framework;
# geolocation entities/coordinates are intentionally random mock values.
MOCK_ENTRY_EXTRACTION_LLM = {
    "client_id": "entry-classification-llm-client-6000",
    "metadata": {
        "total_pages": 10,
        "total_words_count": 5876,
    },
    "blocks": [
        {
            "type": "text",
            "text": (
                "UNHCR’s partners continue to facilitate cloth mask production by "
                "refugee and host community women. So far, 115,000 cloth masks have "
                "been produced, of which close to 72,000 have been distributed, "
                "including 29,000 to the elderly population."
            ),
            "page": 5,
            "textOrder": 7,
            "relevant": True,
            "prediction_status": True,
            "geolocations": [],
            "classification": {
                "aers0cn11qld0nfv": {
                    "ru816f687bjnfos5": {
                        "edgb9tmi3nizdwji": {"0f9k0w9nd7eewqov": []},
                        "epzm3h0tal56xcul": {"0f9k0w9nd7eewqov": []},
                        "0tuqi4jholiq0qzl": {"0f9k0w9nd7eewqov": []},
                    },
                },
                "aofu2gc37ga68oyr": {
                    "1mihrcvim7h9j5nq": {"dxhur2cuu63ukfqx": True},
                },
                "vq1y3nsp3svrfb3w": {
                    "06j0a9eec1dwthbm": {
                        "kphdesz6b5uzzxki": True,
                        "b8qbfc5t89ftroft": True,
                    },
                },
            },
        },
        {
            "type": "text",
            "text": (
                "An additional 2,000 Taka (about US$ 24) will be distributed to each "
                "of the same households next month. The cash payments are an "
                "additional support to many families due to livelihoods having been "
                "affected by lockdowns and general economic pressures due to the "
                "COVID-19 impact on economies globally and in Bangladesh."
            ),
            "page": 5,
            "textOrder": 8,
            "relevant": True,
            "prediction_status": True,
            "geolocations": [
                {
                    "entity": "Somalia",
                    "meta": {
                        "offset_start": 88,
                        "offset_end": 94,
                        "latitude": -10,
                        "longitude": -55,
                    },
                },
                {
                    "entity": "Portugal",
                    "meta": {
                        "offset_start": 183,
                        "offset_end": 191,
                        "latitude": 39.6945,
                        "longitude": -8.13057,
                    },
                },
            ],
            "classification": {
                "aers0cn11qld0nfv": {
                    "ru816f687bjnfos5": {
                        "edgb9tmi3nizdwji": {"0f9k0w9nd7eewqov": []},
                        "0tuqi4jholiq0qzl": {"0f9k0w9nd7eewqov": []},
                    },
                },
                "aofu2gc37ga68oyr": {
                    "1mihrcvim7h9j5nq": {"dxhur2cuu63ukfqx": True},
                },
                "l44malh7rjlhq312": {
                    "357ps7h8r3ati9jz": {"kthspg6najppllz7": True},
                    "6qz2zednb92k36rz": {
                        "53bwesslq9emavfk": True,
                        "rph8vhyl6sc4zlxo": True,
                    },
                },
                "vq1y3nsp3svrfb3w": {
                    "djc1q3rh59eig0jp": {"6pxate9tivbmynh8": True},
                },
            },
        },
    ],
}
49 changes: 49 additions & 0 deletions analysis_module/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .mock_templates import (MOCK_ENTRY_CLASSIFICATION,
MOCK_ENTRY_CLASSIFICATION_LLM,
MOCK_ENTRY_CLASSIFICATION_FORMATTED,
MOCK_ENTRY_EXTRACTION_LLM,
MOCK_GEOLOCATION) # noqa
from .utils import send_callback_url_request

Expand Down Expand Up @@ -499,6 +500,53 @@ def process_entry_extraction_mock(body) -> Any:
except Exception:
logger.error("Could not send data to callback url", exc_info=True)

@shared_task
def process_entry_extraction_llm_mock(body) -> Any:
    """Mock the LLM-based entry-extraction pipeline.

    For every document in ``body["documents"]`` this stores a canned
    extraction payload (based on ``MOCK_ENTRY_EXTRACTION_LLM``) locally and
    POSTs the resulting file path to ``body["callback_url"]``.

    Expected ``body`` shape:
        documents: list of dicts with ``client_id`` and ``text_extraction_id``
        callback_url: URL to notify per document

    Returns nothing; callback failures are logged, never raised.
    """
    documents = body.get("documents") or []
    callback_url = body.get("callback_url")
    # Nothing to do without documents or a place to report results.
    if not documents or not callback_url:
        return

    for document in documents:
        client_id = document["client_id"]
        text_extraction_id = document["text_extraction_id"]
        # Copy the module-level template instead of aliasing it: the original
        # code did `= MOCK_ENTRY_EXTRACTION_LLM` and then mutated it with
        # update(), leaking per-document fields into the shared mock across
        # calls. update() only writes top-level keys, so a shallow copy is
        # sufficient here.
        random_entry_extraction_classification = dict(MOCK_ENTRY_EXTRACTION_LLM)
        random_entry_extraction_classification.update({
            "classification_model_info": {
                "name": "llm_model",
                "version": "1.0.0"
            },
            "client_id": client_id,
            "entry_extraction_id": "73f9ca13-deb2-4f39-8e86-a856490bfc0d",  # random
            "text_extraction_id": text_extraction_id
        })
        filepath = save_data_local_and_get_url(
            "entry_extraction", client_id, random_entry_extraction_classification
        )

        # NOTE: the text_extraction_id is not easy to retrieve when the request
        # is made with a "url": in both cases (url or text_extraction_id) the
        # text was already extracted, and the id cannot easily be recovered
        # from the presigned url. Only a request made with the id lets us look
        # up the right document.
        callback_data = {
            "client_id": client_id,
            "entry_extraction_classification_path": filepath,
            "text_extraction_id": text_extraction_id,
            "status": 1
        }
        try:
            requests.post(
                callback_url,
                json=callback_data,
                timeout=30,
            )
            logger.info("Successfully send data on callback url for entry extraction.")
        except Exception:
            logger.error("Could not send data to callback url", exc_info=True)

def entry_classification_mock(body) -> Any:
process_entry_classification_mock.apply_async(
Expand Down Expand Up @@ -568,6 +616,7 @@ def process_entry_classification_llm_mock(body) -> Any:
"geolocation": geolocation_mock_model,
"text-extraction": text_extraction_mock,
"entry-extraction-classification": entry_extraction_mock,
"entry-extraction-classification-llm": entry_classification_llm_mock,
"entry-classification": entry_classification_mock,
"entry-classification-llm": entry_classification_llm_mock
}
Expand Down
2 changes: 2 additions & 0 deletions analysis_module/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ def get_ecs_id_param_name(request_type: NLPRequest.FeaturesType):
NLPRequest.FeaturesType.TOPICMODEL: "topicmodel_id",
NLPRequest.FeaturesType.GEOLOCATION: "geolocation_id",
NLPRequest.FeaturesType.ENTRY_EXTRACTION: "entryextraction_id",
NLPRequest.FeaturesType.ENTRY_CLASSIFICATION_LLM: "entryextraction_llm_id",
NLPRequest.FeaturesType.TEXT_EXTRACTION: "textextraction_id",
NLPRequest.FeaturesType.SUMMARIZATION_V3: "summarization_id"
}
Expand All @@ -298,6 +299,7 @@ def get_ecs_url(request_type: NLPRequest.FeaturesType):
NLPRequest.FeaturesType.TOPICMODEL: urljoin(TOPICMODEL_ECS_ENDPOINT, "/get_excerpt_clusters"),
NLPRequest.FeaturesType.GEOLOCATION: urljoin(GEOLOCATION_ECS_ENDPOINT, "/get_geolocations"),
NLPRequest.FeaturesType.ENTRY_EXTRACTION: urljoin(ENTRYEXTRACTION_ECS_ENDPOINT, "/extract_entries"),
NLPRequest.FeaturesType.ENTRY_CLASSIFICATION_LLM: urljoin(ENTRYEXTRACTION_ECS_ENDPOINT, "/extract_entries_llm"),
NLPRequest.FeaturesType.TEXT_EXTRACTION: urljoin(TEXT_EXTRACTION_ECS_ENDPOINT, "/extract_document"),
NLPRequest.FeaturesType.SUMMARIZATION_V3: urljoin(SUMMARIZATION_V3_ECS_ENDPOINT, "/generate_report")
}
Expand Down

0 comments on commit be6c21f

Please sign in to comment.