Switch to new DEEPL text extraction endpoint
thenav56 committed Sep 6, 2023
1 parent 4cc1327 commit d86ae27
Showing 6 changed files with 40 additions and 34 deletions.
13 changes: 9 additions & 4 deletions apps/deepl_integration/handlers.py
@@ -39,7 +39,7 @@
LeadPreview,
LeadPreviewImage,
)
from lead.typings import NlpExtractorUrl
from lead.typings import NlpExtractorDocument
from entry.models import Entry
from analysis.models import (
TopicModel,
@@ -85,6 +85,11 @@ def _make_hash_value(self, instance, timestamp):
return str(type(instance)) + str(instance.pk) + str(timestamp)


class NlpRequestType:
SYSTEM = 0 # Note: SYSTEM refers to requests from CONNECTORS.
USER = 1


class BaseHandler:
REQUEST_HEADERS = {
'Content-Type': 'application/json',
@@ -352,14 +357,14 @@ class LeadExtractionHandler(BaseHandler):
@classmethod
def send_trigger_request_to_extractor(
cls,
urls: List[NlpExtractorUrl],
documents: List[NlpExtractorDocument],
callback_url: str,
high_priority=False,
):
payload = {
'urls': urls,
'documents': documents,
'callback_url': callback_url,
'type': 'user' if high_priority else 'system',
'request_type': NlpRequestType.USER if high_priority else NlpRequestType.SYSTEM,
}
try:
response = requests.post(
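For reference, a minimal sketch of the trigger request the updated handler would send, with placeholder values (only the field names come from this diff; the document URL, client_id format, and callback URL below are hypothetical):

documents = [
    # NlpExtractorDocument shape from apps/lead/typings.py; values are placeholders
    {'url': 'https://example.com/report.pdf', 'client_id': 'lead-101'},
]
payload = {
    'documents': documents,  # previously keyed as 'urls'
    'callback_url': 'https://deep.example.org/callback/',  # hypothetical callback URL
    'request_type': NlpRequestType.USER,  # 1; NlpRequestType.SYSTEM (0) for connector-triggered requests
}
# POSTed to DeeplServiceEndpoint.DOCS_EXTRACTOR_ENDPOINT, which this commit points at
# f'{DEEPL_SERVER_DOMAIN}/api/v1/text-extraction/' (see deep/deepl.py below).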
45 changes: 22 additions & 23 deletions apps/deepl_integration/serializers.py
@@ -47,13 +47,23 @@ def validate(self, data):
return data


class DeeplServerBaseCallbackSerializer(BaseCallbackSerializer):
class Status(models.IntegerChoices):
# NOTE: Defined by NLP
# INITIATED = 1, 'Initiated' # Not needed or used by deep
SUCCESS = 2, 'Success'
FAILED = 3, 'Failed'
INPUT_URL_PROCESS_FAILED = 4, 'Input url process failed'

status = serializers.ChoiceField(choices=Status.choices)


# -- Lead
class LeadExtractCallbackSerializer(BaseCallbackSerializer):
class LeadExtractCallbackSerializer(DeeplServerBaseCallbackSerializer):
"""
Serialize deepl extractor
"""
url = serializers.CharField()
extraction_status = serializers.IntegerField() # 0 = Failed, 1 = Success
# Data fields
images_path = serializers.ListField(
child=serializers.CharField(allow_blank=True),
@@ -68,21 +78,21 @@ class LeadExtractCallbackSerializer(BaseCallbackSerializer):
def validate(self, data):
data = super().validate(data)
# Additional validation
if data['extraction_status'] == 1 and data.get('text_path') in [None, '']:
if data['status'] == self.Status.SUCCESS and data.get('text_path') in [None, '']:
raise serializers.ValidationError({
'text_path': 'text_path is required when extraction_status is success/1'
'text_path': 'text_path is required when extraction status is success'
})
if data['extraction_status'] == 1:
if data['status'] == self.Status.SUCCESS:
errors = {}
for key in ['text_path', 'total_words_count', 'total_pages']:
if key not in data:
errors[key] = f'<{key}> is missing. Required when the extraction is 1 (Success)'
errors[key] = f'<{key}> is missing. Required when the extraction status is Success'
if errors:
raise serializers.ValidationError(errors)
return data

def create(self, data):
success = data['extraction_status'] == 1
success = data['status'] == self.Status.SUCCESS
lead = data['object'] # Added from validate
if success:
lead = self.nlp_handler.save_data(
@@ -100,12 +110,11 @@ def create(self, data):


# -- Unified Connector
class UnifiedConnectorLeadExtractCallbackSerializer(BaseCallbackSerializer):
class UnifiedConnectorLeadExtractCallbackSerializer(DeeplServerBaseCallbackSerializer):
"""
Serialize deepl extractor
"""
url = serializers.CharField()
extraction_status = serializers.IntegerField() # 0 = Failed, 1 = Success
# Data fields
images_path = serializers.ListField(
child=serializers.CharField(allow_blank=True),
@@ -123,17 +132,17 @@ def validate(self, data):
raise serializers.ValidationError({
'url': 'Different url found provided vs original connector lead',
})
if data['extraction_status'] == 1:
if data['status'] == self.Status.SUCCESS:
errors = {}
for key in ['text_path', 'total_words_count', 'total_pages']:
if key not in data:
errors[key] = f'<{key}> is missing. Required when the extraction is 1 (Success)'
errors[key] = f'<{key}> is missing. Required when the extraction is Success'
if errors:
raise serializers.ValidationError(errors)
return data

def create(self, data):
success = data['extraction_status'] == 1
success = data['status'] == self.Status.SUCCESS
connector_lead = data['object'] # Added from validate
if success:
return self.nlp_handler.save_data(
@@ -197,17 +206,6 @@ def create(self, validated_data):
)


# -- Analysis
class DeeplServerBaseCallbackSerializer(BaseCallbackSerializer):
class Status(models.IntegerChoices):
# NOTE: Defined by NLP
SUCCESS = 2, 'Success'
FAILED = 3, 'Failed'
INPUT_URL_PROCESS_FAILED = 4, 'Input url process failed'

status = serializers.ChoiceField(choices=Status.choices)


class EntriesCollectionBaseCallbackSerializer(DeeplServerBaseCallbackSerializer):
model: Type[DeeplTrackBaseModel]
presigned_s3_url = serializers.URLField()
@@ -222,6 +220,7 @@ def create(self, validated_data):
return obj


# -- Analysis
class AnalysisTopicModelCallbackSerializer(EntriesCollectionBaseCallbackSerializer):
model = TopicModel
nlp_handler = AnalysisTopicModelHandler
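For reference, a minimal sketch of the extraction callback bodies the reworked serializers accept, limited to fields visible in this diff (the base callback serializer may require additional fields, such as a client id, that these hunks do not show; all values are illustrative):

# Success: status 2 = Status.SUCCESS; text_path, total_words_count and total_pages
# are required, and images_path entries may be blank strings.
success_callback = {
    'url': 'http://random.com/pdf_file.pdf',
    'status': 2,
    'text_path': 'https://example.com/extracted-text.txt',  # placeholder
    'images_path': [],
    'total_words_count': 300,
    'total_pages': 4,
}

# Failure: status 3 = Status.FAILED (or 4 = Status.INPUT_URL_PROCESS_FAILED);
# the extraction fields are not validated in this case.
failure_callback = {
    'url': 'http://random.com/pdf_file.pdf',
    'status': 3,
}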
5 changes: 3 additions & 2 deletions apps/lead/tests/test_apis.py
@@ -26,6 +26,7 @@
from lead.filter_set import LeadFilterSet
from lead.serializers import SimpleLeadGroupSerializer
from deepl_integration.handlers import LeadExtractionHandler
from deepl_integration.serializers import DeeplServerBaseCallbackSerializer
from entry.models import (
Entry,
ProjectEntryLabel,
@@ -1800,7 +1801,7 @@ def test_extractor_callback_url(self, get_file_mock, get_text_mock, index_lead_f
'url': 'http://random.com/pdf_file.pdf',
'total_words_count': 300,
'total_pages': 4,
'extraction_status': 0,
'status': DeeplServerBaseCallbackSerializer.Status.FAILED.value,
}

# After callback [Failure]
@@ -1811,7 +1812,7 @@
self.assertEqual(LeadPreview.objects.filter(lead=self.lead).count(), 0)
self.assertEqual(LeadPreviewImage.objects.filter(lead=self.lead).count(), 0)

data['extraction_status'] = 1
data['status'] = DeeplServerBaseCallbackSerializer.Status.SUCCESS.value
# After callback [Success]
with self.captureOnCommitCallbacks(execute=True):
response = self.client.post(url, data)
2 changes: 1 addition & 1 deletion apps/lead/typings.py
@@ -1,6 +1,6 @@
from typing import TypedDict


class NlpExtractorUrl(TypedDict):
class NlpExtractorDocument(TypedDict):
url: str
client_id: str
5 changes: 3 additions & 2 deletions apps/unified_connector/tests/test_mutation.py
@@ -15,6 +15,7 @@
ConnectorLeadPreviewImage,
)
from deepl_integration.handlers import UnifiedConnectorLeadHandler
from deepl_integration.serializers import DeeplServerBaseCallbackSerializer
from unified_connector.factories import (
ConnectorLeadFactory,
ConnectorSourceFactory,
@@ -496,7 +497,7 @@ def _check_connector_lead_status(connector_lead, status):
text_path='https://example.com/url-where-data-is-fetched-from-mock-response',
total_words_count=100,
total_pages=10,
extraction_status=0, # Failed
status=DeeplServerBaseCallbackSerializer.Status.FAILED.value,
)

response = self.client.post(url, data)
@@ -522,7 +523,7 @@ def _check_connector_lead_status(connector_lead, status):
text_path='https://example.com/url-where-data-is-fetched-from-mock-response',
total_words_count=100,
total_pages=10,
extraction_status=1, # Failed
status=DeeplServerBaseCallbackSerializer.Status.SUCCESS.value,
)
response = self.client.post(url, data)
self.assert_400(response)
4 changes: 2 additions & 2 deletions deep/deepl.py
@@ -8,12 +8,12 @@
class DeeplServiceEndpoint():
# DEEPL Service Endpoints (Existing/Legacy)
# NOTE: This will be moved to server endpoints in near future
DOCS_EXTRACTOR_ENDPOINT = f'{DEEPL_SERVICE_DOMAIN}/extract_docs'
ASSISTED_TAGGING_TAGS_ENDPOINT = f'{DEEPL_SERVICE_DOMAIN}/vf_tags'
ASSISTED_TAGGING_MODELS_ENDPOINT = f'{DEEPL_SERVICE_DOMAIN}/model_info'
ASSISTED_TAGGING_ENTRY_PREDICT_ENDPOINT = f'{DEEPL_SERVICE_DOMAIN}/entry_predict'

# DEEPL Server Endpoints (New/Legacy)
# DEEPL Server Endpoints (New)
DOCS_EXTRACTOR_ENDPOINT = f'{DEEPL_SERVER_DOMAIN}/api/v1/text-extraction/'
ANALYSIS_TOPIC_MODEL = f'{DEEPL_SERVER_DOMAIN}/api/v1/topicmodel/'
ANALYSIS_AUTOMATIC_SUMMARY = f'{DEEPL_SERVER_DOMAIN}/api/v1/summarization/'
ANALYSIS_AUTOMATIC_NGRAM = f'{DEEPL_SERVER_DOMAIN}/api/v1/ngrams/'