Merge pull request #1505 from the-deep/feature/ocr-integration
Add OCR integration feature for NLP-based image and attachment extraction
subinasr authored Jul 9, 2024
2 parents 279227d + 0c6fb39 commit 6d9f690
Showing 45 changed files with 1,327 additions and 224 deletions.
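At a high level, this commit reshapes the NLP extractor callback contract: extracted images are now reported per page, and extracted tables arrive as separate image/content links, both stored as LeadPreviewAttachment / ConnectorLeadPreviewAttachment rows. Below is a minimal sketch of the payload the new serializers in this diff accept; every value is illustrative and not taken from a real extractor response.

# Hypothetical callback body for the new images_path / tables_path shape.
# Only the field names follow this diff; URLs and counts are made up.
callback_payload = {
    'text_path': 'https://extractor.example.com/leads/42/text.txt',
    'images_path': [
        {
            'page_number': 1,
            'images': [
                'https://extractor.example.com/leads/42/page1-image1.png',
            ],
        },
    ],
    'tables_path': [
        {
            'page_number': 3,
            'order': 0,
            'image_link': 'https://extractor.example.com/leads/42/table1.png',
            'content_link': 'https://extractor.example.com/leads/42/table1.xlsx',
        },
    ],
    'total_words_count': 1200,
    'total_pages': 10,
    'text_extraction_id': 'b7c2e7d0-ocr-demo',
}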
2 changes: 1 addition & 1 deletion apps/ary/export/affected_groups_info.py
@@ -9,7 +9,7 @@ def get_affected_groups_info(assessment):
affected_group_type_dict = {choice.value: choice.label for choice in AssessmentRegistry.AffectedGroupType}
affected_groups = [affected_group_type_dict.get(group) for group in assessment.affected_groups if group]
max_level = max([len(v.split('/')) for k, v in AssessmentRegistry.AffectedGroupType.choices])
levels = [f'Level {i+1}' for i in range(max_level)]
levels = [f'Level {i + 1}' for i in range(max_level)]
affected_grp_list = []
for group in affected_groups:
group = group.split("/")
4 changes: 2 additions & 2 deletions apps/bulk_data_migration/entry_images_v2/migrate.py
@@ -3,7 +3,7 @@

from utils.common import parse_number

from lead.models import LeadPreviewImage
from lead.models import LeadPreviewAttachment
from entry.models import Entry
from gallery.models import File

@@ -34,7 +34,7 @@ def _get_file_from_s3_url(entry, string):
return
# NOTE: For lead-preview generate gallery files
if file_path.startswith('lead-preview/'):
lead_preview = LeadPreviewImage.objects.filter(file=file_path).first()
lead_preview = LeadPreviewAttachment.objects.filter(file=file_path).first()
if lead_preview and lead_preview.file and lead_preview.file.storage.exists(lead_preview.file.name):
return lead_preview.clone_as_deep_file(entry.created_by)
return
8 changes: 4 additions & 4 deletions apps/commons/receivers.py
@@ -5,16 +5,16 @@

from lead.models import (
LeadPreview,
LeadPreviewImage,
LeadPreviewAttachment,
)
from unified_connector.models import ConnectorLeadPreviewImage
from unified_connector.models import ConnectorLeadPreviewAttachment


# Lead
@receiver(models.signals.post_delete, sender=LeadPreview)
@receiver(models.signals.post_delete, sender=LeadPreviewImage)
@receiver(models.signals.post_delete, sender=LeadPreviewAttachment)
# Unified Connector
@receiver(models.signals.post_delete, sender=ConnectorLeadPreviewImage)
@receiver(models.signals.post_delete, sender=ConnectorLeadPreviewAttachment)
def cleanup_file_on_instance_delete(sender, instance, **kwargs):
files = []
for field in instance._meta.get_fields():
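The receiver body is truncated above. A plausible sketch of what a generic file-cleanup receiver like this typically does, assuming the standard Django storage API; the actual implementation in commons/receivers.py may differ:

from django.db import models

def cleanup_file_on_instance_delete(sender, instance, **kwargs):
    # Collect every FileField value on the deleted instance...
    files = []
    for field in instance._meta.get_fields():
        if isinstance(field, models.FileField):
            file = getattr(instance, field.name, None)
            if file and file.name:
                files.append(file)
    # ...and remove the underlying files from storage.
    for file in files:
        file.storage.delete(file.name)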
145 changes: 115 additions & 30 deletions apps/deepl_integration/handlers.py
@@ -3,7 +3,7 @@
import copy
import requests
import logging
from typing import List, Type
from typing import Dict, List, Type
from functools import reduce
from urllib.parse import urlparse

@@ -30,14 +30,14 @@
)
from unified_connector.models import (
ConnectorLead,
ConnectorLeadPreviewImage,
ConnectorLeadPreviewAttachment,
ConnectorSource,
UnifiedConnector,
)
from lead.models import (
Lead,
LeadPreview,
LeadPreviewImage,
LeadPreviewAttachment,
)
from lead.typings import NlpExtractorDocument
from entry.models import Entry
@@ -636,13 +636,14 @@ def trigger_lead_extract(cls, lead, task_instance=None):
def save_data(
lead: Lead,
text_source_uri: str,
images_uri: List[str],
images_uri: List[dict],
table_uri: List[dict],
word_count: int,
page_count: int,
text_extraction_id: str,
):
LeadPreview.objects.filter(lead=lead).delete()
LeadPreviewImage.objects.filter(lead=lead).delete()
LeadPreviewAttachment.objects.filter(lead=lead).delete()
# and create new one
LeadPreview.objects.create(
lead=lead,
@@ -651,30 +652,69 @@ def save_data(
page_count=page_count,
text_extraction_id=text_extraction_id,
)
# Save extracted images as LeadPreviewImage instances
# Save extracted images as LeadPreviewAttachment instances
# TODO: The logic is same for unified_connector leads as well. Maybe have a single func?
image_base_path = f'{lead.pk}'

attachment_base_path = f'{lead.pk}'
for image_uri in images_uri:
lead_image = LeadPreviewImage(lead=lead)
image_obj = RequestHelper(url=image_uri, ignore_error=True).get_file()
if image_obj:
lead_image.file.save(
os.path.join(image_base_path, os.path.basename(urlparse(image_uri).path)),
image_obj
for image in image_uri['images']:
image_obj = RequestHelper(url=image, ignore_error=True).get_file()
if image_obj:
lead_attachment = LeadPreviewAttachment(lead=lead)
lead_attachment.file.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(image).path
)
),
image_obj,
)
lead_attachment.page_number = image_uri['page_number']
lead_attachment.type = LeadPreviewAttachment.AttachmentFileType.IMAGE
lead_attachment.file_preview = lead_attachment.file
lead_attachment.save()

for table in table_uri:
table_img = RequestHelper(url=table['image_link'], ignore_error=True).get_file()
table_attachment = RequestHelper(url=table['content_link'], ignore_error=True).get_file()
if table_img:
lead_attachment = LeadPreviewAttachment(lead=lead)
lead_attachment.file_preview.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(table['image_link']).path
)
),
table_img,
)
lead_image.save()
lead_attachment.page_number = table['page_number']
lead_attachment.type = LeadPreviewAttachment.AttachmentFileType.XLSX
lead_attachment.file.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(table['content_link']).path
)
),
table_attachment,
)
lead_attachment.save()

lead.update_extraction_status(Lead.ExtractionStatus.SUCCESS)
return lead

@staticmethod
@transaction.atomic
def save_lead_data_using_connector_lead(
lead: Lead,
connector_lead: ConnectorLead,
):
if connector_lead.extraction_status != ConnectorLead.ExtractionStatus.SUCCESS:
return False
LeadPreview.objects.filter(lead=lead).delete()
LeadPreviewImage.objects.filter(lead=lead).delete()
LeadPreviewAttachment.objects.filter(lead=lead).delete()
# and create new one
LeadPreview.objects.create(
lead=lead,
@@ -683,14 +723,20 @@ def save_lead_data_using_connector_lead(
page_count=connector_lead.page_count,
text_extraction_id=connector_lead.text_extraction_id,
)
# Save extracted images as LeadPreviewImage instances
# Save extracted images as LeadPreviewAttachment instances
# TODO: The logic is same for unified_connector leads as well. Maybe have a single func?
for connector_lead_preview_image in connector_lead.preview_images.all():
lead_image = LeadPreviewImage(lead=lead)
lead_image.file.save(
connector_lead_preview_image.image.name,
connector_lead_preview_image.image,
for connector_lead_attachment in connector_lead.preview_images.all():
lead_attachment = LeadPreviewAttachment(lead=lead)
lead_attachment.order = connector_lead_attachment.order
lead_attachment.file.save(
connector_lead_attachment.file.name,
connector_lead_attachment.file,
)
lead_attachment.file_preview.save(
connector_lead_attachment.file_preview.name,
connector_lead_attachment.file_preview
)
lead_attachment.save()
lead.update_extraction_status(Lead.ExtractionStatus.SUCCESS)
return True

@@ -703,7 +749,8 @@ class UnifiedConnectorLeadHandler(BaseHandler):
def save_data(
connector_lead: ConnectorLead,
text_source_uri: str,
images_uri: List[str],
images_uri: List[Dict],
table_uri: List[Dict],
word_count: int,
page_count: int,
text_extraction_id: str,
@@ -712,16 +759,54 @@ def save_data(
connector_lead.word_count = word_count
connector_lead.page_count = page_count
connector_lead.text_extraction_id = text_extraction_id
image_base_path = f'{connector_lead.pk}'

attachment_base_path = f'{connector_lead.pk}'
for image_uri in images_uri:
lead_image = ConnectorLeadPreviewImage(connector_lead=connector_lead)
image_obj = RequestHelper(url=image_uri, ignore_error=True).get_file()
if image_obj:
lead_image.image.save(
os.path.join(image_base_path, os.path.basename(urlparse(image_uri).path)),
image_obj,
for image in image_uri['images']:
image_obj = RequestHelper(url=image, ignore_error=True).get_file()
if image_obj:
connector_lead_attachment = ConnectorLeadPreviewAttachment(connector_lead=connector_lead)
connector_lead_attachment.file.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(image).path
)
),
image_obj,
)
connector_lead_attachment.page_number = image_uri['page_number']
connector_lead_attachment.type = ConnectorLeadPreviewAttachment.ConnectorAttachmentFileType.IMAGE
connector_lead_attachment.file_preview = connector_lead_attachment.file
connector_lead_attachment.save()

for table in table_uri:
table_img = RequestHelper(url=table['image_link'], ignore_error=True).get_file()
table_attachment = RequestHelper(url=table['content_link'], ignore_error=True).get_file()
if table_img:
connector_lead_attachment = ConnectorLeadPreviewAttachment(connector_lead=connector_lead)
connector_lead_attachment.file_preview.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(table['image_link']).path
)
),
table_img,
)
connector_lead_attachment.page_number = table['page_number']
connector_lead_attachment.type = ConnectorLeadPreviewAttachment.ConnectorAttachmentFileType.XLSX
connector_lead_attachment.file.save(
os.path.join(
attachment_base_path,
os.path.basename(
urlparse(table['content_link']).path
)
),
table_attachment,
)
lead_image.save()
connector_lead_attachment.save()

connector_lead.update_extraction_status(ConnectorLead.ExtractionStatus.SUCCESS, commit=False)
connector_lead.save()
return connector_lead
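Both save_data implementations above carry a TODO noting that the attachment-saving logic is duplicated between leads and unified-connector leads. Below is a hedged sketch of a shared helper that could absorb the two loops; the helper name, parameters, and wiring are not part of this commit, and RequestHelper is assumed to be the same helper already imported in handlers.py:

import os
from urllib.parse import urlparse


def save_preview_attachments(attachment_model, owner_kwargs, file_type_enum, images_uri, table_uri, base_path):
    # Hypothetical shared helper, not part of this commit.
    # attachment_model: LeadPreviewAttachment or ConnectorLeadPreviewAttachment.
    # owner_kwargs: {'lead': lead} or {'connector_lead': connector_lead}.
    # file_type_enum: the model's type enum, passed explicitly because the two
    # models name it differently (AttachmentFileType vs ConnectorAttachmentFileType).
    for page in images_uri:
        for image in page['images']:
            image_obj = RequestHelper(url=image, ignore_error=True).get_file()
            if image_obj:
                attachment = attachment_model(**owner_kwargs)
                attachment.file.save(
                    os.path.join(base_path, os.path.basename(urlparse(image).path)),
                    image_obj,
                )
                attachment.page_number = page['page_number']
                attachment.type = file_type_enum.IMAGE
                attachment.file_preview = attachment.file
                attachment.save()
    for table in table_uri:
        table_img = RequestHelper(url=table['image_link'], ignore_error=True).get_file()
        table_content = RequestHelper(url=table['content_link'], ignore_error=True).get_file()
        if table_img:
            attachment = attachment_model(**owner_kwargs)
            attachment.file_preview.save(
                os.path.join(base_path, os.path.basename(urlparse(table['image_link']).path)),
                table_img,
            )
            attachment.page_number = table['page_number']
            attachment.type = file_type_enum.XLSX
            attachment.file.save(
                os.path.join(base_path, os.path.basename(urlparse(table['content_link']).path)),
                table_content,
            )
            attachment.save()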
41 changes: 33 additions & 8 deletions apps/deepl_integration/serializers.py
@@ -1,4 +1,4 @@
from typing import Type
from typing import Type, List, Dict
import logging
from rest_framework import serializers

@@ -63,16 +63,35 @@ class Status(models.IntegerChoices):
status = serializers.ChoiceField(choices=Status.choices)


class ImagePathSerializer(serializers.Serializer):
page_number = serializers.IntegerField(required=True)
images = serializers.ListField(
child=serializers.CharField(allow_blank=True),
default=[],
)


class TablePathSerializer(serializers.Serializer):
page_number = serializers.IntegerField(required=True)
order = serializers.IntegerField(required=True)
image_link = serializers.URLField(required=True)
content_link = serializers.URLField(required=True)


# -- Lead
class LeadExtractCallbackSerializer(DeeplServerBaseCallbackSerializer):
"""
Serialize deepl extractor
"""
url = serializers.CharField(required=False)
# Data fields
images_path = serializers.ListField(
child=serializers.CharField(allow_blank=True),
required=False, default=[],
images_path = serializers.ListSerializer(
child=ImagePathSerializer(required=True),
required=False
)
tables_path = serializers.ListSerializer(
child=TablePathSerializer(required=True),
required=False
)
text_path = serializers.CharField(required=False, allow_null=True)
total_words_count = serializers.IntegerField(required=False, default=0, allow_null=True)
@@ -98,14 +117,15 @@ def validate(self, data):
raise serializers.ValidationError(errors)
return data

def create(self, data):
def create(self, data: List[Dict]):
success = data['status'] == self.Status.SUCCESS
lead = data['object'] # Added from validate
if success:
lead = self.nlp_handler.save_data(
lead,
data['text_path'],
data.get('images_path', [])[:10], # TODO: Support for more images, too much image will error.
data.get('tables_path', [])[:10],
data.get('total_words_count'),
data.get('total_pages'),
data.get('text_extraction_id'),
@@ -123,9 +143,13 @@ class UnifiedConnectorLeadExtractCallbackSerializer(DeeplServerBaseCallbackSeria
Serialize deepl extractor
"""
# Data fields
images_path = serializers.ListField(
child=serializers.CharField(allow_blank=True),
required=False, default=[],
images_path = serializers.ListSerializer(
child=ImagePathSerializer(required=True),
required=False
)
tables_path = serializers.ListSerializer(
child=TablePathSerializer(required=True),
required=False
)
text_path = serializers.CharField(required=False, allow_null=True)
total_words_count = serializers.IntegerField(required=False, default=0, allow_null=True)
@@ -155,6 +179,7 @@ def create(self, data):
connector_lead,
data['text_path'],
data.get('images_path', [])[:10], # TODO: Support for more images, to much image will error.
data['tables_path'],
data['total_words_count'],
data['total_pages'],
data['text_extraction_id'],
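For reference, a small usage sketch of the two nested serializers introduced above; the payload values are invented, and the import path assumes the apps directory is on the Python path, as the other imports in this diff suggest:

from deepl_integration.serializers import ImagePathSerializer, TablePathSerializer

# One entry of images_path: image URLs grouped under their page number.
image_entry = ImagePathSerializer(data={
    'page_number': 2,
    'images': ['https://extractor.example.com/leads/42/page2-figure.png'],
})
assert image_entry.is_valid(), image_entry.errors

# One entry of tables_path: a rendered preview image plus the extracted XLSX content.
table_entry = TablePathSerializer(data={
    'page_number': 5,
    'order': 1,
    'image_link': 'https://extractor.example.com/leads/42/table2.png',
    'content_link': 'https://extractor.example.com/leads/42/table2.xlsx',
})
assert table_entry.is_valid(), table_entry.errors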
8 changes: 7 additions & 1 deletion apps/entry/admin.py
@@ -9,6 +9,7 @@
from entry.models import (
Entry,
Attribute,
EntryAttachment,
FilterData,
ExportData,
EntryComment,
@@ -61,7 +62,7 @@ class EntryAdmin(VersionAdmin):
)
autocomplete_fields = (
'lead', 'project', 'created_by', 'modified_by', 'analysis_framework', 'tabular_field',
'image', 'controlled_changed_by', 'verified_by',
'image', 'controlled_changed_by', 'verified_by', 'entry_attachment',
)
ordering = ('project', 'created_by', 'created_at')

@@ -86,4 +87,9 @@ class ProjectEntryLabelAdmin(VersionAdmin):
list_display = ('__str__', 'color')


@admin.register(EntryAttachment)
class EntryAttachmentAdmin(VersionAdmin):
search_fields = ['entry_file_type',]


reversion.register(LeadEntryGroup)
1 change: 0 additions & 1 deletion apps/entry/dataloaders.py
@@ -1,5 +1,4 @@
from collections import defaultdict

from promise import Promise
from django.utils.functional import cached_property
from django.db import models
(Diff truncated here; the remaining changed files in this commit are not shown.)
