From 0cbe417de1e7e6cfc49e81e81aff38b8a459f475 Mon Sep 17 00:00:00 2001 From: billmetangmo <25366207+billmetangmo@users.noreply.github.com> Date: Fri, 30 Aug 2024 00:21:31 +0000 Subject: [PATCH] refactor(#106): use textractor to simplify code --- infra/.env | 3 +- infra/api/config.py | 2 + infra/api/extract.py | 90 +++++++------------------------------------- infra/api/notify.py | 2 +- infra/api/scan.py | 1 - 5 files changed, 18 insertions(+), 80 deletions(-) diff --git a/infra/.env b/infra/.env index 8896c75..1aac069 100644 --- a/infra/.env +++ b/infra/.env @@ -1,5 +1,6 @@ +BRANCH_NAME="teraform-layer" API_KEY="${aws-ssm(region=eu-central-1):/mtchoun-mouh/zulip_api_key}" -BUCKET_NAME="${BRANCH_NAME}-djansang" +BUCKET_NAME="mtchoun-mouh-${BRANCH_NAME}-djansang" ENV="local" LINKS_TABLE="mtchoun-mouh-${BRANCH_NAME}-Link_table" MAINTAINER_MAIL="${aws-ssm(region=eu-central-1):/mtchoun-mouh/maintainer_mail}" diff --git a/infra/api/config.py b/infra/api/config.py index 36dfddf..9aab27f 100644 --- a/infra/api/config.py +++ b/infra/api/config.py @@ -28,6 +28,8 @@ "2025", "s'agit", "recepisse", + "ouvrables", + "communiqué" ] # consulat diff --git a/infra/api/extract.py b/infra/api/extract.py index e0b0d43..57f9a2c 100644 --- a/infra/api/extract.py +++ b/infra/api/extract.py @@ -3,7 +3,7 @@ from config import stopWords, images_url_path import os import zulip - +from textractor import Textractor def Images_in_Bucket(Bucket_Name): """Gets a list of all image names in an S3 bucket. @@ -98,93 +98,29 @@ def Extract_Users(s3BucketName, ImageName): # sourcery no-metrics list: A list of extracted user information dicts. """ region = os.environ["REGION"] - textract = boto3.client("textract", region_name=region) - reponse = textract.detect_document_text( - Document={"S3Object": {"Bucket": s3BucketName, "Name": ImageName}} - ) - # print(reponse) - columns = [] - lines = [] + extractor = Textractor(region_name=os.environ["REGION"]) + document = extractor.detect_document_text(file_source=f"s3://{s3BucketName}/{ImageName}") + filtered_lines = [] errors_tab = [] - for item in reponse["Blocks"]: - if item["BlockType"] == "LINE": - column_found = False - for index, column in enumerate(columns): - bbox_left = item["Geometry"]["BoundingBox"]["Left"] - bbox_right = ( - item["Geometry"]["BoundingBox"]["Left"] - + item["Geometry"]["BoundingBox"]["Width"] - ) - bbox_centre = ( - item["Geometry"]["BoundingBox"]["Left"] - + item["Geometry"]["BoundingBox"]["Width"] / 2 - ) - column_centre = column["left"] + column["right"] / 2 - - if (bbox_centre > column["left"] and bbox_centre < column["right"]) or ( - column_centre > bbox_left and column_centre < bbox_right - ): - # Bbox appears inside the column - lines.append([index, item["Text"]]) - column_found = True - break - if not column_found: - columns.append( - { - "left": item["Geometry"]["BoundingBox"]["Left"], - "right": item["Geometry"]["BoundingBox"]["Left"] - + item["Geometry"]["BoundingBox"]["Width"], - } - ) - lines.append([len(columns) - 1, item["Text"]]) - - lines.sort(key=lambda x: x[0]) + for line in document.lines: + # Vérifie si aucun mot de stop_words n'est présent dans la ligne + if not any(stop_word in str(line) for stop_word in stopWords): + filtered_lines.append(str(line)) + for line in filtered_lines: - filtered_lines = [] - for line in lines: - # print(line[1]) - detected_stop_words = [x for x in stopWords if x in line[1]] - if not detected_stop_words: - filtered_lines.append(line) - # TODO: Create a custom iterator: https://www.programiz.com/python-programming/iterator - iter_lines = iter(filtered_lines) - while True: try: - # get the next item - line = next(iter_lines) - # print(line[1]) - raise IndexError - - UserName = "" - if " " not in line[1]: - - # print ( "prev line:"+line[1]) - line = next(iter_lines) - # Sometimes the number. and names are detected separetely and not in order - # In this case, the next line can not be the name but also number. so we iterate until there is a name - while " " not in line[1]: - line = next(iter_lines) - - # print ( "next line:"+line[1]) - UserName = line[1] - - else: - UserName = line[1].split(". ")[1] if "." in line[1] else line[1] - # print(line) + UserName = line.split(". ")[1:] if "." in line else line + if isinstance(UserName, list): + UserName = ". ".join(UserName) if UserName != "": # We choosed to save all the names in lower former instead of upper because of the DU stopWord # Indeed if upper names , all persons DU like DURAND in their names will not be detected. + print(f"Username={UserName.lower()}") insert_dynamodb(UserName.lower(), ImageName) - print(f"Username={UserName}") - - except StopIteration: - break - except Exception as e: print(e) errors_tab.append({str(e) + " " + str(line): ImageName}) print(errors_tab) - # print(f"related image:{ImageName}") return errors_tab diff --git a/infra/api/notify.py b/infra/api/notify.py index 3790753..8c51412 100644 --- a/infra/api/notify.py +++ b/infra/api/notify.py @@ -34,7 +34,7 @@ def amazone_ses_mail(NAME, RECIPIENT, URL_IMAGE, maintainer=False): """ maintainer_mail = os.environ["MAINTAINER_MAIL"] NAME = NAME.upper() - SENDER = f"Collectif mongulu <{maintainer_mail}>" + SENDER = f"Collectif mongulu " AWS_REGION = "eu-central-1" if not maintainer: diff --git a/infra/api/scan.py b/infra/api/scan.py index 5453474..d15019a 100644 --- a/infra/api/scan.py +++ b/infra/api/scan.py @@ -111,7 +111,6 @@ def scan_consulate_passport_page(): notify_maintainer = False for src in srcs: - src.split("..") link_image_initial = src.split("..")[1] real_link = f"https://www.consulacam-marseille.fr{link_image_initial}" Scan_reponse_link = Scan_Link(real_link)