refactor(#106): use textractor to simplify code

mongulu-cm · Aug 30, 2024 · 0cbe417 · 0cbe417
1 parent 0166f85
commit 0cbe417
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 80 deletions.
diff --git a/infra/.env b/infra/.env
@@ -1,5 +1,6 @@
+BRANCH_NAME="teraform-layer"
 API_KEY="${aws-ssm(region=eu-central-1):/mtchoun-mouh/zulip_api_key}"
-BUCKET_NAME="${BRANCH_NAME}-djansang"
+BUCKET_NAME="mtchoun-mouh-${BRANCH_NAME}-djansang"
 ENV="local"
 LINKS_TABLE="mtchoun-mouh-${BRANCH_NAME}-Link_table"
 MAINTAINER_MAIL="${aws-ssm(region=eu-central-1):/mtchoun-mouh/maintainer_mail}"

diff --git a/infra/api/config.py b/infra/api/config.py
@@ -28,6 +28,8 @@
     "2025",
     "s'agit",
     "recepisse",
+    "ouvrables",
+    "communiqué"
 ]
 
 # consulat

diff --git a/infra/api/extract.py b/infra/api/extract.py
@@ -3,7 +3,7 @@
 from config import stopWords, images_url_path
 import os
 import zulip
-
+from textractor import Textractor
 
 def Images_in_Bucket(Bucket_Name):
     """Gets a list of all image names in an S3 bucket.
@@ -98,93 +98,29 @@ def Extract_Users(s3BucketName, ImageName):  # sourcery no-metrics
         list: A list of extracted user information dicts.
     """
     region = os.environ["REGION"]
-    textract = boto3.client("textract", region_name=region)
-    reponse = textract.detect_document_text(
-        Document={"S3Object": {"Bucket": s3BucketName, "Name": ImageName}}
-    )
-    # print(reponse)
-    columns = []
-    lines = []
+    extractor = Textractor(region_name=os.environ["REGION"])
+    document = extractor.detect_document_text(file_source=f"s3://{s3BucketName}/{ImageName}")
+    filtered_lines = []
     errors_tab = []
-    for item in reponse["Blocks"]:
-        if item["BlockType"] == "LINE":
-            column_found = False
-            for index, column in enumerate(columns):
-                bbox_left = item["Geometry"]["BoundingBox"]["Left"]
-                bbox_right = (
-                    item["Geometry"]["BoundingBox"]["Left"]
-                    + item["Geometry"]["BoundingBox"]["Width"]
-                )
-                bbox_centre = (
-                    item["Geometry"]["BoundingBox"]["Left"]
-                    + item["Geometry"]["BoundingBox"]["Width"] / 2
-                )
-                column_centre = column["left"] + column["right"] / 2
-
-                if (bbox_centre > column["left"] and bbox_centre < column["right"]) or (
-                    column_centre > bbox_left and column_centre < bbox_right
-                ):
-                    # Bbox appears inside the column
-                    lines.append([index, item["Text"]])
-                    column_found = True
-                    break
-            if not column_found:
-                columns.append(
-                    {
-                        "left": item["Geometry"]["BoundingBox"]["Left"],
-                        "right": item["Geometry"]["BoundingBox"]["Left"]
-                        + item["Geometry"]["BoundingBox"]["Width"],
-                    }
-                )
-                lines.append([len(columns) - 1, item["Text"]])
-
-    lines.sort(key=lambda x: x[0])
+    for line in document.lines:
+        # Vérifie si aucun mot de stop_words n'est présent dans la ligne
+        if not any(stop_word in str(line) for stop_word in stopWords):
+            filtered_lines.append(str(line))
+    for line in filtered_lines:
 
-    filtered_lines = []
-    for line in lines:
-        # print(line[1])
-        detected_stop_words = [x for x in stopWords if x in line[1]]
-        if not detected_stop_words:
-            filtered_lines.append(line)
-    # TODO: Create a custom iterator: https://www.programiz.com/python-programming/iterator
-    iter_lines = iter(filtered_lines)
-    while True:
         try:
-            # get the next item
-            line = next(iter_lines)
-            # print(line[1])
-            raise IndexError
-
-            UserName = ""
-            if " " not in line[1]:
-
-                # print ( "prev line:"+line[1])
-                line = next(iter_lines)
-                # Sometimes the number. and names are detected separetely and not in order
-                # In this case, the next line can not be the name but also number. so we iterate until there is a name
-                while " " not in line[1]:
-                    line = next(iter_lines)
-
-                # print ( "next line:"+line[1])
-                UserName = line[1]
-
-            else:
-                UserName = line[1].split(". ")[1] if "." in line[1] else line[1]
-            # print(line)
+            UserName = line.split(". ")[1:] if "." in line else line
+            if isinstance(UserName, list):
+                UserName = ". ".join(UserName)
             if UserName != "":
                 # We choosed to save all the names in lower former instead of upper because of the DU stopWord
                 # Indeed if upper names , all persons DU like DURAND in their names will not be detected.
+                print(f"Username={UserName.lower()}")
                 insert_dynamodb(UserName.lower(), ImageName)
-                print(f"Username={UserName}")
-
-        except StopIteration:
-            break
-
         except Exception as e:
             print(e)
             errors_tab.append({str(e) + " " + str(line): ImageName})
             print(errors_tab)
-            # print(f"related image:{ImageName}")
 
     return errors_tab
 

diff --git a/infra/api/notify.py b/infra/api/notify.py
@@ -34,7 +34,7 @@ def amazone_ses_mail(NAME, RECIPIENT, URL_IMAGE, maintainer=False):
     """
     maintainer_mail = os.environ["MAINTAINER_MAIL"]
     NAME = NAME.upper()
-    SENDER = f"Collectif mongulu <{maintainer_mail}>"
+    SENDER = f"Collectif mongulu <[email protected]>"
     AWS_REGION = "eu-central-1"
 
     if not maintainer:

diff --git a/infra/api/scan.py b/infra/api/scan.py
@@ -111,7 +111,6 @@ def scan_consulate_passport_page():
     notify_maintainer = False
 
     for src in srcs:
-        src.split("..")
         link_image_initial = src.split("..")[1]
         real_link = f"https://www.consulacam-marseille.fr{link_image_initial}"
         Scan_reponse_link = Scan_Link(real_link)