Skip to content

Commit

Permalink
refactor(#106): use textractor to simplify code
Browse files Browse the repository at this point in the history
  • Loading branch information
billmetangmo committed Aug 30, 2024
1 parent 0166f85 commit 0cbe417
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 80 deletions.
3 changes: 2 additions & 1 deletion infra/.env
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
BRANCH_NAME="teraform-layer"
API_KEY="${aws-ssm(region=eu-central-1):/mtchoun-mouh/zulip_api_key}"
BUCKET_NAME="${BRANCH_NAME}-djansang"
BUCKET_NAME="mtchoun-mouh-${BRANCH_NAME}-djansang"
ENV="local"
LINKS_TABLE="mtchoun-mouh-${BRANCH_NAME}-Link_table"
MAINTAINER_MAIL="${aws-ssm(region=eu-central-1):/mtchoun-mouh/maintainer_mail}"
Expand Down
2 changes: 2 additions & 0 deletions infra/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
"2025",
"s'agit",
"recepisse",
"ouvrables",
"communiqué"
]

# consulat
Expand Down
90 changes: 13 additions & 77 deletions infra/api/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from config import stopWords, images_url_path
import os
import zulip

from textractor import Textractor

def Images_in_Bucket(Bucket_Name):
"""Gets a list of all image names in an S3 bucket.
Expand Down Expand Up @@ -98,93 +98,29 @@ def Extract_Users(s3BucketName, ImageName): # sourcery no-metrics
list: A list of extracted user information dicts.
"""
region = os.environ["REGION"]
textract = boto3.client("textract", region_name=region)
reponse = textract.detect_document_text(
Document={"S3Object": {"Bucket": s3BucketName, "Name": ImageName}}
)
# print(reponse)
columns = []
lines = []
extractor = Textractor(region_name=os.environ["REGION"])
document = extractor.detect_document_text(file_source=f"s3://{s3BucketName}/{ImageName}")
filtered_lines = []
errors_tab = []
for item in reponse["Blocks"]:
if item["BlockType"] == "LINE":
column_found = False
for index, column in enumerate(columns):
bbox_left = item["Geometry"]["BoundingBox"]["Left"]
bbox_right = (
item["Geometry"]["BoundingBox"]["Left"]
+ item["Geometry"]["BoundingBox"]["Width"]
)
bbox_centre = (
item["Geometry"]["BoundingBox"]["Left"]
+ item["Geometry"]["BoundingBox"]["Width"] / 2
)
column_centre = column["left"] + column["right"] / 2

if (bbox_centre > column["left"] and bbox_centre < column["right"]) or (
column_centre > bbox_left and column_centre < bbox_right
):
# Bbox appears inside the column
lines.append([index, item["Text"]])
column_found = True
break
if not column_found:
columns.append(
{
"left": item["Geometry"]["BoundingBox"]["Left"],
"right": item["Geometry"]["BoundingBox"]["Left"]
+ item["Geometry"]["BoundingBox"]["Width"],
}
)
lines.append([len(columns) - 1, item["Text"]])

lines.sort(key=lambda x: x[0])
for line in document.lines:
# Vérifie si aucun mot de stop_words n'est présent dans la ligne
if not any(stop_word in str(line) for stop_word in stopWords):
filtered_lines.append(str(line))
for line in filtered_lines:

filtered_lines = []
for line in lines:
# print(line[1])
detected_stop_words = [x for x in stopWords if x in line[1]]
if not detected_stop_words:
filtered_lines.append(line)
# TODO: Create a custom iterator: https://www.programiz.com/python-programming/iterator
iter_lines = iter(filtered_lines)
while True:
try:
# get the next item
line = next(iter_lines)
# print(line[1])
raise IndexError

UserName = ""
if " " not in line[1]:

# print ( "prev line:"+line[1])
line = next(iter_lines)
# Sometimes the number. and names are detected separetely and not in order
# In this case, the next line can not be the name but also number. so we iterate until there is a name
while " " not in line[1]:
line = next(iter_lines)

# print ( "next line:"+line[1])
UserName = line[1]

else:
UserName = line[1].split(". ")[1] if "." in line[1] else line[1]
# print(line)
UserName = line.split(". ")[1:] if "." in line else line
if isinstance(UserName, list):
UserName = ". ".join(UserName)
if UserName != "":
# We choosed to save all the names in lower former instead of upper because of the DU stopWord
# Indeed if upper names , all persons DU like DURAND in their names will not be detected.
print(f"Username={UserName.lower()}")
insert_dynamodb(UserName.lower(), ImageName)
print(f"Username={UserName}")

except StopIteration:
break

except Exception as e:
print(e)
errors_tab.append({str(e) + " " + str(line): ImageName})
print(errors_tab)
# print(f"related image:{ImageName}")

return errors_tab

Expand Down
2 changes: 1 addition & 1 deletion infra/api/notify.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def amazone_ses_mail(NAME, RECIPIENT, URL_IMAGE, maintainer=False):
"""
maintainer_mail = os.environ["MAINTAINER_MAIL"]
NAME = NAME.upper()
SENDER = f"Collectif mongulu <{maintainer_mail}>"
SENDER = f"Collectif mongulu <[email protected]>"
AWS_REGION = "eu-central-1"

if not maintainer:
Expand Down
1 change: 0 additions & 1 deletion infra/api/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def scan_consulate_passport_page():
notify_maintainer = False

for src in srcs:
src.split("..")
link_image_initial = src.split("..")[1]
real_link = f"https://www.consulacam-marseille.fr{link_image_initial}"
Scan_reponse_link = Scan_Link(real_link)
Expand Down

0 comments on commit 0cbe417

Please sign in to comment.