Skip to content

Commit

Permalink
Added getAttachments param and added handling for the text/plain case
Browse files: browse the repository at this point in the history
  • Loading branch information
danielecalda committed Sep 30, 2023
1 parent d5748f4 commit 10c8dcc
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
24 changes: 17 additions & 7 deletions email-parser/app/imap/imap_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
class AsyncEmailExtraction(threading.Thread):

def __init__(self, mail_server, port, username, password, timestamp, datasource_id, folder, schedule_id, tenant_id,
index_acl):
index_acl, get_attachments):
super(AsyncEmailExtraction, self).__init__()

self.mail_server = mail_server
Expand All @@ -51,6 +51,7 @@ def __init__(self, mail_server, port, username, password, timestamp, datasource_
self.schedule_id = schedule_id
self.tenant_id = tenant_id
self.index_acl = index_acl
self.get_attachments = get_attachments

self.status_logger = logging.getLogger('email-logger')

Expand Down Expand Up @@ -109,29 +110,38 @@ def extract(self):
"email": struct_msg
}

body = struct_msg['body']

if struct_msg['date'] > self.timestamp:

payload = {
"datasourceId": self.datasource_id,
"contentId": str(msg_id).replace("<", "").replace(">", ""),
"parsingDate": int(end_timestamp),
"rawContent": "",
"rawContent": body,
"datasourcePayload": datasource_payload,
"resources": {
"binaries": [],
"splitBinaries": True
},
"scheduleId": self.schedule_id,
"tenantId": self.tenant_id,
}

if self.index_acl:
payload["acl"] = {
"email": acl_list
"email": acl_list
}
else:
payload["acl"] = {}

if self.get_attachments:
payload["resource"] = {
"binaries": binaries,
"splitBinaries": True
}
else:
payload["resource"] = {
"binaries": [],
"splitBinaries": True
}

try:
post_message(ingestion_url, payload, 10)
# self.status_logger.info(payload)
Expand Down
5 changes: 4 additions & 1 deletion email-parser/app/imap/util/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,12 @@ def parse_email(fetched_msg):
if msg.is_multipart():
for j, part in enumerate(msg.walk()):
content_type = part.get_content_type()
if content_type == 'text/html':
logger.info(content_type)
if content_type == 'text/html' or content_type == "text/plain":
charset = part.get_content_charset()
# decode the base64 unicode bytestring into plain text
raw_body = part.get_payload(decode=True).decode(encoding=charset, errors="ignore")
logger.info('ok')
# NOTE: there is no break here; the loop keeps walking the parts, so a later text/html or text/plain part overwrites raw_body
elif content_type in ["image/png", "image/jpg", "image/jpeg", "application/pdf", "application/msword"]:
data = get_as_base64(part.get_payload(decode=True))
Expand All @@ -70,6 +72,7 @@ def parse_email(fetched_msg):
"data": data
}
binaries.append(binary)

else:
# not multipart - i.e. plain text, no attachments
charset = msg.get_content_charset()
Expand Down
7 changes: 5 additions & 2 deletions email-parser/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import threading
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
from imap.imap_extraction import AsyncEmailExtraction

app = FastAPI()
Expand All @@ -33,7 +34,8 @@ class ImapRequest(BaseModel):
scheduleId: str
folder: str
tenantId: str
indexAcl: bool
indexAcl: Optional[bool] = False
getAttachments: Optional[bool] = False


@app.post("/execute")
Expand All @@ -51,9 +53,10 @@ def get_data(request: ImapRequest):
schedule_id = request["scheduleId"]
tenant_id = request["tenantId"]
index_acl = request["indexAcl"]
get_attachments = request["getAttachments"]

email_extraction_task = AsyncEmailExtraction(mail_server, port, username, password, timestamp, datasource_id,
folder, schedule_id, tenant_id, index_acl)
folder, schedule_id, tenant_id, index_acl, get_attachments)

thread = threading.Thread(target=email_extraction_task.extract())
thread.start()
Expand Down

0 comments on commit 10c8dcc

Please sign in to comment.