-
Notifications
You must be signed in to change notification settings - Fork 0
/
api.py
133 lines (106 loc) · 3.81 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import logging
import os
import shutil
import uuid
from typing import Annotated
from fastapi import FastAPI, Depends, status, HTTPException, BackgroundTasks, Response
from pydantic import BaseModel, Field
from starlette.responses import JSONResponse
import ocr
from aws import aws
from utils import task
from utils.settings import ApiSettings, api_settings
# Application entry point: routes below are registered on this instance.
app = FastAPI()
# Install a default handler on the root logger, then raise its level to INFO
# so informational messages from this module (and background tasks) are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
class StartPayload(BaseModel):
    """Request body for ``POST /``: names the S3 object to OCR."""
    # File name inside the configured S3 input folder; must be non-empty.
    file: str = Field(min_length=1)
# Warn loudly at startup when the service is configured to bypass OCR:
# every file will be reported as completed without any work being done.
# (Fixed grammar in the message: "proceed" -> "processed".)
if api_settings().skip_processing:
    logging.warning("SKIP_PROCESSING is active, files will always be marked as completed without being processed")
@app.post("/")
def start(
    payload: StartPayload,
    settings: Annotated[ApiSettings, Depends(api_settings)],
    background_tasks: BackgroundTasks,
):
    """Validate the requested file and schedule its OCR processing.

    Responds 204 when the task is queued, 400 for non-PDF input, and 422
    when the file is absent from the configured S3 input location.
    """
    # Guard: only PDF inputs are supported.
    if not payload.file.endswith('.pdf'):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={"message": "input must be a PDF file"},
        )

    client = aws.connect(settings)
    key = f'{settings.s3_input_folder}{payload.file}'
    # Guard: the object must already exist in the input bucket.
    if not client.exists_file(settings.s3_input_bucket, key):
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail={"message": "file does not exist"},
        )

    # Defer the actual OCR work to a background task keyed by file name.
    task.start(payload.file, background_tasks, lambda: process(payload, client, settings))
    return Response(status_code=status.HTTP_204_NO_CONTENT)
class CollectPayload(BaseModel):
    """Request body for ``POST /collect``: names the file whose result is polled."""
    # File name previously passed to ``POST /``; must be non-empty.
    file: str = Field(min_length=1)
@app.post("/collect")
def collect(
    payload: CollectPayload,
):
    """Poll the status of a previously started OCR task.

    Responds 422 when no task exists for the file; otherwise 200 with
    ``has_finished`` plus either the result ``data`` or an ``error`` marker.
    """
    result = task.collect_result(payload.file)

    # No result and no running task: OCR was never started for this file.
    if result is None and not task.has_task(payload.file):
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail={"message": "OCR is not running for this file"},
        )

    if result is None:
        # Task exists but has not produced a result yet.
        logging.info(f"Processing of '{payload.file}' has not yet finished.")
        body = {
            "has_finished": False,
            "data": None,
        }
    elif result.ok:
        logging.info(f"Processing of '{payload.file}' has been successful.")
        body = {
            "has_finished": True,
            "data": result.value,
        }
    else:
        logging.info(f"Processing of '{payload.file}' has failed.")
        body = {
            "has_finished": True,
            "error": "Internal Server Error",
        }
    return JSONResponse(status_code=status.HTTP_200_OK, content=body)
def process(
    payload: StartPayload,
    aws_client: aws.Client,
    settings: ApiSettings,
):
    """Download the input PDF from S3, run OCR on it, and upload the output.

    Runs as a background task scheduled by ``start`` — it is not a FastAPI
    route, so ``settings`` is a plain ``ApiSettings`` (a ``Depends`` marker
    here would never be resolved and was misleading).

    :param payload: validated request naming the file to process.
    :param aws_client: connected AWS client used for S3 and Textract access.
    :param settings: service configuration (buckets, folders, tmp path, flags).
    """
    if settings.skip_processing:
        # SKIP_PROCESSING short-circuits the pipeline: the task completes
        # immediately without touching S3 or running OCR (useful for testing).
        return

    # Work inside a unique temp directory so concurrent tasks never collide.
    task_id = f"{uuid.uuid4()}"
    tmp_dir = os.path.join(settings.tmp_path, task_id)
    os.makedirs(tmp_dir, exist_ok=True)
    input_path = os.path.join(tmp_dir, "input.pdf")
    output_path = os.path.join(tmp_dir, "output.pdf")
    try:
        aws.load_file(
            aws_client.bucket(settings.s3_input_bucket),
            f'{settings.s3_input_folder}{payload.file}',
            input_path,
        )
        ocr.process(
            input_path,
            output_path,
            tmp_dir,
            aws_client.textract,
            settings.confidence_threshold,
            settings.use_aggressive_strategy,
        )
        aws.store_file(
            aws_client.bucket(settings.s3_output_bucket),
            f'{settings.s3_output_folder}{payload.file}',
            output_path,
        )
    finally:
        # Always remove the temp directory — previously a failure in download,
        # OCR, or upload leaked it on disk. ignore_errors avoids masking the
        # original exception with a cleanup error.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    # NOTE(review): the empty tuple presumably becomes the task's result
    # payload (``result.value`` in ``collect``) — confirm against utils.task
    # before changing it to a bare return.
    return ()