diff --git a/example/clipping_test-1.png b/example/clipping_test-1.png new file mode 100644 index 00000000..c8bc709c Binary files /dev/null and b/example/clipping_test-1.png differ diff --git a/example/clipping_test.pdf b/example/clipping_test.pdf new file mode 100644 index 00000000..2b6a8f21 Binary files /dev/null and b/example/clipping_test.pdf differ diff --git a/src/app/api/v1/router.py b/src/app/api/v1/router.py index 090a44f4..a77a4495 100644 --- a/src/app/api/v1/router.py +++ b/src/app/api/v1/router.py @@ -38,7 +38,30 @@ class BadRequestResponse(BaseModel): }, ) def post_create_pngs(request: PNGRequest) -> PNGResponse: - """Create PNGs from the given data.""" + """Create PNG images from a PDF stored in the S3 bucket. + + This endpoint generates PNG images from each page of a specified PDF document stored in the AWS S3 bucket. + The PDF file must be accessible in the bucket with a valid filename provided in the request. + + ### Request Body + - **request** (`PNGRequest`): Contains the `filename` of the PDF document in the S3 bucket from which PNGs + should be generated. + + ### Returns + - **PNGResponse**: Response containing a list of keys (filenames) for the generated PNG images stored in the + S3 bucket. + + ### Status Codes + - **200 OK**: PNG images were successfully created and stored in the S3 bucket. + - **400 Bad Request**: The request format or content is invalid. Verify that `filename` is correctly specified. + - **404 Not Found**: PDF file not found in S3 bucket. + - **500 Internal Server Error**: An error occurred on the server while creating PNGs. + + ### Additional Information + - The endpoint connects to AWS S3 to retrieve the specified PDF, converts its pages to PNGs, and stores + the generated images back in S3. Ensure the PDF file exists in the S3 bucket and is accessible before + making a request. 
+ """ return create_pngs(request.filename) @@ -58,7 +81,37 @@ def post_create_pngs(request: PNGRequest) -> PNGResponse: def post_extract_data( extract_data_request: ExtractDataRequest, ) -> ExtractCoordinatesResponse | ExtractTextResponse | ExtractNumberResponse: - """Extract data from the given PNGs.""" + """Extract specified data from a given document based on the bounding box coordinates and format. + + Behavior of the data extraction from the specified bounding box is the following: extraction on a per-letter + basis, which means that as soon as the specified bounding box overlaps (partially or fully) with a letter + or number, then this character is added to the extracted text. This behavior is consistent with the + clipping behavior of the `PyMuPDF` library. + + ### Prerequisites + Ensure that the PDF file has been processed by the create_pngs endpoint first. + + ### Request Body + - **extract_data_request**: Instance of `ExtractDataRequest`, containing file details, page number, bounding + box, and data format. The bounding box in PNG coordinates helps locate the region to extract data from. + + ### Returns + The endpoint responds with one of the following response models based on the extracted data: + - **ExtractCoordinatesResponse**: If geographic coordinates are extracted. + - **ExtractTextResponse**: If text content is extracted. + - **ExtractNumberResponse**: If numerical data is extracted. + + ### Status Codes + - **200 OK**: Successful extraction, returning the specified data type. + - **400 Bad Request**: Input request was invalid, typically due to misformatted or missing parameters. + - **404 Not Found**: Requested data could not be found within the specified bounding box or page. + - **500 Internal Server Error**: An error occurred on the server side during data extraction. + + ### Error Handling + Known `ValueError`s (e.g., invalid input data) result in a `400 Bad Request` response with a relevant error + message. 
+ For other errors, the endpoint returns a `500 Internal Server Error`. + """ try: # Extract the data based on the request response = extract_data(extract_data_request) diff --git a/src/app/common/schemas.py b/src/app/common/schemas.py index 6467882d..d6c5e830 100644 --- a/src/app/common/schemas.py +++ b/src/app/common/schemas.py @@ -17,7 +17,17 @@ def validate_filename(value: str) -> str: - """Ensure the filename is not empty.""" + """Ensure the filename is not empty. + + Args: + value (str): The filename to validate. + + Returns: + str: The validated filename. + + Raises: + ValueError: If the filename is empty + """ if value == "": raise ValueError("Filename must not be empty.") return value @@ -31,18 +41,28 @@ class PNGRequest(BaseModel): @field_validator("filename", mode="before") @classmethod def validate_filename(cls, value: str) -> str: + """Ensure the filename is not empty.""" return validate_filename(value) class Config: """Make to allow using non-standard types like Path.""" - arbitrary_types_allowed = True # This allows using non-standard types like Path + arbitrary_types_allowed: bool = True # This allows using non-standard types like Path class PNGResponse(BaseModel): - """Response schema for the create_pngs endpoint.""" + """Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage. - keys: list[str] # keys in the S3 bucket + This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket, + enabling users to retrieve or reference them as needed. + """ + + keys: list[str] = Field( + ..., + description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. 
Each key + allows access to a specific file within the bucket.""", + example=["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"], + ) ######################################################################################################################## @@ -59,19 +79,44 @@ class FormatTypes(str, Enum): class BoundingBox(BaseModel): - """Bounding box schema.""" + """Bounding box schema for defining a rectangular area within an image. - x0: float = Field(..., example=0.0) - y0: float = Field(..., example=0.0) - x1: float = Field(..., example=100.0) - y1: float = Field(..., example=100.0) + This schema represents the coordinates of the box’s corners, which can be used + to specify an area of interest in image processing tasks. Coordinates are + defined with the origin at the top-left of the image. Coordinates are in pixels. + """ + + x0: float = Field( + ..., + description="""The x-coordinate of the top-left corner of the bounding box. This value marks the + horizontal starting point of the box.""", + example=0.0, + ) + y0: float = Field( + ..., + description="""The y-coordinate of the top-left corner of the bounding box. This value marks the vertical + starting point of the box.""", + example=0.0, + ) + x1: float = Field( + ..., + description="""The x-coordinate of the bottom-right corner of the bounding box. This value marks the + horizontal endpoint of the box.""", + example=100.0, + ) + y1: float = Field( + ..., + description="""The y-coordinate of the bottom-right corner of the bounding box. 
This value marks the vertical + endpoint of the box.""", + example=100.0, + ) @field_validator("x0", "y0", "x1", "y1") @classmethod - def page_number_must_be_positive(cls, v: int) -> int: - """Validate that the page number is positive.""" + def bbox_corners_must_be_positive(cls, v: int) -> int: + """Validate that the edges of the bounding box are positive.""" if v < 0.0: - raise ValueError("Bounding box coordinate must be a positive integer") + raise ValueError("Bounding box coordinates must be positive") return v def rescale( @@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox": class Coordinates(BaseModel): - """Coordinates schema.""" + """Coordinates schema for representing geographical data points. - east: float = Field(..., example=1.0) - north: float = Field(..., example=2.0) - projection: str = Field(..., example="LV95") + This schema defines the format for specifying location data using east/north coordinates + along with the projection system used. + """ + + east: float = Field( + ..., + description="""Easting coordinate. The value should be in the units of the specified projection system.""", + example=1.0, + ) + north: float = Field( + ..., + description="""Northing coordinate. The value should be in the units of the specified projection system.""", + example=2.0, + ) + projection: str = Field( + ..., + description="""Projection system used to reference the coordinates. This defines the coordinate reference + system, such as 'LV95' for Swiss coordinate systems.""", + example="LV95", + ) class ExtractDataRequest(ABC, BaseModel): - """Request schema for the extract_data endpoint. + """Request schema for the `extract_data` endpoint. + + **Requirements:** + Before using this schema, ensure that the PDF file has been processed by the create_pngs endpoint first. + + **Coordinate Systems:** + - **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward + and y downward. 
+ + ### Fields + Each field below includes inline examples to aid users in creating requests. See `json_schema_extra` + for a complete example. + + **Attributes:** + - **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"` + - **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1` + - **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the + top-left, with x increasing rightward and y increasing downward. + - Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}` + - **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`. + + ### Validation + Custom validators ensure data integrity: + - **Filename Validator:** Ensures filename is not empty. + - **Page Number Validator:** Confirms page number is positive. + - **Format Validator:** Checks format is valid as per `FormatTypes`. + + The bounding box should be provided in PNG coordinates. Each field in the Pydantic model can have an example parameter, which provides an inline example for that specific field. """ - filename: Path = Field(..., example=Path("document.png")) - page_number: int = Field(..., example=1) # 1-based index - bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}) - format: FormatTypes = Field(..., example=FormatTypes.COORDINATES.value) + filename: Path = Field( + ..., + description="""Path to the input PDF document file that contains the data to be extracted. This should be + a valid file path, and the file should be accessible to the API.""", + example=Path("document.pdf"), + ) + page_number: int = Field( + ..., + description="""Page number within the document where the extraction is to be performed. 
This is a 1-based + index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""", + example=1, + ) + bbox: BoundingBox = Field( + ..., + description="""Bounding box defining the area for data extraction within the PNG version of the specified + PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the + right and y increases downward. This box should be provided in PNG coordinates, and any + transformations to PDF coordinates are managed internally. + """, + example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}, + ) + format: FormatTypes = Field( + ..., + description="""Specifies the desired format for extracted data, allowing for options like `coordinates` or + other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""", + example=FormatTypes.COORDINATES.value, + ) @field_validator("filename", mode="before") @classmethod def validate_filename(cls, value: str) -> str: + """Ensure the filename is not empty.""" return validate_filename(value) @field_validator("page_number") @@ -183,20 +296,37 @@ class Config: class ExtractDataResponse(ABC, BaseModel): - """Response schema for the extract_data endpoint.""" + """Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box. + + This abstract base class provides a bounding box field for data localization and an abstract property + `response_type` to be implemented by subclasses, indicating the type of extracted content. + """ - bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}) + bbox: BoundingBox = Field( + ..., + description="""Bounding box coordinates that define the area within the document where data was extracted. 
+ The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""", + example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}, + ) @property @abstractmethod def response_type(self): - """Abstract property to be implemented by subclasses to define response type.""" + """Abstract property to be implemented by subclasses to define the type of response content.""" class ExtractCoordinatesResponse(ExtractDataResponse): - """Response schema for the extract_data endpoint.""" + """Response schema for the `extract_data` endpoint when returning geographic coordinates. - coordinates: Coordinates = Field(..., example={"east": 1.0, "north": 2.0, "page": 1, "projection": "LV95"}) + This schema includes a `coordinates` field with east/north values and projection information. + """ + + coordinates: Coordinates = Field( + ..., + description="""Geographical coordinates extracted from the document, including east and north values, + and projection type.""", + example={"east": 1.0, "north": 2.0, "projection": "LV95"}, + ) @property def response_type(self): @@ -204,9 +334,16 @@ def response_type(self): class ExtractTextResponse(ExtractDataResponse): - """Response schema for the extract_data endpoint.""" + """Response schema for the `extract_data` endpoint when returning extracted text content. + + This schema includes a `text` field with the extracted textual content from the specified bounding box. + """ - text: str = Field(..., example="text") + text: str = Field( + ..., + description="""Text content extracted from the specified bounding box within the document.""", + example="text", + ) @property def response_type(self): @@ -214,9 +351,18 @@ def response_type(self): class ExtractNumberResponse(ExtractDataResponse): - """Response schema for the extract_data endpoint.""" + """Response schema for the `extract_data` endpoint when returning numerical data. 
+ + This schema includes a `number` field for extracted numeric content, such as measurements or other + quantitative data. + """ - number: float = Field(..., example=1.0) + number: float = Field( + ..., + description="""Numeric value extracted from the specified bounding box within the document, representing a + measurement or quantitative data.""", + example=1.0, + ) @property def response_type(self): diff --git a/src/app/main.py b/src/app/main.py index 8b184b59..5da7dde0 100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -76,7 +76,18 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE #################################################################################################### @app.get("/health", tags=["health"]) def get_health(): - """Check the health of the application.""" + """Check the health of the application. + + This endpoint provides a simple health check to verify that the application is up and running. + It can be used for monitoring purposes to ensure the API is responsive. + + ### Returns + - **200 OK**: The application is running and responsive. + - **Response Body**: Returns a plain text message indicating the health status, typically `"Healthy"`. + + ### Usage + Use this endpoint as a basic check in monitoring or load balancer setups to assess application uptime. + """ return "Healthy" @@ -85,7 +96,19 @@ def get_health(): #################################################################################################### @app.get("/version") def get_version(): - """Return the version of the application.""" + """Return the current version of the application. + + This endpoint provides the current application version as specified in the environment variables. + Useful for tracking deployed versions in staging or production environments. + + ### Returns + - **200 OK**: The version information was successfully retrieved. 
+ - **Response Body**: JSON object with the application version, e.g., `{"version": "1.0.0"}`. + + ### Notes + Ensure the `APP_VERSION` environment variable is set; otherwise, the response may contain `null` or an + empty version value. + """ return {"version": os.getenv("APP_VERSION")} diff --git a/tests/test_data_extraction_from_bbox.py b/tests/test_data_extraction_from_bbox.py index f387552f..3f391449 100644 --- a/tests/test_data_extraction_from_bbox.py +++ b/tests/test_data_extraction_from_bbox.py @@ -20,11 +20,17 @@ TEST_PDF_PATH = Path(__file__).parent.parent / "example" / "example_borehole_profile.pdf" TEST_PNG_KEY = Path("dataextraction/sample-1.png") TEST_PNG_PATH = Path(__file__).parent.parent / "example" / "sample-1.png" + TEST_ROTATED_PNG_KEY = Path("dataextraction/16132-1.png") TEST_ROTATED_PNG_PATH = Path(__file__).parent.parent / "example" / "16132-1.png" TEST_ROTATED_PDF_KEY = Path("16132.pdf") TEST_ROTATED_PDF_PATH = Path(__file__).parent.parent / "example" / "16132.pdf" # Rotated PDF of 270 degrees +TEST_CLIPPING_BEHAVIOR_PDF_PATH = Path(__file__).parent.parent / "example" / "clipping_test.pdf" +TEST_CLIPPING_BEHAVIOR_PDF_KEY = Path("clipping_test.pdf") +TEST_CLIPPING_BEHAVIOR_PNG_PATH = Path(__file__).parent.parent / "example" / "clipping_test-1.png" +TEST_CLIPPING_BEHAVIOR_PNG_KEY = Path("dataextraction/clipping_test-1.png") + def get_default_small_coordinate_request(): """Return a default ExtractDataRequest for coordinates.""" @@ -63,6 +69,11 @@ def upload_test_pdf(s3_client): s3_client.upload_file( Filename=str(TEST_ROTATED_PDF_PATH), Bucket=config.test_bucket_name, Key=str(TEST_ROTATED_PDF_KEY) ) + s3_client.upload_file( + Filename=str(TEST_CLIPPING_BEHAVIOR_PDF_PATH), + Bucket=config.test_bucket_name, + Key=str(TEST_CLIPPING_BEHAVIOR_PDF_KEY), + ) @pytest.fixture(scope="function") @@ -72,6 +83,11 @@ def upload_test_png(s3_client, upload_test_pdf): s3_client.upload_file( Filename=str(TEST_ROTATED_PNG_PATH), Bucket=config.test_bucket_name, 
Key=str(TEST_ROTATED_PNG_KEY) ) + s3_client.upload_file( + Filename=str(TEST_CLIPPING_BEHAVIOR_PNG_PATH), + Bucket=config.test_bucket_name, + Key=str(TEST_CLIPPING_BEHAVIOR_PNG_KEY), + ) def test_load_pdf_from_aws(upload_test_pdf): @@ -130,6 +146,111 @@ def test_extract_text_success(test_client: TestClient, upload_test_pdf, upload_t assert json_response["text"] == target_text +def test_clipping_behavior(test_client: TestClient, upload_test_pdf, upload_test_png): + """Test how the extract_data endpoint clips text for bounding boxes fully or partially covering it.""" + #################################################################################################### + ### Extract Data on Normal PDF with bounding box with all text inside + #################################################################################################### + target_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 311, "y0": 269, "x1": 821, "y1": 704}, # pixels + format=FormatTypes.TEXT, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + #################################################################################################### + ### Extract Data on Normal PDF with bounding box with text on the boundary (e.g., the bounding box line is on the + ### text) + #################################################################################################### + target_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 311, "y0": 299, "x1": 813, "y1": 704}, # pixels + format=FormatTypes.TEXT, + ) + response = 
test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + #################################################################################################### + ### Extract Data on Normal PDF with bounding box with only a few words selected out of the text. + ### Here the text was done with multiple Text Boxes (e.g., each line is a different Text Box). + #################################################################################################### + target_text = "Lorem ipsum" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 311, "y0": 269, "x1": 611, "y1": 336}, # pixels + format=FormatTypes.TEXT, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + #################################################################################################### + ### Extract Data on Normal PDF with bounding box with only a few words selected out of the text. + ### Here the text was done with one Text Box. 
+ #################################################################################################### + target_text = "Lorem ipsum" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 1848, "y0": 242, "x1": 2145, "y1": 303}, # pixels + format=FormatTypes.TEXT, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + #################################################################################################### + ### Extract Data on Normal PDF with bounding box with only one part of one word selected out of the text. + #################################################################################################### + target_text = "Lo" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 315, "y0": 281, "x1": 371, "y1": 330}, # pixels + format=FormatTypes.TEXT, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + target_text = "Lorem" + + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 315, "y0": 300, "x1": 465, "y1": 330}, # pixels + format=FormatTypes.TEXT, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert json_response["text"] == target_text + + def test_extract_text_empty(test_client: TestClient, upload_test_pdf, upload_test_png): """Test the extract_data endpoint with a valid request.""" request = ExtractDataRequest( @@ -147,6 +268,9 @@ def 
test_extract_text_empty(test_client: TestClient, upload_test_pdf, upload_tes def test_extract_coordinate_success(test_client: TestClient, upload_test_pdf, upload_test_png): """Test the extract_data endpoint with a valid request.""" + #################################################################################################### + ### Extract Data on Normal PDF with LV03 coordinates + #################################################################################################### request = get_default_coordinate_request() response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) assert response.status_code == 200 @@ -157,6 +281,42 @@ def test_extract_coordinate_success(test_client: TestClient, upload_test_pdf, up assert json_response["coordinates"]["north"] == 157500 assert json_response["coordinates"]["projection"] == "LV03" + #################################################################################################### + ### Extract Data on the clipping test PDF with LV03 coordinates + #################################################################################################### + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 1625, "y0": 900, "x1": 2819, "y1": 968}, # pixels + format=FormatTypes.COORDINATES, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert "coordinates" in json_response + assert json_response["coordinates"]["east"] == 684592.0 + assert json_response["coordinates"]["north"] == 252857.0 + assert json_response["coordinates"]["projection"] == "LV03" + + #################################################################################################### + ### Extract Data on the clipping test PDF with LV95 coordinates + 
#################################################################################################### + request = ExtractDataRequest( + filename=TEST_CLIPPING_BEHAVIOR_PDF_KEY.name, + page_number=1, + bbox={"x0": 1625, "y0": 1000, "x1": 2819, "y1": 1068}, # pixels + format=FormatTypes.COORDINATES, + ) + response = test_client.post("/api/V1/extract_data", content=request.model_dump_json()) + assert response.status_code == 200 + json_response = response.json() + assert "bbox" in json_response + assert "coordinates" in json_response + assert json_response["coordinates"]["east"] == 2682834.0 + assert json_response["coordinates"]["north"] == 1253400.0 + assert json_response["coordinates"]["projection"] == "LV95" + def test_incomplete_request(test_client: TestClient, upload_test_pdf, upload_test_png): """Test the extract_data endpoint with an incomplete request.""" @@ -200,7 +360,7 @@ def test_invalid_bbox(test_client: TestClient, upload_test_pdf, upload_test_png) request_json["bbox"] = {"x0": 0, "y0": 0, "x1": 100, "y1": -100.0} response = test_client.post("/api/V1/extract_data", json=request_json) assert response.status_code == 400 - assert response.json() == {"detail": "Bounding box coordinate must be a positive integer"} + assert response.json() == {"detail": "Bounding box coordinates must be positive"} def test_invalid_pdf(test_client: TestClient, upload_test_pdf, upload_test_png):