Merge pull request #103 from swisstopo/LGVISIUM-89

LGVISIUM-89: Improved clipping behavior documentation
swisstopo · Nov 7, 2024 · a5d0947 · a5d0947 · github-actions · Nov 7, 2024
2 parents 498a4e3 + 530702c
commit a5d0947
Show file tree

Hide file tree

Showing 6 changed files with 417 additions and 35 deletions.
diff --git a/example/clipping_test-1.png b/example/clipping_test-1.png
diff --git a/example/clipping_test.pdf b/example/clipping_test.pdf
diff --git a/src/app/api/v1/router.py b/src/app/api/v1/router.py
@@ -38,7 +38,30 @@ class BadRequestResponse(BaseModel):
     },
 )
 def post_create_pngs(request: PNGRequest) -> PNGResponse:
-    """Create PNGs from the given data."""
+    """Create PNG images from a PDF stored in the S3 bucket.
+
+    This endpoint generates PNG images from each page of a specified PDF document stored in the AWS S3 bucket.
+    The PDF file must be accessible in the bucket with a valid filename provided in the request.
+
+    ### Request Body
+    - **request** (`PNGRequest`): Contains the `filename` of the PDF document in the S3 bucket from which PNGs
+    should be generated.
+
+    ### Returns
+    - **PNGResponse**: Response containing a list of keys (filenames) for the generated PNG images stored in the
+    S3 bucket.
+
+    ### Status Codes
+    - **200 OK**: PNG images were successfully created and stored in the S3 bucket.
+    - **400 Bad Request**: The request format or content is invalid. Verify that `filename` is correctly specified.
+    - **404 Not Found**: PDF file not found in S3 bucket.
+    - **500 Internal Server Error**: An error occurred on the server while creating PNGs.
+
+    ### Additional Information
+    - The endpoint connects to AWS S3 to retrieve the specified PDF, converts its pages to PNGs, and stores
+    the generated images back in S3. Ensure the PDF file exists in the S3 bucket and is accessible before
+    making a request.
+    """
     return create_pngs(request.filename)
 
 
@@ -58,7 +81,37 @@ def post_create_pngs(request: PNGRequest) -> PNGResponse:
 def post_extract_data(
     extract_data_request: ExtractDataRequest,
 ) -> ExtractCoordinatesResponse | ExtractTextResponse | ExtractNumberResponse:
-    """Extract data from the given PNGs."""
+    """Extract specified data from a given document based on the bounding box coordinates and format.
+
+    Data is extracted from the specified bounding box on a per-character basis: as soon as the
+    bounding box overlaps (partially or fully) with a letter or number, that character is added
+    to the extracted text. This behavior is consistent with the clipping behavior of the
+    `PyMuPDF` library.
+
+    ### Prerequisites
+    Ensure that the PDF file has been processed by the create_pngs endpoint first.
+
+    ### Request Body
+    - **extract_data_request**: Instance of `ExtractDataRequest`, containing file details, page number, bounding
+    box, and data format. The bounding box in PNG coordinates helps locate the region to extract data from.
+
+    ### Returns
+    The endpoint responds with one of the following response models based on the extracted data:
+    - **ExtractCoordinatesResponse**: If geographic coordinates are extracted.
+    - **ExtractTextResponse**: If text content is extracted.
+    - **ExtractNumberResponse**: If numerical data is extracted.
+
+    ### Status Codes
+    - **200 OK**: Successful extraction, returning the specified data type.
+    - **400 Bad Request**: Input request was invalid, typically due to misformatted or missing parameters.
+    - **404 Not Found**: Requested data could not be found within the specified bounding box or page.
+    - **500 Internal Server Error**: An error occurred on the server side during data extraction.
+
+    ### Error Handling
+    Known `ValueError`s (e.g., invalid input data) result in a `400 Bad Request` response with a relevant error
+    message.
+    For other errors, the endpoint returns a `500 Internal Server Error`.
+    """
     try:
         # Extract the data based on the request
         response = extract_data(extract_data_request)

diff --git a/src/app/common/schemas.py b/src/app/common/schemas.py
@@ -17,7 +17,17 @@
 
 
 def validate_filename(value: str) -> str:
-    """Ensure the filename is not empty."""
+    """Ensure the filename is not empty.
+
+    Args:
+        value (str): The filename to validate.
+
+    Returns:
+        str: The validated filename.
+
+    Raises:
+        ValueError: If the filename is empty.
+    """
     if value == "":
         raise ValueError("Filename must not be empty.")
     return value
@@ -31,18 +41,28 @@ class PNGRequest(BaseModel):
     @field_validator("filename", mode="before")
     @classmethod
     def validate_filename(cls, value: str) -> str:
+        """Ensure the filename is not empty."""
         return validate_filename(value)
 
     class Config:
         """Make to allow using non-standard types like Path."""
 
-        arbitrary_types_allowed = True  # This allows using non-standard types like Path
+        arbitrary_types_allowed: bool = True  # This allows using non-standard types like Path
 
 
 class PNGResponse(BaseModel):
-    """Response schema for the create_pngs endpoint."""
+    """Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage.
 
-    keys: list[str]  # keys in the S3 bucket
+    This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket,
+    enabling users to retrieve or reference them as needed.
+    """
+
+    keys: list[str] = Field(
+        ...,
+        description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. Each key 
+        allows access to a specific file within the bucket.""",
+        example=["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"],
+    )
 
 
 ########################################################################################################################
@@ -59,19 +79,44 @@ class FormatTypes(str, Enum):
 
 
 class BoundingBox(BaseModel):
-    """Bounding box schema."""
+    """Bounding box schema for defining a rectangular area within an image.
 
-    x0: float = Field(..., example=0.0)
-    y0: float = Field(..., example=0.0)
-    x1: float = Field(..., example=100.0)
-    y1: float = Field(..., example=100.0)
+    This schema represents the coordinates of the box’s corners, which can be used
+    to specify an area of interest in image processing tasks. Coordinates are
+    defined with the origin at the top-left of the image. Coordinates are in pixels.
+    """
+
+    x0: float = Field(
+        ...,
+        description="""The x-coordinate of the top-left corner of the bounding box. This value marks the 
+        horizontal starting point of the box.""",
+        example=0.0,
+    )
+    y0: float = Field(
+        ...,
+        description="""The y-coordinate of the top-left corner of the bounding box. This value marks the vertical 
+        starting point of the box.""",
+        example=0.0,
+    )
+    x1: float = Field(
+        ...,
+        description="""The x-coordinate of the bottom-right corner of the bounding box. This value marks the 
+        horizontal endpoint of the box.""",
+        example=100.0,
+    )
+    y1: float = Field(
+        ...,
+        description="""The y-coordinate of the bottom-right corner of the bounding box. This value marks the vertical 
+        endpoint of the box.""",
+        example=100.0,
+    )
 
     @field_validator("x0", "y0", "x1", "y1")
     @classmethod
-    def page_number_must_be_positive(cls, v: int) -> int:
-        """Validate that the page number is positive."""
+    def bbox_corners_must_be_positive(cls, v: int) -> int:
+        """Validate that the bounding box coordinates are non-negative."""
         if v < 0.0:
-            raise ValueError("Bounding box coordinate must be a positive integer")
+            raise ValueError("Bounding box coordinates must be positive")
         return v
 
     def rescale(
@@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
 
 
 class Coordinates(BaseModel):
-    """Coordinates schema."""
+    """Coordinates schema for representing geographical data points.
 
-    east: float = Field(..., example=1.0)
-    north: float = Field(..., example=2.0)
-    projection: str = Field(..., example="LV95")
+    This schema defines the format for specifying location data using east/north coordinates
+    along with the projection system used.
+    """
+
+    east: float = Field(
+        ...,
+        description="""Easting coordinate. The value should be in the units of the specified projection system.""",
+        example=1.0,
+    )
+    north: float = Field(
+        ...,
+        description="""Northing coordinate. The value should be in the units of the specified projection system.""",
+        example=2.0,
+    )
+    projection: str = Field(
+        ...,
+        description="""Projection system used to reference the coordinates. This defines the coordinate reference
+        system, such as 'LV95' for Swiss coordinate systems.""",
+        example="LV95",
+    )
 
 
 class ExtractDataRequest(ABC, BaseModel):
-    """Request schema for the extract_data endpoint.
+    """Request schema for the `extract_data` endpoint.
+
+    **Requirements:**
+    Before using this schema, ensure that the PDF file has been processed by the create_pngs endpoint first.
+
+    **Coordinate Systems:**
+    - **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward
+    and y downward.
+
+    ### Fields
+    Each field below includes inline examples to aid users in creating requests. See `json_schema_extra`
+    for a complete example.
+
+    **Attributes:**
+    - **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"`
+    - **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1`
+    - **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the
+    top-left, with x increasing rightward and y increasing downward.
+        - Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}`
+    - **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`.
+
+    ### Validation
+    Custom validators ensure data integrity:
+    - **Filename Validator:** Ensures filename is not empty.
+    - **Page Number Validator:** Confirms page number is positive.
+    - **Format Validator:** Checks format is valid as per `FormatTypes`.
+
+    The bounding box should be provided in PNG coordinates.
 
     Each field in the Pydantic model can have an example parameter, which provides an inline
     example for that specific field.
     """
 
-    filename: Path = Field(..., example=Path("document.png"))
-    page_number: int = Field(..., example=1)  # 1-based index
-    bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
-    format: FormatTypes = Field(..., example=FormatTypes.COORDINATES.value)
+    filename: Path = Field(
+        ...,
+        description="""Path to the input PDF document file that contains the data to be extracted. This should be
+        a valid file path, and the file should be accessible to the API.""",
+        example=Path("document.pdf"),
+    )
+    page_number: int = Field(
+        ...,
+        description="""Page number within the document where the extraction is to be performed. This is a 1-based 
+        index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""",
+        example=1,
+    )
+    bbox: BoundingBox = Field(
+        ...,
+        description="""Bounding box defining the area for data extraction within the PNG version of the specified 
+        PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the 
+        right and y increases downward. This box should be provided in PNG coordinates, and any 
+        transformations to PDF coordinates are managed internally.
+        """,
+        example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
+    )
+    format: FormatTypes = Field(
+        ...,
+        description="""Specifies the desired format for extracted data, allowing for options like `coordinates` or 
+        other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""",
+        example=FormatTypes.COORDINATES.value,
+    )
 
     @field_validator("filename", mode="before")
     @classmethod
     def validate_filename(cls, value: str) -> str:
+        """Ensure the filename is not empty."""
         return validate_filename(value)
 
     @field_validator("page_number")
@@ -183,40 +296,73 @@ class Config:
 
 
 class ExtractDataResponse(ABC, BaseModel):
-    """Response schema for the extract_data endpoint."""
+    """Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box.
+
+    This abstract base class provides a bounding box field for data localization and an abstract property
+    `response_type` to be implemented by subclasses, indicating the type of extracted content.
+    """
 
-    bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
+    bbox: BoundingBox = Field(
+        ...,
+        description="""Bounding box coordinates that define the area within the document where data was extracted.
+        The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""",
+        example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
+    )
 
     @property
     @abstractmethod
     def response_type(self):
-        """Abstract property to be implemented by subclasses to define response type."""
+        """Abstract property to be implemented by subclasses to define the type of response content."""
 
 
 class ExtractCoordinatesResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning geographic coordinates.
 
-    coordinates: Coordinates = Field(..., example={"east": 1.0, "north": 2.0, "page": 1, "projection": "LV95"})
+    This schema includes a `coordinates` field with east/north values and projection information.
+    """
+
+    coordinates: Coordinates = Field(
+        ...,
+        description="""Geographical coordinates extracted from the document, including east and north values, 
+        and projection type.""",
+        example={"east": 1.0, "north": 2.0, "projection": "LV95"},
+    )
 
     @property
     def response_type(self):
         return "coordinates"
 
 
 class ExtractTextResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning extracted text content.
+
+    This schema includes a `text` field with the extracted textual content from the specified bounding box.
+    """
 
-    text: str = Field(..., example="text")
+    text: str = Field(
+        ...,
+        description="""Text content extracted from the specified bounding box within the document.""",
+        example="text",
+    )
 
     @property
     def response_type(self):
         return "text"
 
 
 class ExtractNumberResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning numerical data.
+
+    This schema includes a `number` field for extracted numeric content, such as measurements or other
+    quantitative data.
+    """
 
-    number: float = Field(..., example=1.0)
+    number: float = Field(
+        ...,
+        description="""Numeric value extracted from the specified bounding box within the document, representing a
+        measurement or quantitative data.""",
+        example=1.0,
+    )
 
     @property
     def response_type(self):
File	Stmts	Miss	Cover	Missing
src/stratigraphy
__init__.py	8	1	88%	11
extract.py	192	192	0%	3–477
get_files.py	19	19	0%	3–47
main.py	119	119	0%	3–314
src/stratigraphy/benchmark
ground_truth.py	21	1	95%	47
src/stratigraphy/data_extractor
data_extractor.py	57	3	95%	33, 66, 103
src/stratigraphy/depthcolumn
boundarydepthcolumnvalidator.py	41	20	51%	47, 57, 60, 81–84, 110–128, 140–149
depthcolumn.py	223	80	64%	25, 29, 50, 67, 72, 78, 86–92, 113, 116, 124–125, 141–143, 150, 157, 165–166, 176, 193–209, 247, 255–256, 272–274, 312, 331–339, 350, 355, 362, 393, 398–405, 420–421, 464–506
depthcolumnentry.py	36	10	72%	17, 21, 37, 52, 55, 72, 81, 98–100
find_depth_columns.py	106	19	82%	42–43, 73, 86, 180–181, 225–245
src/stratigraphy/depths_materials_column_pairs
depths_materials_column_pairs.py	18	6	67%	23, 34, 55–59
src/stratigraphy/evaluation
evaluation_dataclasses.py	49	16	67%	24, 33, 42–44, 52, 71–74, 90, 104, 125–131, 137
metadata_evaluator.py	38	14	63%	52–71, 94–101
src/stratigraphy/groundwater
groundwater_extraction.py	147	98	33%	44, 52, 83, 98, 106, 125, 152–156, 171–191, 202–291, 307–339
utility.py	39	33	15%	10–17, 30–47, 59–73, 88–102
src/stratigraphy/layer
layer.py	37	18	51%	29, 39, 57–94, 106–107
layer_identifier_column.py	74	52	30%	16–17, 20, 28, 43, 47, 51, 59–63, 66, 74, 91–96, 99, 112, 125–126, 148–158, 172–199
src/stratigraphy/lines
geometric_line_utilities.py	86	2	98%	81, 131
line.py	51	4	92%	25, 50, 60, 110
linesquadtree.py	46	1	98%	75
src/stratigraphy/metadata
coordinate_extraction.py	108	5	95%	30, 64, 94–95, 107
elevation_extraction.py	79	52	34%	34, 42, 50, 66–69, 106–120, 132–135, 147–179, 194–202, 210–214
language_detection.py	18	13	28%	17–23, 37–45
metadata.py	66	24	64%	27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/text
description_block_splitter.py	70	2	97%	24, 139
extract_text.py	29	3	90%	19, 53–54
find_description.py	64	28	56%	27–35, 50–63, 79–95, 172–175
textblock.py	80	9	89%	28, 56, 64, 89, 101, 124, 145, 154, 183
src/stratigraphy/util
dataclasses.py	32	3	91%	37–39
interval.py	104	55	47%	29–32, 37–40, 46, 52, 56, 66–68, 107–153, 174, 180–196
predictions.py	105	41	61%	74–78, 86–94, 171–203, 242, 265–283
util.py	39	17	56%	41, 69–76, 90–92, 116–117, 129–133
TOTAL	2207	960	57%