Skip to content

Commit

Permalink
Merge pull request #103 from swisstopo/LGVISIUM-89
Browse files Browse the repository at this point in the history
LGVISIUM-89: Improved clipping behavior documentation
  • Loading branch information
dcleres authored Nov 7, 2024
2 parents 498a4e3 + 530702c commit a5d0947
Show file tree
Hide file tree
Showing 6 changed files with 417 additions and 35 deletions.
Binary file added example/clipping_test-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example/clipping_test.pdf
Binary file not shown.
57 changes: 55 additions & 2 deletions src/app/api/v1/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,30 @@ class BadRequestResponse(BaseModel):
},
)
def post_create_pngs(request: PNGRequest) -> PNGResponse:
"""Create PNGs from the given data."""
"""Create PNG images from a PDF stored in the S3 bucket.
This endpoint generates PNG images from each page of a specified PDF document stored in the AWS S3 bucket.
The PDF file must be accessible in the bucket with a valid filename provided in the request.
### Request Body
- **request** (`PNGRequest`): Contains the `filename` of the PDF document in the S3 bucket from which PNGs
should be generated.
### Returns
- **PNGResponse**: Response containing a list of keys (filenames) for the generated PNG images stored in the
S3 bucket.
### Status Codes
- **200 OK**: PNG images were successfully created and stored in the S3 bucket.
- **400 Bad Request**: The request format or content is invalid. Verify that `filename` is correctly specified.
- **404 Not Found**: PDF file not found in S3 bucket.
- **500 Internal Server Error**: An error occurred on the server while creating PNGs.
### Additional Information
- The endpoint connects to AWS S3 to retrieve the specified PDF, converts its pages to PNGs, and stores
the generated images back in S3. Ensure the PDF file exists in the S3 bucket and is accessible before
making a request.
"""
return create_pngs(request.filename)


Expand All @@ -58,7 +81,37 @@ def post_create_pngs(request: PNGRequest) -> PNGResponse:
def post_extract_data(
extract_data_request: ExtractDataRequest,
) -> ExtractCoordinatesResponse | ExtractTextResponse | ExtractNumberResponse:
"""Extract data from the given PNGs."""
"""Extract specified data from a given document based on the bounding box coordinates and format.
Data is extracted from the specified bounding box on a per-character basis: as soon as the bounding box
overlaps (partially or fully) with a letter or number, that character is added to the extracted text.
This behavior is consistent with the clipping behavior of the `PyMuPDF` library.
### Prerequisites
Ensure that the PDF file has first been processed by the `create_pngs` endpoint.
### Request Body
- **extract_data_request**: Instance of `ExtractDataRequest`, containing file details, page number, bounding
box, and data format. The bounding box in PNG coordinates helps locate the region to extract data from.
### Returns
The endpoint responds with one of the following response models based on the extracted data:
- **ExtractCoordinatesResponse**: If geographic coordinates are extracted.
- **ExtractTextResponse**: If text content is extracted.
- **ExtractNumberResponse**: If numerical data is extracted.
### Status Codes
- **200 OK**: Successful extraction, returning the specified data type.
- **400 Bad Request**: Input request was invalid, typically due to malformed or missing parameters.
- **404 Not Found**: Requested data could not be found within the specified bounding box or page.
- **500 Internal Server Error**: An error occurred on the server side during data extraction.
### Error Handling
Known `ValueError`s (e.g., invalid input data) result in a `400 Bad Request` response with a relevant error
message.
For other errors, the endpoint returns a `500 Internal Server Error`.
"""
try:
# Extract the data based on the request
response = extract_data(extract_data_request)
Expand Down
206 changes: 176 additions & 30 deletions src/app/common/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@


def validate_filename(value: str) -> str:
"""Ensure the filename is not empty."""
"""Ensure the filename is not empty.
Args:
value (str): The filename to validate.
Returns:
str: The validated filename.
Raises:
ValueError: If the filename is empty.
"""
if value == "":
raise ValueError("Filename must not be empty.")
return value
Expand All @@ -31,18 +41,28 @@ class PNGRequest(BaseModel):
@field_validator("filename", mode="before")
@classmethod
def validate_filename(cls, value: str) -> str:
"""Ensure the filename is not empty."""
return validate_filename(value)

class Config:
"""Allow using non-standard types like Path."""

arbitrary_types_allowed = True # This allows using non-standard types like Path
arbitrary_types_allowed: bool = True # This allows using non-standard types like Path


class PNGResponse(BaseModel):
"""Response schema for the create_pngs endpoint."""
"""Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage.
keys: list[str] # keys in the S3 bucket
This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket,
enabling users to retrieve or reference them as needed.
"""

keys: list[str] = Field(
...,
description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. Each key
allows access to a specific file within the bucket.""",
example=["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"],
)


########################################################################################################################
Expand All @@ -59,19 +79,44 @@ class FormatTypes(str, Enum):


class BoundingBox(BaseModel):
"""Bounding box schema."""
"""Bounding box schema for defining a rectangular area within an image.
x0: float = Field(..., example=0.0)
y0: float = Field(..., example=0.0)
x1: float = Field(..., example=100.0)
y1: float = Field(..., example=100.0)
This schema represents the coordinates of the box’s corners, which can be used
to specify an area of interest in image processing tasks. Coordinates are
defined with the origin at the top-left of the image. Coordinates are in pixels.
"""

x0: float = Field(
...,
description="""The x-coordinate of the top-left corner of the bounding box. This value marks the
horizontal starting point of the box.""",
example=0.0,
)
y0: float = Field(
...,
description="""The y-coordinate of the top-left corner of the bounding box. This value marks the vertical
starting point of the box.""",
example=0.0,
)
x1: float = Field(
...,
description="""The x-coordinate of the bottom-right corner of the bounding box. This value marks the
horizontal endpoint of the box.""",
example=100.0,
)
y1: float = Field(
...,
description="""The y-coordinate of the bottom-right corner of the bounding box. This value marks the vertical
endpoint of the box.""",
example=100.0,
)

@field_validator("x0", "y0", "x1", "y1")
@classmethod
def page_number_must_be_positive(cls, v: int) -> int:
"""Validate that the page number is positive."""
def bbox_corners_must_be_positive(cls, v: int) -> int:
"""Validate that the coordinates of the bounding box are non-negative (zero is allowed)."""
if v < 0.0:
raise ValueError("Bounding box coordinate must be a positive integer")
raise ValueError("Bounding box coordinates must be positive")
return v

def rescale(
Expand Down Expand Up @@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":


class Coordinates(BaseModel):
"""Coordinates schema."""
"""Coordinates schema for representing geographical data points.
east: float = Field(..., example=1.0)
north: float = Field(..., example=2.0)
projection: str = Field(..., example="LV95")
This schema defines the format for specifying location data using east/north coordinates
along with the projection system used.
"""

east: float = Field(
...,
description="""Easting coordinate. The value should be in the units of the specified projection system.""",
example=1.0,
)
north: float = Field(
...,
description="""Northing coordinate. The value should be in the units of the specified projection system.""",
example=2.0,
)
projection: str = Field(
...,
description="""Projection system used to reference the coordinates. This defines the coordinate reference
system, such as 'LV95' for Swiss coordinate systems.""",
example="LV95",
)


class ExtractDataRequest(ABC, BaseModel):
"""Request schema for the extract_data endpoint.
"""Request schema for the `extract_data` endpoint.
**Requirements:**
Before using this schema, ensure that the PDF file has been processed by the `create_pngs` endpoint.
**Coordinate Systems:**
- **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward
and y downward.
### Fields
Each field below includes inline examples to aid users in creating requests. See `json_schema_extra`
for a complete example.
**Attributes:**
- **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"`
- **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1`
- **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the
top-left, with x increasing rightward and y increasing downward.
- Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}`
- **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`.
### Validation
Custom validators ensure data integrity:
- **Filename Validator:** Ensures filename is not empty.
- **Page Number Validator:** Confirms page number is positive.
- **Format Validator:** Checks format is valid as per `FormatTypes`.
The bounding box should be provided in PNG coordinates.
Each field in the Pydantic model can have an example parameter, which provides an inline
example for that specific field.
"""

filename: Path = Field(..., example=Path("document.png"))
page_number: int = Field(..., example=1) # 1-based index
bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
format: FormatTypes = Field(..., example=FormatTypes.COORDINATES.value)
filename: Path = Field(
...,
description="""Path to the input PDF document file that contains the data to be extracted. This should be
a valid file path, and the file should be accessible to the API.""",
example=Path("document.pdf"),
)
page_number: int = Field(
...,
description="""Page number within the document where the extraction is to be performed. This is a 1-based
index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""",
example=1,
)
bbox: BoundingBox = Field(
...,
description="""Bounding box defining the area for data extraction within the PNG version of the specified
PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the
right and y increases downward. This box should be provided in PNG coordinates, and any
transformations to PDF coordinates are managed internally.
""",
example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
)
format: FormatTypes = Field(
...,
description="""Specifies the desired format for extracted data, allowing for options like `coordinates` or
other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""",
example=FormatTypes.COORDINATES.value,
)

@field_validator("filename", mode="before")
@classmethod
def validate_filename(cls, value: str) -> str:
"""Ensure the filename is not empty."""
return validate_filename(value)

@field_validator("page_number")
Expand Down Expand Up @@ -183,40 +296,73 @@ class Config:


class ExtractDataResponse(ABC, BaseModel):
"""Response schema for the extract_data endpoint."""
"""Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box.
This abstract base class provides a bounding box field for data localization and an abstract property
`response_type` to be implemented by subclasses, indicating the type of extracted content.
"""

bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
bbox: BoundingBox = Field(
...,
description="""Bounding box coordinates that define the area within the document where data was extracted.
The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""",
example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
)

@property
@abstractmethod
def response_type(self):
"""Abstract property to be implemented by subclasses to define response type."""
"""Abstract property to be implemented by subclasses to define the type of response content."""


class ExtractCoordinatesResponse(ExtractDataResponse):
"""Response schema for the extract_data endpoint."""
"""Response schema for the `extract_data` endpoint when returning geographic coordinates.
coordinates: Coordinates = Field(..., example={"east": 1.0, "north": 2.0, "page": 1, "projection": "LV95"})
This schema includes a `coordinates` field with east/north values and projection information.
"""

coordinates: Coordinates = Field(
...,
description="""Geographical coordinates extracted from the document, including east and north values,
and projection type.""",
example={"east": 1.0, "north": 2.0, "projection": "LV95"},
)

@property
def response_type(self):
return "coordinates"


class ExtractTextResponse(ExtractDataResponse):
"""Response schema for the extract_data endpoint."""
"""Response schema for the `extract_data` endpoint when returning extracted text content.
This schema includes a `text` field with the extracted textual content from the specified bounding box.
"""

text: str = Field(..., example="text")
text: str = Field(
...,
description="""Text content extracted from the specified bounding box within the document.""",
example="text",
)

@property
def response_type(self):
return "text"


class ExtractNumberResponse(ExtractDataResponse):
"""Response schema for the extract_data endpoint."""
"""Response schema for the `extract_data` endpoint when returning numerical data.
This schema includes a `number` field for extracted numeric content, such as measurements or other
quantitative data.
"""

number: float = Field(..., example=1.0)
number: float = Field(
...,
description="""Numeric value extracted from the specified bounding box within the document, representing a
measurement or quantitative data.""",
example=1.0,
)

@property
def response_type(self):
Expand Down
Loading

1 comment on commit a5d0947

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| **src/stratigraphy** | | | | |
| `__init__.py` | 8 | 1 | 88% | 11 |
| `extract.py` | 192 | 192 | 0% | 3–477 |
| `get_files.py` | 19 | 19 | 0% | 3–47 |
| `main.py` | 119 | 119 | 0% | 3–314 |
| **src/stratigraphy/benchmark** | | | | |
| `ground_truth.py` | 21 | 1 | 95% | 47 |
| **src/stratigraphy/data_extractor** | | | | |
| `data_extractor.py` | 57 | 3 | 95% | 33, 66, 103 |
| **src/stratigraphy/depthcolumn** | | | | |
| `boundarydepthcolumnvalidator.py` | 41 | 20 | 51% | 47, 57, 60, 81–84, 110–128, 140–149 |
| `depthcolumn.py` | 223 | 80 | 64% | 25, 29, 50, 67, 72, 78, 86–92, 113, 116, 124–125, 141–143, 150, 157, 165–166, 176, 193–209, 247, 255–256, 272–274, 312, 331–339, 350, 355, 362, 393, 398–405, 420–421, 464–506 |
| `depthcolumnentry.py` | 36 | 10 | 72% | 17, 21, 37, 52, 55, 72, 81, 98–100 |
| `find_depth_columns.py` | 106 | 19 | 82% | 42–43, 73, 86, 180–181, 225–245 |
| **src/stratigraphy/depths_materials_column_pairs** | | | | |
| `depths_materials_column_pairs.py` | 18 | 6 | 67% | 23, 34, 55–59 |
| **src/stratigraphy/evaluation** | | | | |
| `evaluation_dataclasses.py` | 49 | 16 | 67% | 24, 33, 42–44, 52, 71–74, 90, 104, 125–131, 137 |
| `metadata_evaluator.py` | 38 | 14 | 63% | 52–71, 94–101 |
| **src/stratigraphy/groundwater** | | | | |
| `groundwater_extraction.py` | 147 | 98 | 33% | 44, 52, 83, 98, 106, 125, 152–156, 171–191, 202–291, 307–339 |
| `utility.py` | 39 | 33 | 15% | 10–17, 30–47, 59–73, 88–102 |
| **src/stratigraphy/layer** | | | | |
| `layer.py` | 37 | 18 | 51% | 29, 39, 57–94, 106–107 |
| `layer_identifier_column.py` | 74 | 52 | 30% | 16–17, 20, 28, 43, 47, 51, 59–63, 66, 74, 91–96, 99, 112, 125–126, 148–158, 172–199 |
| **src/stratigraphy/lines** | | | | |
| `geometric_line_utilities.py` | 86 | 2 | 98% | 81, 131 |
| `line.py` | 51 | 4 | 92% | 25, 50, 60, 110 |
| `linesquadtree.py` | 46 | 1 | 98% | 75 |
| **src/stratigraphy/metadata** | | | | |
| `coordinate_extraction.py` | 108 | 5 | 95% | 30, 64, 94–95, 107 |
| `elevation_extraction.py` | 79 | 52 | 34% | 34, 42, 50, 66–69, 106–120, 132–135, 147–179, 194–202, 210–214 |
| `language_detection.py` | 18 | 13 | 28% | 17–23, 37–45 |
| `metadata.py` | 66 | 24 | 64% | 27, 83, 101–127, 146–155, 195–198, 206 |
| **src/stratigraphy/text** | | | | |
| `description_block_splitter.py` | 70 | 2 | 97% | 24, 139 |
| `extract_text.py` | 29 | 3 | 90% | 19, 53–54 |
| `find_description.py` | 64 | 28 | 56% | 27–35, 50–63, 79–95, 172–175 |
| `textblock.py` | 80 | 9 | 89% | 28, 56, 64, 89, 101, 124, 145, 154, 183 |
| **src/stratigraphy/util** | | | | |
| `dataclasses.py` | 32 | 3 | 91% | 37–39 |
| `interval.py` | 104 | 55 | 47% | 29–32, 37–40, 46, 52, 56, 66–68, 107–153, 174, 180–196 |
| `predictions.py` | 105 | 41 | 61% | 74–78, 86–94, 171–203, 242, 265–283 |
| `util.py` | 39 | 17 | 56% | 41, 69–76, 90–92, 116–117, 129–133 |
| **TOTAL** | 2207 | 960 | 57% | |

Tests Skipped Failures Errors Time
89 0 💤 0 ❌ 0 🔥 7.752s ⏱️

Please sign in to comment.