-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #106 from swisstopo/LGVISIUM-102/merge-a-to-b-interval-and-depth-column-entry
LGVISIUM-102: merge AToBInterval and AToBDepthColumnEntry
- Loading branch information
Showing
22 changed files
with
255 additions
and
249 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
"""Modules for extracting values indicating some measured depth below the surface.""" | ||
|
||
from .a_to_b_interval_extractor import AToBIntervalExtractor | ||
from .depthcolumnentry import DepthColumnEntry | ||
from .depthcolumnentry_extractor import DepthColumnEntryExtractor | ||
from .interval import AAboveBInterval, AToBInterval, Interval | ||
|
||
__all__ = [ | ||
"AAboveBInterval", | ||
"AToBInterval", | ||
"AToBIntervalExtractor", | ||
"DepthColumnEntry", | ||
"DepthColumnEntryExtractor", | ||
"Interval", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"""Contains logic for finding AToBInterval instances in a text.""" | ||
|
||
import re | ||
|
||
import fitz | ||
|
||
from stratigraphy.lines.line import TextLine | ||
|
||
from .depthcolumnentry import DepthColumnEntry | ||
from .interval import AToBInterval | ||
from .util import value_as_float | ||
|
||
|
||
class AToBIntervalExtractor:
    """Methods for finding AToBInterval instances (e.g. "0.5m - 1.8m") in a text."""

    # Two numbers (capture groups 1 and 3) with separator characters in between and optional trailing
    # characters from the set m, ü, M, N, backslash and dot.
    # NOTE(review): in the separator part, `\]` escapes the bracket, so `[müMN\]*[\s-]+` is parsed as ONE
    # character class (m, ü, M, N, ], *, [, whitespace, -) repeated one or more times, not as
    # "optional unit letters followed by whitespace/dashes". The pattern is kept unchanged on purpose,
    # because tightening it could reject inputs that are matched today - confirm the intent before changing.
    _INTERVAL_QUERY = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*"

    # Compiled once at class creation instead of on every call to from_text.
    _PATTERN_AT_START = re.compile(_INTERVAL_QUERY)
    _PATTERN_ANYWHERE = re.compile(r".*?" + _INTERVAL_QUERY)

    @classmethod
    def from_material_description_lines(cls, lines: list[TextLine]) -> AToBInterval | None:
        """Extract depth interval from text lines from a material description.

        For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material
        description. Often, these text descriptions contain a further separation into multiple sub layers.
        These sub layers have their own depth intervals. This function extracts the overall depth interval,
        spanning across all mentioned sub layers.

        For example (from GeoQuat 12306):
            1) REMBLAIS HETEROGENES
               0.00 - 0.08 m : Revêtement bitumineux
               0.08- 0.30 m : Grave d'infrastructure
               0.30 - 1.40 m : Grave dans importante matrice de sable
                               moyen, brun beige, pulvérulent.

        From this material description, this method will extract a single depth interval that starts at 0m and ends
        at 1.40m.

        Args:
            lines (list[TextLine]): The lines to extract the depth interval from.

        Returns:
            AToBInterval | None: The depth interval (if any) or None (if no depth interval was found).
        """
        sub_intervals = []
        for line in lines:
            try:
                # require_start_of_string=False because the depth interval may not always start at the
                # beginning of the line, e.g. "Remblais Heterogene: 0.00 - 0.5m".
                sub_interval = cls.from_text(line.text, line.rect, require_start_of_string=False)
            except ValueError:
                # A number was matched but could not be converted to a depth value: ignore this line.
                continue
            if sub_interval:
                sub_intervals.append(sub_interval)

        if not sub_intervals:
            return None

        # Merge the sub layers into one depth interval: smallest start value to largest end value.
        start = min((interval.start for interval in sub_intervals), key=lambda entry: entry.value)
        end = max((interval.end for interval in sub_intervals), key=lambda entry: entry.value)
        return AToBInterval(start, end)

    @classmethod
    def from_text(cls, text: str, rect: fitz.Rect, require_start_of_string: bool = True) -> AToBInterval | None:
        """Attempts to extract an AToBInterval from a string.

        Surrounding whitespace is stripped and every "," is replaced by "." before matching.

        Args:
            text (str): The string to extract the depth interval from.
            rect (fitz.Rect): The rectangle of the text.
            require_start_of_string (bool, optional): Whether the number to extract needs to be
                                                      at the start of a string. Defaults to True.

        Returns:
            AToBInterval | None: The extracted AToBInterval or None if none is found.

        Raises:
            ValueError: If a matched number cannot be converted by value_as_float.
        """
        input_string = text.strip().replace(",", ".")
        pattern = cls._PATTERN_AT_START if require_start_of_string else cls._PATTERN_ANYWHERE
        match = pattern.match(input_string)
        if match is None:
            return None

        # Both values come from the same piece of text, so the start entry gets the left half of the
        # rectangle and the end entry gets the right half.
        value1 = value_as_float(match.group(1))
        first_half_rect = fitz.Rect(rect.x0, rect.y0, rect.x1 - rect.width / 2, rect.y1)

        value2 = value_as_float(match.group(3))
        second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1)

        return AToBInterval(
            DepthColumnEntry(first_half_rect, value1),
            DepthColumnEntry(second_half_rect, value2),
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
"""Contains a dataclass for depth column entries, which indicate the measured depth of an interface between layers.""" | ||
|
||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import Any | ||
|
||
import fitz | ||
|
||
|
||
@dataclass
class DepthColumnEntry:  # noqa: D101
    """A depth value together with the rectangle associated with it."""

    rect: fitz.Rect
    value: float

    def __repr__(self) -> str:
        # Only the numeric value is shown; the rectangle is left out.
        return f"{self.value}"

    def to_json(self) -> dict[str, Any]:
        """Return a JSON serializable dictionary with the value and the rectangle coordinates.

        Returns:
            dict[str, Any]: The keys "value" and "rect", where "rect" is the list [x0, y0, x1, y1].
        """
        bounds = self.rect
        return {
            "value": self.value,
            "rect": [bounds.x0, bounds.y0, bounds.x1, bounds.y1],
        }

    @classmethod
    def from_json(cls, data: dict) -> DepthColumnEntry:
        """Build a depth column entry from its dictionary representation.

        Args:
            data (dict): A dictionary with the keys "rect" and "value", as produced by to_json.

        Returns:
            DepthColumnEntry: The depth column entry object.
        """
        rect = fitz.Rect(data["rect"])
        return cls(rect=rect, value=data["value"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Contains logic for finding depth column entries in text.""" | ||
|
||
import re | ||
|
||
from stratigraphy.depth import DepthColumnEntry | ||
from stratigraphy.depth.util import value_as_float | ||
from stratigraphy.lines.line import TextWord | ||
|
||
from .a_to_b_interval_extractor import AToBIntervalExtractor | ||
|
||
|
||
class DepthColumnEntryExtractor:
    """Methods for finding depth column entries in a text."""

    # A word that consists of a single number (capture group 1), optionally preceded by "-" and/or "."
    # and optionally followed by characters from the set m, ü, M, N, backslash and dot.
    # The leading "." is outside the capture group, so ".40" is read as 40 and not as 0.40: the OCR
    # sometimes recognizes a "-" as a ".", and the leading "." is simply omitted to avoid this issue.
    # Compiled once at class creation instead of once per word.
    _ENTRY_PATTERN = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")

    @classmethod
    def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
        """Find all depth column entries given a list of TextWord objects.

        The words are processed in order of increasing rect.y0. Surrounding whitespace is stripped from each
        word and every "," is replaced by "." before matching. Words for which the value conversion raises
        a ValueError are skipped.

        Note: Only depths up to two digits before the decimal point are supported.

        Args:
            all_words (list[TextWord]): List of text words to extract depth column entries from.
            include_splits (bool): Whether to include split entries, i.e. the start and end of an interval
                                   such as "1.10-1.60m" that was extracted as a single word.

        Returns:
            list[DepthColumnEntry]: The extracted depth column entries.
        """
        entries = []
        for word in sorted(all_words, key=lambda word: word.rect.y0):
            input_string = word.text.strip().replace(",", ".")
            match = cls._ENTRY_PATTERN.match(input_string)
            try:
                if match:
                    entries.append(DepthColumnEntry(word.rect, value_as_float(match.group(1))))
                elif include_splits:
                    # support for e.g. "1.10-1.60m" extracted as a single word
                    a_to_b_interval = AToBIntervalExtractor.from_text(input_string, word.rect)
                    if a_to_b_interval:
                        entries.extend([a_to_b_interval.start, a_to_b_interval.end])
            except ValueError:
                # The matched digits could not be converted to a depth value: skip this word.
                continue
        return entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
"""Contains utility functions for depth column entries.""" | ||
|
||
import re | ||
|
||
|
||
def value_as_float(string_value: str) -> float:  # noqa: D103
    """Parse a depth string into a non-negative float.

    If the string starts (after an optional "-") with three or more digits, a decimal point is inserted
    before the last two digits of that run, e.g. "150" becomes 1.5. The sign is discarded.

    Args:
        string_value (str): The text to convert.

    Returns:
        float: The absolute value of the parsed number.

    Raises:
        ValueError: If the resulting text is not a valid float, e.g. "123.45" is turned into "1.23.45".
    """
    # OCR sometimes tends to miss the decimal comma
    missing_separator = re.match(r"^-?([0-9]+)([0-9]{2})", string_value)
    if missing_separator is None:
        normalised = string_value
    else:
        whole, hundredths = missing_separator.groups()
        remainder = string_value[missing_separator.end() :]
        normalised = f"{whole}.{hundredths}{remainder}"
    return abs(float(normalised))
Oops, something went wrong.
c6bea44
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Coverage Report