Skip to content

Commit

Permalink
Merge pull request #106 from swisstopo/LGVISIUM-102/merge-a-to-b-inte…
Browse files Browse the repository at this point in the history
…rval-and-depth-column-entry

LGVISIUM-102: merge AToBInterval and AToBDepthColumnEntry
  • Loading branch information
stijnvermeeren-swisstopo authored Nov 22, 2024
2 parents b66b08e + 8ff58a4 commit c6bea44
Show file tree
Hide file tree
Showing 22 changed files with 255 additions and 249 deletions.
15 changes: 15 additions & 0 deletions src/stratigraphy/depth/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Modules for extracting values indicating some measured depth below the surface."""

from .a_to_b_interval_extractor import AToBIntervalExtractor
from .depthcolumnentry import DepthColumnEntry
from .depthcolumnentry_extractor import DepthColumnEntryExtractor
from .interval import AAboveBInterval, AToBInterval, Interval

# Names re-exported as the public API of this package. Every entry is imported above;
# the list is kept in alphabetical order.
__all__ = [
    "AAboveBInterval",
    "AToBInterval",
    "AToBIntervalExtractor",
    "DepthColumnEntry",
    "DepthColumnEntryExtractor",
    "Interval",
]
92 changes: 92 additions & 0 deletions src/stratigraphy/depth/a_to_b_interval_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Contains logic for finding AToBInterval instances in a text."""

import re

import fitz

from stratigraphy.lines.line import TextLine

from .depthcolumnentry import DepthColumnEntry
from .interval import AToBInterval
from .util import value_as_float


class AToBIntervalExtractor:
    """Methods for finding AToBInterval instances (e.g. "0.5m - 1.8m") in a text."""

    @classmethod
    def from_material_description_lines(cls, lines: list[TextLine]) -> AToBInterval | None:
        """Build one overall depth interval from the lines of a material description.

        For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the
        material description. Often, these descriptions are further separated into multiple sub layers, each with
        its own depth interval. This method returns the interval spanning all mentioned sub layers.

        For example (from GeoQuat 12306):
            1) REMBLAIS HETEROGENES
               0.00 - 0.08 m : Revêtement bitumineux
               0.08- 0.30 m : Grave d'infrastructure
               0.30 - 1.40 m : Grave dans importante matrice de sable
                               moyen, brun beige, pulvérulent.

        From this material description, a single depth interval starting at 0m and ending at 1.40m is extracted.

        Args:
            lines (list[TextLine]): The lines to extract the depth interval from.

        Returns:
            AToBInterval | None: The depth interval (if any) or None (if no depth interval was found).
        """
        found_intervals = []
        for text_line in lines:
            try:
                # require_start_of_string=False because the depth interval may not always start at the
                # beginning of the line, e.g. "Remblais Heterogene: 0.00 - 0.5m".
                candidate = AToBIntervalExtractor.from_text(
                    text_line.text, text_line.rect, require_start_of_string=False
                )
            except ValueError:
                # The matched text could not be converted to a number; ignore this line.
                continue
            if candidate:
                found_intervals.append(candidate)

        if not found_intervals:
            return None

        # Merge the sub layers into one depth interval: smallest start value, largest end value.
        overall_start = min((interval.start for interval in found_intervals), key=lambda entry: entry.value)
        overall_end = max((interval.end for interval in found_intervals), key=lambda entry: entry.value)
        return AToBInterval(overall_start, overall_end)

    @classmethod
    def from_text(cls, text: str, rect: fitz.Rect, require_start_of_string: bool = True) -> AToBInterval | None:
        """Attempt to extract an AToBInterval from a string.

        Commas are treated as decimal points. Since the positions of the two numbers inside the text are not
        known, the start entry is given the left half of `rect` and the end entry the right half.

        Args:
            text (str): The string to extract the depth interval from.
            rect (fitz.Rect): The rectangle of the text.
            require_start_of_string (bool, optional): Whether the number to extract needs to be
                at the start of a string. Defaults to True.

        Returns:
            AToBInterval | None: The extracted AToBInterval or None if none is found.

        Raises:
            ValueError: Propagated from value_as_float if a matched number cannot be converted.
        """
        input_string = text.strip().replace(",", ".")

        # NOTE(review): in the first bracket expression, `\]` is an escaped literal ']', so
        # `[müMN\]*[\s-]+` is ONE character class (m, ü, M, N, ']', '*', '[', whitespace, '-') repeated one or
        # more times, not an optional unit class followed by a separator class. It is kept as-is because
        # changing it would change which strings match (e.g. "0.5 m - 1.8 m" currently matches) -- confirm
        # the intent before touching it.
        query = r"-?([0-9]+(\.[0-9]+)?)[müMN\]*[\s-]+([0-9]+(\.[0-9]+)?)[müMN\\.]*"
        pattern = query if require_start_of_string else r".*?" + query
        match = re.match(pattern, input_string)
        if match is None:
            return None

        start_value = value_as_float(match.group(1))
        end_value = value_as_float(match.group(3))

        half_width = rect.width / 2
        left_half = fitz.Rect(rect.x0, rect.y0, rect.x1 - half_width, rect.y1)
        right_half = fitz.Rect(rect.x0 + half_width, rect.y0, rect.x1, rect.y1)
        return AToBInterval(
            DepthColumnEntry(left_half, start_value),
            DepthColumnEntry(right_half, end_value),
        )
35 changes: 35 additions & 0 deletions src/stratigraphy/depth/depthcolumnentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Contains a dataclass for depth column entries, which indicate the measured depth of an interface between layers."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import fitz


@dataclass
class DepthColumnEntry:  # noqa: D101
    """A depth value together with the rectangle it was read from."""

    # Rectangle associated with the value; serialized as [x0, y0, x1, y1].
    rect: fitz.Rect
    # The depth value itself.
    value: float

    def __repr__(self) -> str:
        # Only the numeric value is shown; the rectangle is left out.
        return str(self.value)

    def to_json(self) -> dict[str, Any]:
        """Return a JSON-serializable dictionary with the value and the rectangle coordinates."""
        bounds = self.rect
        return {
            "value": self.value,
            "rect": [bounds.x0, bounds.y0, bounds.x1, bounds.y1],
        }

    @classmethod
    def from_json(cls, data: dict) -> DepthColumnEntry:
        """Create a depth column entry from its dictionary representation.

        Args:
            data (dict): A dictionary with the keys "rect" and "value", as produced by `to_json`.

        Returns:
            DepthColumnEntry: The depth column entry object.
        """
        rect = fitz.Rect(data["rect"])
        return cls(rect=rect, value=data["value"])
46 changes: 46 additions & 0 deletions src/stratigraphy/depth/depthcolumnentry_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Contains logic for finding depth column entries in text."""

import re

from stratigraphy.depth import DepthColumnEntry
from stratigraphy.depth.util import value_as_float
from stratigraphy.lines.line import TextWord

from .a_to_b_interval_extractor import AToBIntervalExtractor


class DepthColumnEntryExtractor:
    """Methods for finding depth column entries in a text."""

    @classmethod
    def find_in_words(cls, all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
        """Find all depth column entries given a list of TextWord objects.

        The words are processed from top to bottom (ascending `rect.y0`). Words whose numeric text cannot be
        converted to a float are skipped silently.

        Note: Only depths up to two digits before the decimal point are supported.

        Args:
            all_words (list[TextWord]): List of text words to extract depth column entries from.
            include_splits (bool): Whether to include split entries, i.e. the start and end values of an
                interval such as "1.10-1.60m" that was extracted as a single word.

        Returns:
            list[DepthColumnEntry]: The extracted depth column entries.
        """
        # The pattern does not depend on the word, so it is compiled once instead of once per iteration.
        # A leading '.' is skipped rather than read as a decimal point (so '.40' is parsed as 40, not 0.40):
        # OCR sometimes recognizes a '-' as a '.', and omitting the leading '.' avoids that issue.
        single_value_regex = re.compile(r"^-?\.?([0-9]+(\.[0-9]+)?)[müMN\\.]*$")

        entries = []
        for word in sorted(all_words, key=lambda word: word.rect.y0):
            input_string = word.text.strip().replace(",", ".")
            try:
                match = single_value_regex.match(input_string)
                if match:
                    value = value_as_float(match.group(1))
                    entries.append(DepthColumnEntry(word.rect, value))
                elif include_splits:
                    # support for e.g. "1.10-1.60m" extracted as a single word
                    a_to_b_interval = AToBIntervalExtractor.from_text(input_string, word.rect)
                    if a_to_b_interval:
                        entries.extend([a_to_b_interval.start, a_to_b_interval.end])
            except ValueError:
                # The number conversion failed for this word; it is not a usable depth value.
                continue
        return entries
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@

import fitz

from stratigraphy.depthcolumn.depthcolumnentry import (
AToBDepthColumnEntry,
DepthColumnEntry,
)
from stratigraphy.lines.line import TextLine
from stratigraphy.text.textblock import TextBlock

from .depthcolumnentry import DepthColumnEntry


class Interval(metaclass=abc.ABCMeta):
"""Abstract class for (depth) intervals."""
Expand Down Expand Up @@ -143,9 +141,13 @@ def matching_blocks(
class AToBInterval(Interval):
"""Class for intervals that are defined in a single line like "1.00 - 2.30m"."""

def __init__(self, layer_depth_column_entry: AToBDepthColumnEntry):
self.entry = layer_depth_column_entry
super().__init__(layer_depth_column_entry.start, layer_depth_column_entry.end)
def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry):
super().__init__(start, end)

@property
def rect(self) -> fitz.Rect:
"""Get the rectangle surrounding the interval."""
return fitz.Rect(self.start.rect).include_rect(self.end.rect)

@property
def line_anchor(self) -> fitz.Point | None:
Expand Down Expand Up @@ -177,38 +179,3 @@ def matching_blocks(
return [TextBlock(matched_lines)]
else:
return []

@classmethod
def get_depth_interval_from_lines(cls, lines: list[TextLine]) -> AToBInterval | None:
"""Extract depth interval from text lines.
For borehole profiles in the Deriaz layout, the depth interval is usually found in the text of the material
description. Often, these text descriptions contain a further separation into multiple sub layers.
These sub layers have their own depth intervals. This function extracts the overall depth interval,
spanning across all mentioned sub layers.
Args:
lines (list[TextLine]): The lines to extract the depth interval from.
Returns:
AToBInterval | None: The depth interval (if any) or None (if no depth interval was found).
"""
depth_entries = []
for line in lines:
try:
layer_depth_entry = AToBDepthColumnEntry.from_text(line.text, line.rect, require_start_of_string=False)
# require_start_of_string = False because the depth interval may not always start at the beginning
# of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
if layer_depth_entry:
depth_entries.append(layer_depth_entry)
except ValueError:
pass

if depth_entries:
# Merge the sub layers into one depth interval.
start = min([entry.start for entry in depth_entries], key=lambda start_entry: start_entry.value)
end = max([entry.end for entry in depth_entries], key=lambda end_entry: end_entry.value)

return AToBInterval(AToBDepthColumnEntry(start, end))
else:
return None
10 changes: 10 additions & 0 deletions src/stratigraphy/depth/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Contains utility functions for depth column entries."""

import re


def value_as_float(string_value: str) -> float:  # noqa: D103
    """Convert the textual form of a depth value to a non-negative float.

    OCR sometimes tends to miss the decimal comma. To compensate, when the string starts with three or more
    digits (optionally preceded by a minus sign), a decimal point is inserted before the last two digits of that
    leading run, e.g. "150" is read as 1.50. The sign is always discarded.

    Args:
        string_value (str): The text to convert; a '.' is expected as decimal separator.

    Returns:
        float: The absolute value of the parsed number.

    Raises:
        ValueError: If the text is not a valid float after the correction, e.g. "100.50" becomes "1.00.50".
    """
    missing_decimal_pattern = r"^-?([0-9]+)([0-9]{2})"
    with_decimal_point = re.sub(missing_decimal_pattern, r"\1.\2", string_value)
    return abs(float(with_decimal_point))
Loading

1 comment on commit c6bea44

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/stratigraphy
   __init__.py8188%11
   extract.py1671670%3–446
   get_files.py19190%3–47
   main.py1141140%3–307
src/stratigraphy/benchmark
   metrics.py594229%22–25, 29–32, 36–39, 46–49, 53–54, 58, 65–74, 78–91, 96–133
src/stratigraphy/data_extractor
   data_extractor.py74495%33, 46, 123, 168
src/stratigraphy/depth
   a_to_b_interval_extractor.py371559%41–60, 79, 92
   depthcolumnentry.py15380%19, 23, 35
   depthcolumnentry_extractor.py23291%44–45
   interval.py1015249%26–29, 34–37, 43, 49, 53, 92–138, 159, 165–181
src/stratigraphy/depths_materials_column_pairs
   bounding_boxes.py301067%23, 32, 50, 60, 72–78
   material_description_rect_with_sidebar.py18856%27–41
src/stratigraphy/evaluation
   evaluation_dataclasses.py491178%52, 71–74, 90, 104, 125–131, 147
   groundwater_evaluator.py48198%77
   layer_evaluator.py664630%29–30, 35–39, 47, 69–95, 105–113, 128–149
   metadata_evaluator.py371462%46–65, 86–93
   utility.py16756%43–52
src/stratigraphy/groundwater
   groundwater_extraction.py1569937%52, 94, 127–132, 140, 167–171, 186–206, 217–306, 322–354
   utility.py393315%10–17, 30–47, 59–73, 88–102
src/stratigraphy/layer
   layer.py361364%25, 28, 36, 51–71
src/stratigraphy/lines
   geometric_line_utilities.py86298%81, 131
   line.py51492%25, 50, 60, 110
   linesquadtree.py46198%75
src/stratigraphy/metadata
   coordinate_extraction.py106496%29, 93–94, 106
   elevation_extraction.py906033%34–39, 47, 55, 63, 79–87, 124–138, 150–153, 165–197, 212–220, 228–232
   language_detection.py181328%17–23, 37–45
   metadata.py662464%27, 83, 101–127, 146–155, 195–198, 206
src/stratigraphy/sidebar
   a_above_b_sidebar.py944057%38, 44, 63–71, 82, 87, 94, 107, 112–119, 134–135, 177–218
   a_above_b_sidebar_extractor.py29390%46–48
   a_above_b_sidebar_validator.py412051%48, 58, 61, 81–84, 109–127, 139–148
   a_to_b_sidebar.py431467%36, 49–50, 67, 95–108
   layer_identifier_sidebar.py513237%23–24, 27, 59–78, 94–110, 122, 135
   layer_identifier_sidebar_extractor.py413320%30–40, 54–86
   sidebar.py40198%84
src/stratigraphy/text
   description_block_splitter.py70297%24, 139
   extract_text.py29390%19, 53–54
   find_description.py41880%26–34, 111–114
   textblock.py901188%22, 27, 39, 44, 71, 79, 104, 116, 139, 160, 189
src/stratigraphy/util
   dataclasses.py32391%37–39
   predictions.py723453%72, 95–115, 143–187
   util.py341362%41, 69–76, 90–92, 116–117
TOTAL237398658% 

Tests Skipped Failures Errors Time
99 0 💤 0 ❌ 0 🔥 7.749s ⏱️

Please sign in to comment.