From 4d82070a405d2bd0d78ad3fe2789880f39dd9730 Mon Sep 17 00:00:00 2001 From: dcleres Date: Wed, 18 Sep 2024 15:48:27 +0200 Subject: [PATCH 01/14] Added illustration matcing to the extraction of groundwater data --- .../data_extractor/data_extractor.py | 28 ++++-- .../assets/267123077-bp_page1_template.npy | Bin 0 -> 1328 bytes .../assets/697243001-bp_page1_template.npy | Bin 0 -> 1628 bytes .../groundwater/groundwater_extraction.py | 85 +++++++++++++++++- 4 files changed, 102 insertions(+), 11 deletions(-) create mode 100644 src/stratigraphy/groundwater/assets/267123077-bp_page1_template.npy create mode 100644 src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy diff --git a/src/stratigraphy/data_extractor/data_extractor.py b/src/stratigraphy/data_extractor/data_extractor.py index 2889df48..842c1766 100644 --- a/src/stratigraphy/data_extractor/data_extractor.py +++ b/src/stratigraphy/data_extractor/data_extractor.py @@ -122,14 +122,28 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]: list[TextLine]: The lines close to the key. """ key_rect = key_line.rect - elevation_search_rect = fitz.Rect( - key_rect.x0 - self.search_left_factor * key_rect.width, - key_rect.y0, - key_rect.x1 + self.search_right_factor * key_rect.width, - key_rect.y1 + self.search_below_factor * key_rect.height, - ) - feature_lines = [line for line in lines if line.rect.intersects(elevation_search_rect)] + feature_lines = self.get_lines_near_rect(lines, key_rect) # makes sure the line with the key is included first in the extracted information and the duplicate removed feature_lines.insert(0, key_line) return list(dict.fromkeys(feature_lines)) + + def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]: + """Find the lines of the text that are close to a given rectangle. + + Args: + lines (list[TextLine]): Arbitrary text lines to search in. + rect (fitz.Rect): The rectangle to search around. + + Returns: + list[TextLine]: The lines close to the rectangle. + """ + search_rect = fitz.Rect( + rect.x0 - self.search_left_factor * rect.width, + rect.y0, + rect.x1 + self.search_right_factor * rect.width, + rect.y1 + self.search_below_factor * rect.height, + ) + feature_lines = [line for line in lines if line.rect.intersects(search_rect)] + + return feature_lines diff --git a/src/stratigraphy/groundwater/assets/267123077-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/267123077-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..3c0cf6c741d90e4cbe47be719cf149452a1f039e GIT binary patch literal 1328 zcmbV~+fy557{=?Ze+75w3>S9v%0Iw6t9GPJbsWke2o|B79SFn}fk>%Ri*2!;v7@yW z2dAV_LQDu@*(5+x2rUT=rw|V2z;4K9&-@7b5%v4+lf2P^u5 z)kDkG1ECKF!WTxKU2sGz27@0u!WDzz_lF|q1|sLL#x6SI<^4~~ZQ)CfXx+$4)fcSA z5&rmdPn~t9!Rl}5_P1YM>ad5~``B(f+t=r{*_}UlkM6p5Z%=NFJzD?%$^PBRt^3Z@ zjoHOfPv~oRFY=u~^sSE_^(~DBqIa0p@!;BRX7vucc8iJK@U7kSJ-a@= zJmOjyb+cnL%zZX`Z!S3QJbXASO!(xV+`_%dg9om|3HSc5?#wR}`G=2oHJn#)QNfao zRT%>TfulnHB!8IWGgF8d4NP(RCH;I^hcVtFfP( zhuojglzQiCtih_DV6{C|X`8LI&eZq%>w5e(J#$q(bC-x2KrILASlqHCBKm|#Q*1O%!C$OuD?U;!hV zgNlJdL9}WpCLkwE(PVNyzP^x1(3Qf(YTHN5EG?#l#bkbQHyuvw#x}R; zxDlcfEE0pd5(&vuO+dHQBrW2cnJqGqZXieMq(8-}G6QmJD{fZa^T^ALMc^z^FITlb6V95ZG z_5#x0(~&a=;2#LBPzx^ra8W0xDmqnAK?b6RB*~B^U4a_Vp{ih6fH3)|vbB>}%n6bZ HFq8iaz48X& literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..2aaa6d3d6ab0ea4caf2fcc4c715af219c46ec51e GIT binary patch literal 1628 zcmbW1`BM{T9L5WBBq88AIwFz~Mx&OF2esA%J5>ZJ1(oWkMMNy1Vn-ZQ&`PH&^*}i+ z8tPrE;5FV3byOh9CLsq2_a%xscC*Ws}g7cQxDoTz(J;92sjBtxLHP9 zl|Wuh4V*z$U`WFaU;}B9dr$~R6>eAts>a&Di-?l4CHOYZ92lDz)e!zp?9Gg9MdsQ1M6%WaJ3BM9YfCc5l{Ch**oIy8g+M# z;60+F{ zeaZb><(JEkNzNTDJ9((+NY43F`I3^tKP0DaUpZTKz2we~(py*0mYvQ&c6e9uiM(G9 z@5)SDmzkEFzI81>B#6i71O;(|c$^8ryqTdB#|QC3c-#;!Cz#9Oa{_^bJRVHI{6F!z zoFGmhpBo4~Fu;FI{(fmYx20!p&pVjAZ~xw$z1c?#3v%}C%FfJAOWmHbE+uF8-aWf= z($jYA*q*Ut$Ii60^v#>Mu2~Zw7q>PsF)21SEddXLsQ%`WVH4?UVezgZrf+n>G)VQjG1U;$Hq5pIAMtju9zE^W4Ktk|sZP>u zVCZ&_$AEiU2AS?bvI{Y(M@OBbq{(Jdo-NwHB_;mkiGqt2Co3)-KV5v_aDK++--~Ws zJ9GW&>8e|$<)uf8Pvu>@T=d|vq~tl40>*^m;uDt2l5woOm&-Bin} zLMH`q{d_t8{z3def1Yn3-;e7bJc&Owm@|7vIGwPbPh{LkipDvp{LF(+oh;^@fP0zvqS70aJKen7blWPhu% zx1pm&+1J}??CUdsce~|9)#HJ+6awgBpy6g0xFD)@o1t#~i`o~rY93cztt`7=U3ssn^3NOPH?NmnsXTL8 zavJ0ZcP~DAbm>lYMfKf^hmR!2UMY!bDO54W2h=cNYjauU1OgENJE-O(-Z?<@((bN7 zqRor9crc9YLrs_dM{~UNxhe9W{D=MUq?ICq*Y2I^@IhYkw(Va1gZe{0Kq(F zDS@h7sInhX;1(GH%L!Npg3DM7gj=L;xDH3_X@`=w1ExVN3OEL_0B%;0L6xB&kr9Z3 zwW%ps0Vqn?AP*uX%?iLe)~cpq6*Hy=!UVVnPE-!swO+J=F)O{WhC%d%Nkf5ev#P-z z-I#*3swlVt><0(YHi$rEj9EQ^=mxEgURcXoH2;b)YiL+Egz3E&C2fRQz+sD)wl-1L KdO%dtqWv48XVoMC literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index 1c3adbdf..3eab37c4 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -2,12 +2,15 @@ import abc import logging +import os from dataclasses import dataclass from datetime import date as dt from datetime import datetime +from pathlib import Path import fitz import numpy as np +import skimage as ski from stratigraphy.data_extractor.data_extractor import DataExtractor, ExtractedFeature from stratigraphy.groundwater.utility import extract_date, extract_depth, extract_elevation from stratigraphy.util.extract_text import extract_text_lines @@ -267,6 +270,80 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G else: raise ValueError("Could not extract all required information from the lines provided.") + def load_templates(self) -> list[np.ndarray]: + """Load the templates for the groundwater information. + + Returns: + list[np.ndarray]: the loaded templates + """ + templates = [] + template_dir = os.path.join(os.path.dirname(__file__), "assets") + for template in os.listdir(template_dir): + if template.endswith(".npy"): + templates.append(np.load(os.path.join(template_dir, template))) + return templates + + def get_groundwater_from_illustration( + self, lines: list[TextLine], page_number: int + ) -> list[GroundwaterInformationOnPage]: + """Extracts the groundwater information from an illustration. + + Args: + lines (list[TextLine]): the lines of text to extract the groundwater information from + page_number (int): the page number (1-based) of the PDF document + Returns: + list[GroundwaterInformationOnPage]: the extracted groundwater information + """ + extracted_groundwater_list = [] + + # convert the doc to an image + page = self.doc.load_page(page_number - 1) + pix = page.get_pixmap(matrix=fitz.Matrix(3, 3)) + filename = Path(self.doc.name).stem + png_filename = f"{filename}-{page_number + 1}.png" + png_path = f"/tmp/{png_filename}" # Local path to save the PNG + + pix.save(png_path) + + # load the image + img = ski.io.imread(png_path) + + # extract the groundwater information from the image + for template in self.load_templates(): + result = ski.feature.match_template(img, template) + ij = np.unravel_index(np.argmax(result), result.shape) + # confidence = np.max(result) # TODO - use confidence to filter out bad matches + top_left = (ij[1], ij[0]) + illustration_rect = fitz.Rect( + top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] + ) + + # convert the illustration_rect to the coordinate system of the PDF + horizontal_scaling = page.rect.width / img.shape[1] + vertical_scaling = page.rect.height / img.shape[0] + pdf_illustration_rect = fitz.Rect( + illustration_rect.x0 * horizontal_scaling, + illustration_rect.y0 * vertical_scaling, + illustration_rect.x1 * horizontal_scaling, + illustration_rect.y1 * vertical_scaling, + ) + + # extract the groundwater information from the image using the text + groundwater_info_lines = self.get_lines_near_rect(lines, pdf_illustration_rect) + + # sort the lines by their proximity to the key line center, compute the distance to the key line center + key_center = (illustration_rect.x0 + illustration_rect.x1) / 2 + groundwater_info_lines.sort(key=lambda line: abs((line.rect.x0 + line.rect.x1) / 2 - key_center)) + try: + extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page) + if extracted_gw.groundwater.depth: + extracted_groundwater_list.append(extracted_gw) + except ValueError as error: + logger.warning("ValueError: %s", error) + logger.warning("Could not extract groundwater information from the lines near the key.") + + return extracted_groundwater_list + def extract_groundwater(self) -> list[GroundwaterInformationOnPage]: """Extracts the groundwater information from a borehole profile. @@ -282,10 +359,10 @@ def extract_groundwater(self) -> list[GroundwaterInformationOnPage]: lines = extract_text_lines(page) page_number = page.number + 1 # page.number is 0-based - found_groundwater = ( - self.get_groundwater_near_key(lines, page_number) - # or XXXX # Add other techniques here - ) + found_groundwater = self.get_groundwater_near_key(lines, page_number) + if not found_groundwater: + logger.info("No groundwater found near the key on page %s.", page_number) + found_groundwater = self.get_groundwater_from_illustration(lines, page_number) if found_groundwater: groundwater_output = ", ".join([str(entry.groundwater) for entry in found_groundwater]) From 25b40329abc91aa72ba8193414665d3e6527f86d Mon Sep 17 00:00:00 2001 From: dcleres Date: Wed, 18 Sep 2024 22:22:50 +0200 Subject: [PATCH 02/14] Minor fix for the page and page number confusion --- src/stratigraphy/groundwater/groundwater_extraction.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index d16639bf..217a4728 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -337,8 +337,8 @@ def get_groundwater_from_illustration( key_center = (illustration_rect.x0 + illustration_rect.x1) / 2 groundwater_info_lines.sort(key=lambda line: abs((line.rect.x0 + line.rect.x1) / 2 - key_center)) try: - extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page) - if extracted_gw.groundwater.depth: + extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page_number) + if extracted_gw.groundwater.depth or extracted_gw.groundwater.elevation: extracted_groundwater_list.append(extracted_gw) except ValueError as error: logger.warning("ValueError: %s", error) @@ -370,6 +370,8 @@ def extract_groundwater( if not found_groundwater: logger.info("No groundwater found near the key on page %s.", page_number) found_groundwater = self.get_groundwater_from_illustration(lines, page_number) + if not found_groundwater: + logger.info("No groundwater found in the illustration on page %s.", page_number) if terrain_elevation: # If the elevation is provided, calculate the depth of the groundwater From e7868951917c450999552826d2826681ce174e0d Mon Sep 17 00:00:00 2001 From: dcleres Date: Fri, 20 Sep 2024 09:05:10 +0200 Subject: [PATCH 03/14] Bug fixes and case by case improvements --- .../data_extractor/data_extractor.py | 4 +- .../assets/266126001-bp_page1_template.npy | Bin 0 -> 1388 bytes .../assets/266126001-bp_template.npy | Bin 0 -> 1388 bytes .../assets/268124336-bp_page1_template.npy | Bin 0 -> 5003 bytes .../assets/268124375-bp_page1_template.npy | Bin 0 -> 3638 bytes .../assets/697243001-bp_page1_template.npy | Bin 1628 -> 0 bytes .../groundwater/groundwater_extraction.py | 190 ++++++++++++++---- src/stratigraphy/groundwater/utility.py | 15 +- 8 files changed, 164 insertions(+), 45 deletions(-) create mode 100644 src/stratigraphy/groundwater/assets/266126001-bp_page1_template.npy create mode 100644 src/stratigraphy/groundwater/assets/266126001-bp_template.npy create mode 100644 src/stratigraphy/groundwater/assets/268124336-bp_page1_template.npy create mode 100644 src/stratigraphy/groundwater/assets/268124375-bp_page1_template.npy delete mode 100644 src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy diff --git a/src/stratigraphy/data_extractor/data_extractor.py b/src/stratigraphy/data_extractor/data_extractor.py index cf1f4045..bf052656 100644 --- a/src/stratigraphy/data_extractor/data_extractor.py +++ b/src/stratigraphy/data_extractor/data_extractor.py @@ -48,6 +48,8 @@ class DataExtractor(ABC): search_right_factor: float = 0 # How much below a key do we look for the feature information, as a multiple of the key line height search_below_factor: float = 0 + # How much above a key do we look for the feature information, as a multiple of the key line height + search_above_factor: float = 0 preprocess_replacements: dict[str, str] = {} @@ -140,7 +142,7 @@ def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]: """ search_rect = fitz.Rect( rect.x0 - self.search_left_factor * rect.width, - rect.y0, + rect.y0 - self.search_above_factor * rect.height, rect.x1 + self.search_right_factor * rect.width, rect.y1 + self.search_below_factor * rect.height, ) diff --git a/src/stratigraphy/groundwater/assets/266126001-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/266126001-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..a6abee21776c4d497c17650a5ce6020ca6e23322 GIT binary patch literal 1388 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JQ);NLqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$-Mmh?{ra-Kzqfo0r4&eGvFOZu0pVp|P(Z7HH@7^VVe*L0?MN}~Y0Bpx0%>V!Z literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/assets/266126001-bp_template.npy b/src/stratigraphy/groundwater/assets/266126001-bp_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..a6abee21776c4d497c17650a5ce6020ca6e23322 GIT binary patch literal 1388 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JQ);NLqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$-Mmh?{ra-Kzqfo0r4&eGvFOZu0pVp|P(Z7HH@7^VVe*L0?MN}~Y0Bpx0%>V!Z literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/assets/268124336-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/268124336-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..a31e63ec820d772f01e94cad4eb07c0fad1b0529 GIT binary patch literal 5003 zcmeH}TQ6Ku6vx}TMBQ%{9>{o*QBl!KJ@mrvwC(;b$Vimb{_nb+4EnQv-aBiti8^D_aZ;@W%1oR?RR?2*6Nlj zoB5f^{I>0}IniXUX|lE2EN?2BY}V>?{c}r0OEvnI*OumL z?+;gSc2&4C$<6v+bp>ZPo0col-FykBr>A82?JvOe)q(F02xr6{g?dP=GM=2Ah@1GJ zL?fPXrel-{imR{?@lg>}0uWbr-A#Sj2d9~_6;o4Fy}iBt{rzus=<4d~?(VkPY&|_a zeSLlH?d|Xu2b!Ck8yg$z>+9?4>KYmvnjC=C*49>4RlRngrl!VfwITvcj24RpjaVxy zE6dBv%gV}1N=ja7C@wB8EG*2=&(F=x&CJZq$;rvf%gf5j%E-t-oSvR`nmuF z`}_L(dU$x;`yFU7g@=cGdwc(GaV~jzc`3tP4Z*>|=mAp%1qB5L1`^iO)03D!K0fy~ zFcH>}kdV;O&<74gL_|bIMa9O(rl+UR%*_043Fz&1`||Sg%F4=n9adLY*VfiHHa0dl zH$Q6F-rmMpu)n|mS;OJs;TH|+2uM})mIV|uKnaBFN6qW}q!2-g5OFieA0GqjB~AKd zj2~e-{SqW$(Ma0z_~irQk29;LX-IIQ38W{WKR!NYRM2-1#g+8}(Sn`-GYkIWpw~0+ zp9%k|G2R)q@C$ClVN%LwcXwASANwZ_Xs9cJR%jm)cXoC}$x~ZfTOTy6udh>?fD)x( zDP4i^rKP3$`FV<&n$+hHzmSznBKthECG+VKvgwu*?%mV`hG-P}xCMJf4hVTR^2L}fc zhlhtpM@JdQ#>PfQM&OL&}PEA58eA=bI;a2B?bRg&kq-nSM7|zE_V)Jph_LN+8{r2B2l5L3 l%Ue+EQfo0F%Go`MLFUTjo|&txRGtPzIp31o9dO$n_z6V~1OEU3 literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/assets/268124375-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/268124375-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..552f72e1d44a0ca052e0fdf3de09dd9ca17ead93 GIT binary patch literal 3638 zcmc&$ZBtrD6i&x)ooSP3`Xx0fnp9%M7ZfR?PBa?FTD1hDh=_=Qf*Lh|#uv0ke?X^o zrj!5Cb|&NJv|s!Mt(uH8erg2n8*t%Xq0hZ*kO6EHGZUYAxcBT~ckel8&)MCr-u@ng z`IAcvmsXTcm(Sr<>J`esyjrPJD92~Ley@GTHtTh|Bz=c{+UJ7a=eEze(7vPAsTA6J zm7=a(rC3#*2cM>7h%yq1#G7F7pWqSAGPDeqW+E&XWw}g5G7yhM<9s|8=Q%zWOT+}m8!C)s=BVarZNN26I7~Hnrd}@okp#xtE^M2>or=f zRHhuJ`)_gY-o4h=*0#2`_V)IUjt&R_1mJ;; zw$6?R-CdnsU0vUH^?cXe+tb@^Fm&}9x_f%i=kepmu~;na0$dUV0qd~8zrXeKkMrIc z-;?QSkIOSL<{7t7xgE1p6H`;3@o^^zVRF)Kw~tz_mQkD4H3p7=4*T%fs70d9Vjmew z(=j>>jni&*j9P5NgEota&^~N-+J-HrzL6o5bIfY94w_9y(<6g)f#L!Y77_z-@4-`UwYKR=I{XSpEdfFSUEG#aI&5jrA~ig1)HKt?h#AcV9m zYbcI;XEI?fO*YK26ib?zfeo|SL^`t)r5|K;4zWy-VQ7Yd84*N2(*=@>@B&sbbdV6! zN;5MdnmVGW5W`R)giuIo(sVc+4$@&NO)3-y(FDTuDQT8QlVT{23Z9Y}a{v%L^SY!BE^uuouPkOWXxN}B@!@1+39Ko9gr*DR8;`JN{O zvtB4qXhc!^mK>537xI~Xs)NIM38%NVekv#^C@#K!`SNF1u6$ltc%`W5YT>ma!mCAv zMPFPizIj7YT5{v7o7aoKgj{l~6iu-7_O07xUn@#V%F4>h%gdEYCH6G3$w9cQs;E}i zRMk{hRH|wr~ zfA;K$_4T#2wZQ7?%5vZ-VPI`(ePeZLWnp2_=l9R~yfc3P?BY`&xG?_&GBiH?W@fM- z&dtra-ENNz2ooNsbKLHl7#nvx?2eJqG3(H#{e4_GD~*veazf@L zSSlgrP>vP)=TZ(vPUv!HkTufL1tyZBkb(>f(93C(GsOofj!Bu20wH-WZwpy@{a;Pw zJca-eKFF4H4JJ0j3;B4D#^W)ZEIf}VE*vpzeGsr$=K}gnkjjOh=d$912Ell;H3frj zc6a~U+xvTee|K+h7YF^TSFc{bevRk<;^5!_A;2nJlnb#~1Sj3|=f7w)wR*j-sY&11 zsBdm=#uW&c9o&gpT3T=;$_4b~FN48wcz6i=FC>w$3UC3Z=l1rC<>f`fKwxF{EdqhS z^78W1($eDMBA#jZ{r&|R5H>eAanj4xAe~ts8Kjoe=>!49S)T?1`|h}iM8eV&n{h=% u$;JpX>l9k#d2TspX^6{Mh>yGk_#8GuK|m$qCV;PJ9*pm23n_&2x$rNfbGu~# literal 0 HcmV?d00001 diff --git a/src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/697243001-bp_page1_template.npy deleted file mode 100644 index 2aaa6d3d6ab0ea4caf2fcc4c715af219c46ec51e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1628 zcmbW1`BM{T9L5WBBq88AIwFz~Mx&OF2esA%J5>ZJ1(oWkMMNy1Vn-ZQ&`PH&^*}i+ z8tPrE;5FV3byOh9CLsq2_a%xscC*Ws}g7cQxDoTz(J;92sjBtxLHP9 zl|Wuh4V*z$U`WFaU;}B9dr$~R6>eAts>a&Di-?l4CHOYZ92lDz)e!zp?9Gg9MdsQ1M6%WaJ3BM9YfCc5l{Ch**oIy8g+M# z;60+F{ zeaZb><(JEkNzNTDJ9((+NY43F`I3^tKP0DaUpZTKz2we~(py*0mYvQ&c6e9uiM(G9 z@5)SDmzkEFzI81>B#6i71O;(|c$^8ryqTdB#|QC3c-#;!Cz#9Oa{_^bJRVHI{6F!z zoFGmhpBo4~Fu;FI{(fmYx20!p&pVjAZ~xw$z1c?#3v%}C%FfJAOWmHbE+uF8-aWf= z($jYA*q*Ut$Ii60^v#>Mu2~Zw7q>PsF)21SEddXLsQ%`WVH4?UVezgZrf+n>G)VQjG1U;$Hq5pIAMtju9zE^W4Ktk|sZP>u zVCZ&_$AEiU2AS?bvI{Y(M@OBbq{(Jdo-NwHB_;mkiGqt2Co3)-KV5v_aDK++--~Ws zJ9GW&>8e|$<)uf8Pvu>@T=d|vq~tl40>*^m;uDt2l5woOm&-Bin} zLMH`q{d_t8{z3def1Yn3-;e7bJc&Owm@|7vIGwPbPh{LkipDvp{LF(+oh;^@fP0zvqS70aJKen7blWPhu% zx1pm&+1J}??CUdsce~|9)#HJ+6awgBpy6g0xFD)@o1t#~i`o~rY93cztt`7=U3ssn^3NOPH?NmnsXTL8 zavJ0ZcP~DAbm>lYMfKf^hmR!2UMY!bDO54W2h=cNYjauU1OgENJE-O(-Z?<@((bN7 zqRor9crc9YLrs_dM{~UNxhe9W{D=MUq?ICq*Y2I^@IhYkw(Va1gZe{0Kq(F zDS@h7sInhX;1(GH%L!Npg3DM7gj=L;xDH3_X@`=w1ExVN3OEL_0B%;0L6xB&kr9Z3 zwW%ps0Vqn?AP*uX%?iLe)~cpq6*Hy=!UVVnPE-!swO+J=F)O{WhC%d%Nkf5ev#P-z z-I#*3swlVt><0(YHi$rEj9EQ^=mxEgURcXoH2;b)YiL+Egz3E&C2fRQz+sD)wl-1L KdO%dtqWv48XVoMC diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index d10d847e..77263483 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -2,6 +2,7 @@ import abc import logging +import math import os from dataclasses import dataclass from datetime import date as dt @@ -140,9 +141,10 @@ class GroundwaterLevelExtractor(DataExtractor): feature_name = "groundwater" # look for elevation values to the left, right and/or immediately below the key - search_left_factor: float = 2 + search_left_factor: float = 12 search_right_factor: float = 10 search_below_factor: float = 4 + search_above_factor: float = 4 preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"} @@ -213,7 +215,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G elevation = extract_elevation(text) - # Pattern for matching depth (e.g., "1,48 m u.T.") matched_lines_rect.append(line.rect) else: # Pattern for matching date @@ -222,6 +223,12 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G if extracted_date_str: text = text.replace(extracted_date_str, "").strip() date = extracted_date + matched_lines_rect.append(line.rect) + else: + # in case several dates are present, we skip the other dates + extracted_date, extracted_date_str = extract_date(text) + if extracted_date_str: + continue # Pattern for matching depth (e.g., "1,48 m u.T.") if not depth: @@ -286,65 +293,166 @@ def load_templates(self) -> list[np.ndarray]: return templates def get_groundwater_from_illustration( - self, lines: list[TextLine], page_number: int + self, lines: list[TextLine], page_number: int, terrain_elevation: Elevation | None ) -> list[GroundwaterInformationOnPage]: """Extracts the groundwater information from an illustration. Args: lines (list[TextLine]): the lines of text to extract the groundwater information from page_number (int): the page number (1-based) of the PDF document + terrain_elevation (Elevation | None): The elevation of the terrain. + Returns: list[GroundwaterInformationOnPage]: the extracted groundwater information """ extracted_groundwater_list = [] + confidence_list = [] # convert the doc to an image page = self.doc.load_page(page_number - 1) - pix = page.get_pixmap(matrix=fitz.Matrix(3, 3)) filename = Path(self.doc.name).stem png_filename = f"{filename}-{page_number + 1}.png" png_path = f"/tmp/{png_filename}" # Local path to save the PNG - - pix.save(png_path) + fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(png_path) # load the image img = ski.io.imread(png_path) + N_BEST_MATCHES = 5 + TEMPLATE_MATCH_THRESHOLD = 0.66 # extract the groundwater information from the image for template in self.load_templates(): + # Compute the match of the template and the image (correlation coef) result = ski.feature.match_template(img, template) - ij = np.unravel_index(np.argmax(result), result.shape) - # confidence = np.max(result) # TODO - use confidence to filter out bad matches - top_left = (ij[1], ij[0]) - illustration_rect = fitz.Rect( - top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] - ) - - # convert the illustration_rect to the coordinate system of the PDF - horizontal_scaling = page.rect.width / img.shape[1] - vertical_scaling = page.rect.height / img.shape[0] - pdf_illustration_rect = fitz.Rect( - illustration_rect.x0 * horizontal_scaling, - illustration_rect.y0 * vertical_scaling, - illustration_rect.x1 * horizontal_scaling, - illustration_rect.y1 * vertical_scaling, - ) - - # extract the groundwater information from the image using the text - groundwater_info_lines = self.get_lines_near_rect(lines, pdf_illustration_rect) - # sort the lines by their proximity to the key line center, compute the distance to the key line center - key_center = (illustration_rect.x0 + illustration_rect.x1) / 2 - groundwater_info_lines.sort(key=lambda line: abs((line.rect.x0 + line.rect.x1) / 2 - key_center)) - try: - extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page_number) - if extracted_gw.groundwater.depth or extracted_gw.groundwater.elevation: - extracted_groundwater_list.append(extracted_gw) - except ValueError as error: - logger.warning("ValueError: %s", error) - logger.warning("Could not extract groundwater information from the lines near the key.") - - return extracted_groundwater_list + for _ in range(N_BEST_MATCHES): + ij = np.unravel_index(np.argmax(result), result.shape) + confidence = np.max(result) # TODO - use confidence to filter out bad matches + if confidence < TEMPLATE_MATCH_THRESHOLD: + # skip this template if the confidence is too low to avoid false positives + continue + top_left = (ij[1], ij[0]) + illustration_rect = fitz.Rect( + top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] + ) + + # remove the matched area from the result to avoid finding the same area again + result[ + top_left[0] : top_left[0] + template.shape[0], top_left[1] : top_left[1] + template.shape[1] + ] = 0 + + # convert the illustration_rect to the coordinate system of the PDF + horizontal_scaling = page.rect.width / img.shape[1] + vertical_scaling = page.rect.height / img.shape[0] + pdf_illustration_rect = fitz.Rect( + illustration_rect.x0 * horizontal_scaling, + illustration_rect.y0 * vertical_scaling, + illustration_rect.x1 * horizontal_scaling, + illustration_rect.y1 * vertical_scaling, + ) + + # extract the groundwater information from the image using the text + groundwater_info_lines = self.get_lines_near_rect(lines, pdf_illustration_rect) + + # sort the lines by their proximity to the key line center, compute the distance to the key line center + def distance_to_key_center(line_rect: fitz.Rect, illustration_rect: fitz.Rect) -> float: + key_center_x = (illustration_rect.x0 + illustration_rect.x1) / 2 + key_center_y = (illustration_rect.y0 + illustration_rect.y1) / 2 + line_center_x = (line_rect.x0 + line_rect.x1) / 2 + line_center_y = (line_rect.y0 + line_rect.y1) / 2 + return math.sqrt((line_center_x - key_center_x) ** 2 + (line_center_y - key_center_y) ** 2) + + groundwater_info_lines.sort(key=lambda line: distance_to_key_center(line.rect, pdf_illustration_rect)) + try: + extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page_number) + if extracted_gw.groundwater.depth or extracted_gw.groundwater.elevation: + # Fill in the depth and elevation if they are not already filled in based on the terrain + if terrain_elevation: + if not extracted_gw.groundwater.depth and extracted_gw.groundwater.elevation: + extracted_gw.groundwater.depth = round( + terrain_elevation.elevation - extracted_gw.groundwater.elevation, 2 + ) + if not extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extracted_gw.groundwater.elevation = round( + terrain_elevation.elevation - extracted_gw.groundwater.depth, 2 + ) + + # Make a sanity check to see if elevation and depth make sense (i.e., they add up: + # elevation + depth = terrain elevation) + if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extract_terrain_elevation = round( + extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 + ) + if extract_terrain_elevation != terrain_elevation.elevation: + # If the extracted elevation and depth do not match the terrain elevation, we try + # to remove one of the items from the match and see if we can find a better match. + logger.warning( + "The extracted elevation and depth do not match the terrain elevation." + ) + logger.warning( + "Elevation: %s, Depth: %s, Terrain Elevation: %s", + extracted_gw.groundwater.elevation, + extracted_gw.groundwater.depth, + terrain_elevation.elevation, + ) + + # re-run the extraction and see if we can find a better match by removing one + # item from the current match + groundwater_info_lines_without_depth = [ + line + for line in groundwater_info_lines + if str(extracted_gw.groundwater.depth) not in line.text + ] + groundwater_info_lines_without_elevation = [ + line + for line in groundwater_info_lines + if str(extracted_gw.groundwater.elevation) not in line.text + ] + extracted_gw = self.get_groundwater_info_from_lines( + groundwater_info_lines_without_depth, page_number + ) + + if not extracted_gw.groundwater.depth: + extracted_gw = self.get_groundwater_info_from_lines( + groundwater_info_lines_without_elevation, page_number + ) + + if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extract_terrain_elevation = round( + extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 + ) + + if extract_terrain_elevation != terrain_elevation.elevation: + logger.warning( + "The extracted elevation and depth do not match the terrain elevation." + ) + logger.warning( + "Elevation: %s, Depth: %s, Terrain Elevation: %s", + extracted_gw.groundwater.elevation, + extracted_gw.groundwater.depth, + terrain_elevation.elevation, + ) + continue + + # Only if the groundwater information is not already in the list + if extracted_gw not in extracted_groundwater_list: + if extracted_gw.groundwater.date: + extracted_groundwater_list.append(extracted_gw) + confidence_list.append(confidence) + + # Remove the extracted groundwater information from the lines to avoid double extraction + for line in groundwater_info_lines: + # if the rectangle of the line is in contact with the rectangle of the extracted + # groundwater information, remove the line + if line.rect.intersects(extracted_gw.rect): + lines.remove(line) + + except ValueError: + continue + + # TODO: Maybe we could stop the search if we found a good match with one of the templates + + return extracted_groundwater_list, confidence_list def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[GroundwaterInformationOnPage]: """Extracts the groundwater information from a borehole profile. @@ -367,7 +475,13 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Groun found_groundwater = self.get_groundwater_near_key(lines, page_number) if not found_groundwater: logger.info("No groundwater found near the key on page %s.", page_number) - found_groundwater = self.get_groundwater_from_illustration(lines, page_number) + found_groundwater, confidence_list = self.get_groundwater_from_illustration( + lines, page_number, terrain_elevation + ) + logger.info("Confidence list: %s", confidence_list) + print("Confidence list: %s", confidence_list) + logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater) + print("Found groundwater from illustration on page %s: %s", page_number, found_groundwater) if not found_groundwater: logger.info("No groundwater found in the illustration on page %s.", page_number) diff --git a/src/stratigraphy/groundwater/utility.py b/src/stratigraphy/groundwater/utility.py index 10c16133..e476b134 100644 --- a/src/stratigraphy/groundwater/utility.py +++ b/src/stratigraphy/groundwater/utility.py @@ -45,12 +45,15 @@ def extract_depth(text: str, max_depth: int) -> float | None: for pattern in depth_patterns: depth_match = regex.search(pattern, corrected_text) if depth_match: - depth = float(depth_match.group(1).replace(",", ".")) - if depth > max_depth: - # If the extracted depth is greater than the max depth, set it to None and continue searching. - depth = None - else: - break + try: + depth = float(depth_match.group(1).replace(",", ".")) + if depth > max_depth: + # If the extracted depth is greater than the max depth, set it to None and continue searching. + depth = None + else: + break + except ValueError: + continue return depth From 283435a8a7449442e7ecd341e53f74b5660ecebd Mon Sep 17 00:00:00 2001 From: dcleres Date: Fri, 20 Sep 2024 13:59:30 +0200 Subject: [PATCH 04/14] Uploaded latest changes --- Screenshot 2024-09-16 at 19.04.42_template.npy | Bin 0 -> 1388 bytes config/matching_params.yml | 8 +++++++- .../data_extractor/data_extractor.py | 16 +++++++++++++++- .../assets/269126062-bp_page1_template.npy | Bin 0 -> 668 bytes .../assets/700246002-bp_page1_template.npy | Bin 0 -> 2888 bytes .../groundwater/groundwater_extraction.py | 5 +++-- 6 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 Screenshot 2024-09-16 at 19.04.42_template.npy create mode 100644 src/stratigraphy/groundwater/assets/269126062-bp_page1_template.npy create mode 100644 src/stratigraphy/groundwater/assets/700246002-bp_page1_template.npy diff --git a/Screenshot 2024-09-16 at 19.04.42_template.npy b/Screenshot 2024-09-16 at 19.04.42_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..a6abee21776c4d497c17650a5ce6020ca6e23322 GIT binary patch literal 1388 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JQ);NLqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$-Mmh?{ra-Kzqfo0r4&eGvFOZu0pVp|P(Z7HH@7^VVe*L0?MN}~Y0Bpx0%>V!Z literal 0 HcmV?d00001 diff --git a/config/matching_params.yml b/config/matching_params.yml index 1af66a56..8462664a 100644 --- a/config/matching_params.yml +++ b/config/matching_params.yml @@ -115,6 +115,11 @@ coordinate_keys: - coordonnées - coordonn +coordinate_fp_keys: + + +groundwater_fp_keys: + - Wasserstau groundwater_keys: # German @@ -132,7 +137,6 @@ groundwater_keys: - W SP - Gr.W.spiegel - GrW Sp - - Wsp. - Wsp - GW-Spiegel - Grundwasser @@ -170,3 +174,5 @@ elevation_keys: - Ansatzhöhe - Terrainkote +elevation_fp_keys: + diff --git a/src/stratigraphy/data_extractor/data_extractor.py b/src/stratigraphy/data_extractor/data_extractor.py index bf052656..301c925f 100644 --- a/src/stratigraphy/data_extractor/data_extractor.py +++ b/src/stratigraphy/data_extractor/data_extractor.py @@ -40,6 +40,7 @@ class DataExtractor(ABC): doc: fitz.Document = None feature_keys: list[str] = None + feature_fp_keys: list[str] = None feature_name: str = None # How much to the left of a key do we look for the feature information, as a multiple of the key line width @@ -65,6 +66,11 @@ def __init__(self, document: fitz.Document): self.doc = document self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"] + self.feature_fp_keys = ( + read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] + if read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] + else [] + ) def preprocess(self, value: str) -> str: for old, new in self.preprocess_replacements.items(): @@ -107,7 +113,15 @@ def find_feature_key(self, lines: list[TextLine], allowed_error_rate: float = 0. for line in lines: match = pattern.search(line.text) if match: - matches.add(line) + # Make sure the key is not in the false positive list + is_fp_key = False + for fp_key in self.feature_fp_keys: + if fp_key in line.text: + is_fp_key = True + break + + if not is_fp_key: + matches.add(line) return list(matches) diff --git a/src/stratigraphy/groundwater/assets/269126062-bp_page1_template.npy b/src/stratigraphy/groundwater/assets/269126062-bp_page1_template.npy new file mode 100644 index 0000000000000000000000000000000000000000..339d8820dbf9575e88601e24a21d6e830d315e73 GIT binary patch literal 668 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JQ);NLqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$dIts=HK&+{wP^&-|;4(7$PhDhV^Z)++|3DBDLS+;e_y5BOkjZG^`}hBI z=KP;IlT38!5}J7k?z(mV85qb%{{9G~Fj+vqynBa&_U`?kmxmplJ&U69*Dp--@QD-` MAF5y+KyWmgzIQJ3HPF^AGNx=e*~6-|w3_&pi8;w=uUkC%^0;Z+-Pv zZP>cXt!t~p=7xstsEG@U3=ON=QoXIZ;*0Vv)$1xp>!}r+wpN1r)(sV3R)Tzbd~9Us ztc1wWnKL3ozYG0;etrk{*L?fUMq%$0#IC||4UVY^LWkp86xG;mN}E-Mqgo76)1;Ba z^#C}kAuuh48fcr5K}`gIDgXmG4m?mI` zfC;;V!6DKvVHvfc>(}hU^hIfluHU>$(#E>Fnq&1lVU++w6$A?7s2oQWm|cn56n2Zk zHUgtI32Kw#2m}a;fmDiE#i&J!4a1}rrtA;^gTf4=Vh{~!fe9;wTV&{vgs{qyAqj2) z6BL+DL?99xmm$MK(kdcH`kW~A;>pix`KgHu=3KjRnIH}O4}Z1i@HcX8uanWcD2cuR8oox~D}kPj7Iocn#WSJk{++v5Oame?B53YBW16zURmcM1g$8XES+ z&5ci(pLFf|Pbj9#E-p?lEZ)5RySB@1$D0rDKeA`n{vBU_wWV}bN!jYM%)G49&nn74 z|2(s>C^hmW@h&&Mk;$ZPy051&8}o}Z_;KZyVI@&y2VH1+rL@bmQa^YZld z^ziZY1n}c|jq~>p9veM#X8VP61Tj!lUXq-(s&R+IZkt!@=g3`ZI0EkJNvEw0?KPsYCUR zM;jUsx3)L#tJ`t1>B!OgeR~ho*6!Ta(0J(ky05?9`9^s$T@ax(w zQ=`HnV&j@lHzSB?=b@Us6&W`>uM;@zq!gS>%ef64XX2a&%qnFmJ?)S&PKe{w9H-?t z9mi=#qg&6pO`|iNN*66-9q=f3895giGSWEY1`}LbuwmVdYV>VfznlAubN6%H00#(b zVx7j`m$!;ml_oFw^lV!zQ1V^vrri9@Kkq-l38ljUGY-|jpipP-)taT+K^QWLpT4-& zCwVNEJ$=@78+g_E=(n!!4p`Bp*Y&{iXOQfP$=Ih+zXYiQey-7rbVj+L=Xtl_#j|G* zA3eI=+tVrPyVoaqsMCuzN|9RL_wvzgi&5xAMEoZg3yU)n=Ek@G*n%>~9osj4lALtq zm+Kg&GwS(gPVZT>wxF;yJ+Cw~vnZ=zMd6an)c85GBI2Sx{MVEz)2GHJ#7~b7|9HmK zxP;i4S<%rmqvj?jMa4(Xh@L)m#>X*9(FqHa!eS%CVxxm5O`06~enjkyX_28ZbHnE^ zPD+?NJ1Qn}PGa1{&Ks;#56L=8m*)T_m#(!F&i_&)bg&>R!HH06bq_M9g%4Pp69;lt2%POS5Jr#Qbo%jc}MY zRF%fWMsz%QNE0RuQ3?2e@Vox#;@|0dd0)`;_}PoQSAV{6xAXV=4{!Z>_vX*nfB5lA z+u7C=XWLG;pFeY|>3Dtp!99Cwo0^XQboD}0bN#XA11*;uo6a|!KHu2f*4Wf?@^ss& zi~g9d3|N+C2V-zJ3>;%{Glthr<7;rJG6ol8Kt?2l9cCSRx6^=HWq+CY;F_P*2yj)UZ*3WUox!I^|=_uoR>SsY#%xj=+;Jl!+$wR?dZpy<_N}YF-M($> zwlCIImKPS~re`d!tojV3yLNx`pPCKJR~M}Ne8q-MIEp!0AwmMYr>F7cW^f|Ngz-Daw?cwIF0-(4M+O zgJwAaZhusZ3`6(`{Mu=5Ik9Wy0_#Yx%?^{KhNRR4rN#)Q z9ffdOBaz-;UYb37cJ#UP%^0E2$xR6gnXtR==+KZHM-?RK4G@^H$-%$Oe{^l*+U(U; zh2`rDtG8F}KlbhBnu_Y}>o?b|SzTFDQB}UF`roV9u3Wih`TEV3OH1<$O7e0GKCRkN zs@L;L3?l4(q(w+u#H>wDp)wkSNJQ$Qlpx|ZF@wvs%13J|ilY;wPMmA75xVT0ws1k^w?FyiT!Ic!I006{6s2qfv#*{Rs7zGZ list[np.ndarray]: templates = [] template_dir = os.path.join(os.path.dirname(__file__), "assets") for template in os.listdir(template_dir): - if template.endswith(".npy"): + if template.endswith(".npy"): # and template.startswith("700246002-bp_page1_template"): templates.append(np.load(os.path.join(template_dir, template))) return templates @@ -447,7 +447,8 @@ def distance_to_key_center(line_rect: fitz.Rect, illustration_rect: fitz.Rect) - if line.rect.intersects(extracted_gw.rect): lines.remove(line) - except ValueError: + except ValueError as error: + logger.warning("ValueError: %s", error) continue # TODO: Maybe we could stop the search if we found a good match with one of the templates From 3812178fb297695f7ee604c5e3c7f20a8a66e7c4 Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 23 Sep 2024 11:14:29 +0200 Subject: [PATCH 05/14] Fixed the removal of areas that have already been matched --- config/matching_params.yml | 2 ++ .../groundwater/groundwater_extraction.py | 17 ++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/config/matching_params.yml b/config/matching_params.yml index 8462664a..5e672653 100644 --- a/config/matching_params.yml +++ b/config/matching_params.yml @@ -120,6 +120,8 @@ coordinate_fp_keys: groundwater_fp_keys: - Wasserstau + - Grundwasser- + - Grundwasserfassung groundwater_keys: # German diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index 00a4054a..3a917b39 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -141,7 +141,8 @@ class GroundwaterLevelExtractor(DataExtractor): feature_name = "groundwater" # look for elevation values to the left, right and/or immediately below the key - search_left_factor: float = 12 + search_left_factor: float = 3 # NOTE: check files 267125334-bp.pdf, 267125338-bp.pdf, and 267125339-bp.pdf if this + # value is too high, as it might lead to false positives search_right_factor: float = 10 search_below_factor: float = 4 search_above_factor: float = 4 @@ -337,9 +338,12 @@ def get_groundwater_from_illustration( ) # remove the matched area from the result to avoid finding the same area again + x_area_to_remove = int(0.75 * template.shape[1]) + y_area_to_remove = int(0.75 * template.shape[0]) result[ - top_left[0] : top_left[0] + template.shape[0], top_left[1] : top_left[1] + template.shape[1] - ] = 0 + int(illustration_rect.y0) - y_area_to_remove : int(illustration_rect.y1) + y_area_to_remove, + int(illustration_rect.x0) - x_area_to_remove : int(illustration_rect.x1) + x_area_to_remove, + ] = float("-inf") # convert the illustration_rect to the coordinate system of the PDF horizontal_scaling = page.rect.width / img.shape[1] @@ -435,10 +439,9 @@ def distance_to_key_center(line_rect: fitz.Rect, illustration_rect: fitz.Rect) - continue # Only if the groundwater information is not already in the list - if extracted_gw not in extracted_groundwater_list: - if extracted_gw.groundwater.date: - extracted_groundwater_list.append(extracted_gw) - confidence_list.append(confidence) + if extracted_gw not in extracted_groundwater_list and extracted_gw.groundwater.date: + extracted_groundwater_list.append(extracted_gw) + confidence_list.append(confidence) # Remove the extracted groundwater information from the lines to avoid double extraction for line in groundwater_info_lines: From bee7c57587c6da3b248520252c8091fd99da706b Mon Sep 17 00:00:00 2001 From: dcleres Date: Thu, 26 Sep 2024 10:14:21 +0200 Subject: [PATCH 06/14] Added ski to the dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 98e8a7b4..4cd116b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "opencv-python-headless", "quads>=1.1.0", "numpy<2", + "scikit-image==0.24.0" ] [project.optional-dependencies] From d28489ddc4bb5db44317695c2c7c5ca44cc87c4f Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 14 Oct 2024 11:18:30 +0200 Subject: [PATCH 07/14] Addressed code review comments --- README.md | 8 +++++ .../groundwater/groundwater_extraction.py | 33 ++++++++++++------- src/stratigraphy/groundwater/utility.py | 11 ++----- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 45c42392..4a370e90 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,14 @@ With regard to the extraction of coordinates, the [Swiss coordinate systems](htt #### Groundwater With the current version of the code, groundwater can only be found at depth smaller than 200 meters. This threshold is defined in `src/stratigraphy/groundwater/groundwater_extraction.py` by the constant `MAX_DEPTH`. +The groundwater is extracted in two main ways from the borehole documents. The first one aims to match a groundwater-related keyword in the text extracted from the document (e.g., groundwater, groundwater-level). The second technique focuses on extracting the groundwater-related illustration from the document by using template matching. The matching of the groundwater illustration is disabled by default as it significantly increases the runtime of the data extraction pipeline. You can control the activation of this feature by using the `IS_SEARCHING_GROUNDWATER_ILLUSTRATION`. + +Add the following line to the `.env` document to turn on the groundwater detection: + +``` +IS_SEARCHING_GROUNDWATER_ILLUSTRATION="True" +``` + ## Main contributors * Stijn Vermeeren [@stijnvermeeren-swisstopo](https://www.github.com/stijnvermeeren-swisstopo) (swisstopo) - Project Lead diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index 3a917b39..b6bc3c0e 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -140,6 +140,8 @@ class GroundwaterLevelExtractor(DataExtractor): feature_name = "groundwater" + is_searching_groundwater_illustration: bool = False + # look for elevation values to the left, right and/or immediately below the key search_left_factor: float = 3 # NOTE: check files 267125334-bp.pdf, 267125338-bp.pdf, and 267125339-bp.pdf if this # value is too high, as it might lead to false positives @@ -149,6 +151,11 @@ class GroundwaterLevelExtractor(DataExtractor): preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"} + def __init__(self, document): + super().__init__(document) + + self.is_searching_groundwater_illustration = os.getenv("IS_SEARCHING_GROUNDWATER_ILLUSTRATION") == "True" + def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[GroundwaterInformationOnPage]: """Find groundwater information from text lines that are close to an explicit "groundwater" label. @@ -289,7 +296,7 @@ def load_templates(self) -> list[np.ndarray]: templates = [] template_dir = os.path.join(os.path.dirname(__file__), "assets") for template in os.listdir(template_dir): - if template.endswith(".npy"): # and template.startswith("700246002-bp_page1_template"): + if template.endswith(".npy"): templates.append(np.load(os.path.join(template_dir, template))) return templates @@ -474,20 +481,24 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Groun """ for page in self.doc: lines = extract_text_lines(page) - page_number = page.number + 1 # page.number is 0-based + page_number = page.number + 1 # NOTE: page.number is 0-based found_groundwater = self.get_groundwater_near_key(lines, page_number) if not found_groundwater: logger.info("No groundwater found near the key on page %s.", page_number) - found_groundwater, confidence_list = self.get_groundwater_from_illustration( - lines, page_number, terrain_elevation - ) - logger.info("Confidence list: %s", confidence_list) - print("Confidence list: %s", confidence_list) - logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater) - print("Found groundwater from illustration on page %s: %s", page_number, found_groundwater) - if not found_groundwater: - logger.info("No groundwater found in the illustration on page %s.", page_number) + + ### Extract groundwater from illustration + if self.is_searching_groundwater_illustration: + found_groundwater, confidence_list = self.get_groundwater_from_illustration( + lines, page_number, terrain_elevation + ) + if found_groundwater: + logger.info("Confidence list: %s", confidence_list) + logger.info( + "Found groundwater from illustration on page %s: %s", page_number, found_groundwater + ) + else: + logger.info("No groundwater illustration found on page %s.", page_number) if terrain_elevation: # If the elevation is provided, calculate the depth of the groundwater diff --git a/src/stratigraphy/groundwater/utility.py b/src/stratigraphy/groundwater/utility.py index e476b134..35823de1 100644 --- a/src/stratigraphy/groundwater/utility.py +++ b/src/stratigraphy/groundwater/utility.py @@ -13,15 +13,8 @@ def extract_date(text: str) -> tuple[date | None, str | None]: return None, None date_str = date_match.group(1) - - for date_format in ("%d.%m.%Y", "%d.%m.%y"): - try: - date = datetime.strptime(date_str, date_format).date() - return date, date_str - except ValueError: - continue - - return None, None + date_format = "%d.%m.%y" if len(date_str.split(".")[2]) == 2 else "%d.%m.%Y" + return datetime.strptime(date_str, date_format).date(), date_str def extract_depth(text: str, max_depth: int) -> float | None: From 0cb47a98179e694b47701aa9d405d5931bb4b103 Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 14 Oct 2024 13:06:03 +0200 Subject: [PATCH 08/14] Updated the seach factors --- src/stratigraphy/groundwater/groundwater_extraction.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index b6bc3c0e..c04d0de8 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -143,11 +143,10 @@ class GroundwaterLevelExtractor(DataExtractor): is_searching_groundwater_illustration: bool = False # look for elevation values to the left, right and/or immediately below the key - search_left_factor: float = 3 # NOTE: check files 267125334-bp.pdf, 267125338-bp.pdf, and 267125339-bp.pdf if this - # value is too high, as it might lead to false positives + search_left_factor: float = 2 search_right_factor: float = 10 search_below_factor: float = 4 - search_above_factor: float = 4 + search_above_factor: float = 0 preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"} From a4a97ed75532da015dbb1e953941560754574c0b Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 14 Oct 2024 14:23:41 +0200 Subject: [PATCH 09/14] Reverted the extract depth function back to its implementation in main --- .vscode/settings.json | 1 + config/matching_params.yml | 1 + .../groundwater/groundwater_extraction.py | 2 ++ src/stratigraphy/groundwater/utility.py | 15 ++++++--------- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6767e200..da83894d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,6 +14,7 @@ "swissgeol", "swisstopo", "textblock", + "USCS", "venv" ] } \ No newline at end of file diff --git a/config/matching_params.yml b/config/matching_params.yml index 5e672653..dab1888a 100644 --- a/config/matching_params.yml +++ b/config/matching_params.yml @@ -122,6 +122,7 @@ groundwater_fp_keys: - Wasserstau - Grundwasser- - Grundwasserfassung + - GW/ # makes it possible to avoid false positives like "GW/" from the USCS Nomenclature columns groundwater_keys: # German diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index c04d0de8..dea40d9c 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -154,6 +154,8 @@ def __init__(self, document): super().__init__(document) self.is_searching_groundwater_illustration = os.getenv("IS_SEARCHING_GROUNDWATER_ILLUSTRATION") == "True" + if self.is_searching_groundwater_illustration: + logger.info("Searching for groundwater information in illustrations.") def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[GroundwaterInformationOnPage]: """Find groundwater information from text lines that are close to an explicit "groundwater" label. diff --git a/src/stratigraphy/groundwater/utility.py b/src/stratigraphy/groundwater/utility.py index 83fe2136..3925ba31 100644 --- a/src/stratigraphy/groundwater/utility.py +++ b/src/stratigraphy/groundwater/utility.py @@ -38,15 +38,12 @@ def extract_depth(text: str, max_depth: int) -> float | None: for pattern in depth_patterns: depth_match = regex.search(pattern, corrected_text) if depth_match: - try: - depth = float(depth_match.group(1).replace(",", ".")) - if depth > max_depth: - # If the extracted depth is greater than the max depth, set it to None and continue searching. - depth = None - else: - break - except ValueError: - continue + depth = float(depth_match.group(1).replace(",", ".")) + if depth > max_depth: + # If the extracted depth is greater than the max depth, set it to None and continue searching. + depth = None + else: + break return depth From 45bda630fb25cd95c511672ea4671108a1e096ef Mon Sep 17 00:00:00 2001 From: Stijn Vermeeren Date: Mon, 14 Oct 2024 14:56:24 +0200 Subject: [PATCH 10/14] code review --- README.md | 2 +- Screenshot 2024-09-16 at 19.04.42_template.npy | Bin 1388 -> 0 bytes 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 Screenshot 2024-09-16 at 19.04.42_template.npy diff --git a/README.md b/README.md index 9985ccd0..0d1ab8fd 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ With regard to the extraction of coordinates, the [Swiss coordinate systems](htt #### Groundwater With the current version of the code, groundwater can only be found at depth smaller than 200 meters. This threshold is defined in `src/stratigraphy/groundwater/groundwater_extraction.py` by the constant `MAX_DEPTH`. -The groundwater is extracted in two main ways from the borehole documents. The first one aims to match a groundwater-related keyword in the text extracted from the document (e.g., groundwater, groundwater-level). The second technique focuses on extracting the groundwater-related illustration from the document by using template matching. The matching of the groundwater illustration is disabled by default as it significantly increases the runtime of the data extraction pipeline. You can control the activation of this feature by using the `IS_SEARCHING_GROUNDWATER_ILLUSTRATION`. +The groundwater is extracted in two main ways from the borehole documents. The first one aims to match a groundwater-related keyword in the text extracted from the document (e.g., groundwater, groundwater-level). The second technique focuses on extracting the groundwater-related illustration from the document by using template matching. The matching of the groundwater illustration is disabled by default as it significantly increases the runtime of the data extraction pipeline. You can control the activation of this feature by using the `IS_SEARCHING_GROUNDWATER_ILLUSTRATION` environment variable. Add the following line to the `.env` document to turn on the groundwater detection: diff --git a/Screenshot 2024-09-16 at 19.04.42_template.npy b/Screenshot 2024-09-16 at 19.04.42_template.npy deleted file mode 100644 index a6abee21776c4d497c17650a5ce6020ca6e23322..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1388 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1JQ);NLqoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$-Mmh?{ra-Kzqfo0r4&eGvFOZu0pVp|P(Z7HH@7^VVe*L0?MN}~Y0Bpx0%>V!Z From 2a39943df63ccf7ab078c2783e74f658eb5103e1 Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 14 Oct 2024 16:41:54 +0200 Subject: [PATCH 11/14] Added changes related to the comments required duri ng PR --- README.md | 2 ++ pyproject.toml | 6 ++-- .../data_extractor/data_extractor.py | 19 +++-------- .../groundwater/groundwater_extraction.py | 34 +++++++++---------- 4 files changed, 26 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 0d1ab8fd..388e06ed 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Add the following line to the `.env` document to turn on the groundwater detecti IS_SEARCHING_GROUNDWATER_ILLUSTRATION="True" ``` +The extraction of groundwater relies on the `scikit-image` library. This library is part of the optional dependencies of this project as part of the `groundwater_illustration_matching` dependencies in the `pyproject.toml` file. If you wish to use the template matching algorithm to determine the groundwater elevation, depth, and date, please install this dependency before running the code. + ## Main contributors * Stijn Vermeeren [@stijnvermeeren-swisstopo](https://www.github.com/stijnvermeeren-swisstopo) (swisstopo) - Project Lead diff --git a/pyproject.toml b/pyproject.toml index 4cd116b3..c2e2b4d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,8 +34,7 @@ dependencies = [ "PyMuPDF>=1.23.26", "opencv-python-headless", "quads>=1.1.0", - "numpy<2", - "scikit-image==0.24.0" + "numpy<2" ] [project.optional-dependencies] @@ -55,6 +54,9 @@ visualize = [ devtools = [ "tqdm" ] +groundwater_illustration_matching = [ + "scikit-image==0.24.0" +] all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools]"] diff --git a/src/stratigraphy/data_extractor/data_extractor.py b/src/stratigraphy/data_extractor/data_extractor.py index 46e03af4..4df348db 100644 --- a/src/stratigraphy/data_extractor/data_extractor.py +++ b/src/stratigraphy/data_extractor/data_extractor.py @@ -66,11 +66,7 @@ def __init__(self, document: fitz.Document): self.doc = document self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"] - self.feature_fp_keys = ( - read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] - if read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] - else [] - ) + self.feature_fp_keys = read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] or [] def preprocess(self, value: str) -> str: for old, new in self.preprocess_replacements.items(): @@ -112,16 +108,9 @@ def find_feature_key(self, lines: list[TextLine], allowed_error_rate: float = 0. for line in lines: match = pattern.search(line.text) - if match: - # Make sure the key is not in the false positive list - is_fp_key = False - for fp_key in self.feature_fp_keys: - if fp_key in line.text: - is_fp_key = True - break - - if not is_fp_key: - matches.add(line) + if match and (not any(fp_key in line.text for fp_key in self.feature_fp_keys)): + # Check if there is a match and the matched string is not in the false positive list + matches.add(line) return list(matches) diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index dea40d9c..4700c73a 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -232,9 +232,13 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G if extracted_date_str: text = text.replace(extracted_date_str, "").strip() date = extracted_date - matched_lines_rect.append(line.rect) + matched_lines_rect.append( + line.rect + ) # Add the rectangle of the line to the matched lines list to make sure it is drawn + # in the output image. else: - # in case several dates are present, we skip the other dates + # If a second date is present in the lines around the groundwater key, then we skip this line, + # instead of potentially falsely extracting a depth value from the date. extracted_date, extracted_date_str = extract_date(text) if extracted_date_str: continue @@ -345,7 +349,8 @@ def get_groundwater_from_illustration( top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] ) - # remove the matched area from the result to avoid finding the same area again + # Remove the matched area from the template matching result to avoid finding the same area again + # for the same template x_area_to_remove = int(0.75 * template.shape[1]) y_area_to_remove = int(0.75 * template.shape[0]) result[ @@ -485,21 +490,14 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Groun page_number = page.number + 1 # NOTE: page.number is 0-based found_groundwater = self.get_groundwater_near_key(lines, page_number) - if not found_groundwater: - logger.info("No groundwater found near the key on page %s.", page_number) - - ### Extract groundwater from illustration - if self.is_searching_groundwater_illustration: - found_groundwater, confidence_list = self.get_groundwater_from_illustration( - lines, page_number, terrain_elevation - ) - if found_groundwater: - logger.info("Confidence list: %s", confidence_list) - logger.info( - "Found groundwater from illustration on page %s: %s", page_number, found_groundwater - ) - else: - logger.info("No groundwater illustration found on page %s.", page_number) + if not found_groundwater and self.is_searching_groundwater_illustration: + # Extract groundwater from illustration + found_groundwater, confidence_list = self.get_groundwater_from_illustration( + lines, page_number, terrain_elevation + ) + if found_groundwater: + logger.info("Confidence list: %s", confidence_list) + logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater) if terrain_elevation: # If the elevation is provided, calculate the depth of the groundwater From 8ceeeb287067a2db3ab2215eabcedf882f95d46e Mon Sep 17 00:00:00 2001 From: dcleres Date: Mon, 14 Oct 2024 18:58:45 +0200 Subject: [PATCH 12/14] Moved the template matching to a specific file to separate it from the rest of the groundwater detection --- .../data_extractor/data_extractor.py | 16 +- src/stratigraphy/data_extractor/utility.py | 36 +++ .../groundwater/groundwater_extraction.py | 190 +--------------- .../gw_illustration_template_matching.py | 212 ++++++++++++++++++ 4 files changed, 262 insertions(+), 192 deletions(-) create mode 100644 src/stratigraphy/data_extractor/utility.py create mode 100644 src/stratigraphy/groundwater/gw_illustration_template_matching.py diff --git a/src/stratigraphy/data_extractor/data_extractor.py b/src/stratigraphy/data_extractor/data_extractor.py index 4df348db..b52d6714 100644 --- a/src/stratigraphy/data_extractor/data_extractor.py +++ b/src/stratigraphy/data_extractor/data_extractor.py @@ -9,6 +9,7 @@ import fitz import regex +from stratigraphy.data_extractor.utility import get_lines_near_rect from stratigraphy.lines.line import TextLine from stratigraphy.util.util import read_params @@ -148,12 +149,11 @@ def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]: Returns: list[TextLine]: The lines close to the rectangle. """ - search_rect = fitz.Rect( - rect.x0 - self.search_left_factor * rect.width, - rect.y0 - self.search_above_factor * rect.height, - rect.x1 + self.search_right_factor * rect.width, - rect.y1 + self.search_below_factor * rect.height, + return get_lines_near_rect( + self.search_left_factor, + self.search_right_factor, + self.search_above_factor, + self.search_below_factor, + lines, + rect, ) - feature_lines = [line for line in lines if line.rect.intersects(search_rect)] - - return feature_lines diff --git a/src/stratigraphy/data_extractor/utility.py b/src/stratigraphy/data_extractor/utility.py new file mode 100644 index 00000000..1afbd0c0 --- /dev/null +++ b/src/stratigraphy/data_extractor/utility.py @@ -0,0 +1,36 @@ +"""Utility functions for the data extractor module.""" + +from fitz import Rect +from stratigraphy.lines.line import TextLine + + +def get_lines_near_rect( + search_left_factor: float, + search_right_factor: float, + search_above_factor: float, + search_below_factor: float, + lines: list[TextLine], + rect: Rect, +) -> list[TextLine]: + """Find the lines of the text that are close to a given rectangle. + + Args: + search_left_factor (float): The factor to search to the left of the rectangle. + search_right_factor (float): The factor to search to the right of the rectangle. + search_above_factor (float): The factor to search above the rectangle. + search_below_factor (float): The factor to search below the rectangle + lines (list[TextLine]): Arbitrary text lines to search in. + rect (fitz.Rect): The rectangle to search around. + + Returns: + list[TextLine]: The lines close to the rectangle. + """ + search_rect = Rect( + rect.x0 - search_left_factor * rect.width, + rect.y0 - search_above_factor * rect.height, + rect.x1 + search_right_factor * rect.width, + rect.y1 + search_below_factor * rect.height, + ) + feature_lines = [line for line in lines if line.rect.intersects(search_rect)] + + return feature_lines diff --git a/src/stratigraphy/groundwater/groundwater_extraction.py b/src/stratigraphy/groundwater/groundwater_extraction.py index 4700c73a..d548c77f 100644 --- a/src/stratigraphy/groundwater/groundwater_extraction.py +++ b/src/stratigraphy/groundwater/groundwater_extraction.py @@ -2,16 +2,13 @@ import abc import logging -import math import os from dataclasses import dataclass from datetime import date as dt from datetime import datetime -from pathlib import Path import fitz import numpy as np -import skimage as ski from stratigraphy.data_extractor.data_extractor import DataExtractor, ExtractedFeature from stratigraphy.groundwater.utility import extract_date, extract_depth, extract_elevation from stratigraphy.lines.line import TextLine @@ -292,185 +289,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G else: raise ValueError("Could not extract all required information from the lines provided.") - def load_templates(self) -> list[np.ndarray]: - """Load the templates for the groundwater information. - - Returns: - list[np.ndarray]: the loaded templates - """ - templates = [] - template_dir = os.path.join(os.path.dirname(__file__), "assets") - for template in os.listdir(template_dir): - if template.endswith(".npy"): - templates.append(np.load(os.path.join(template_dir, template))) - return templates - - def get_groundwater_from_illustration( - self, lines: list[TextLine], page_number: int, terrain_elevation: Elevation | None - ) -> list[GroundwaterInformationOnPage]: - """Extracts the groundwater information from an illustration. - - Args: - lines (list[TextLine]): the lines of text to extract the groundwater information from - page_number (int): the page number (1-based) of the PDF document - terrain_elevation (Elevation | None): The elevation of the terrain. - - Returns: - list[GroundwaterInformationOnPage]: the extracted groundwater information - """ - extracted_groundwater_list = [] - confidence_list = [] - - # convert the doc to an image - page = self.doc.load_page(page_number - 1) - filename = Path(self.doc.name).stem - png_filename = f"{filename}-{page_number + 1}.png" - png_path = f"/tmp/{png_filename}" # Local path to save the PNG - fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(png_path) - - # load the image - img = ski.io.imread(png_path) - N_BEST_MATCHES = 5 - TEMPLATE_MATCH_THRESHOLD = 0.66 - - # extract the groundwater information from the image - for template in self.load_templates(): - # Compute the match of the template and the image (correlation coef) - result = ski.feature.match_template(img, template) - - for _ in range(N_BEST_MATCHES): - ij = np.unravel_index(np.argmax(result), result.shape) - confidence = np.max(result) # TODO - use confidence to filter out bad matches - if confidence < TEMPLATE_MATCH_THRESHOLD: - # skip this template if the confidence is too low to avoid false positives - continue - top_left = (ij[1], ij[0]) - illustration_rect = fitz.Rect( - top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] - ) - - # Remove the matched area from the template matching result to avoid finding the same area again - # for the same template - x_area_to_remove = int(0.75 * template.shape[1]) - y_area_to_remove = int(0.75 * template.shape[0]) - result[ - int(illustration_rect.y0) - y_area_to_remove : int(illustration_rect.y1) + y_area_to_remove, - int(illustration_rect.x0) - x_area_to_remove : int(illustration_rect.x1) + x_area_to_remove, - ] = float("-inf") - - # convert the illustration_rect to the coordinate system of the PDF - horizontal_scaling = page.rect.width / img.shape[1] - vertical_scaling = page.rect.height / img.shape[0] - pdf_illustration_rect = fitz.Rect( - illustration_rect.x0 * horizontal_scaling, - illustration_rect.y0 * vertical_scaling, - illustration_rect.x1 * horizontal_scaling, - illustration_rect.y1 * vertical_scaling, - ) - - # extract the groundwater information from the image using the text - groundwater_info_lines = self.get_lines_near_rect(lines, pdf_illustration_rect) - - # sort the lines by their proximity to the key line center, compute the distance to the key line center - def distance_to_key_center(line_rect: fitz.Rect, illustration_rect: fitz.Rect) -> float: - key_center_x = (illustration_rect.x0 + illustration_rect.x1) / 2 - key_center_y = (illustration_rect.y0 + illustration_rect.y1) / 2 - line_center_x = (line_rect.x0 + line_rect.x1) / 2 - line_center_y = (line_rect.y0 + line_rect.y1) / 2 - return math.sqrt((line_center_x - key_center_x) ** 2 + (line_center_y - key_center_y) ** 2) - - groundwater_info_lines.sort(key=lambda line: distance_to_key_center(line.rect, pdf_illustration_rect)) - try: - extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page_number) - if extracted_gw.groundwater.depth or extracted_gw.groundwater.elevation: - # Fill in the depth and elevation if they are not already filled in based on the terrain - if terrain_elevation: - if not extracted_gw.groundwater.depth and extracted_gw.groundwater.elevation: - extracted_gw.groundwater.depth = round( - terrain_elevation.elevation - extracted_gw.groundwater.elevation, 2 - ) - if not extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: - extracted_gw.groundwater.elevation = round( - terrain_elevation.elevation - extracted_gw.groundwater.depth, 2 - ) - - # Make a sanity check to see if elevation and depth make sense (i.e., they add up: - # elevation + depth = terrain elevation) - if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: - extract_terrain_elevation = round( - extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 - ) - if extract_terrain_elevation != terrain_elevation.elevation: - # If the extracted elevation and depth do not match the terrain elevation, we try - # to remove one of the items from the match and see if we can find a better match. - logger.warning( - "The extracted elevation and depth do not match the terrain elevation." - ) - logger.warning( - "Elevation: %s, Depth: %s, Terrain Elevation: %s", - extracted_gw.groundwater.elevation, - extracted_gw.groundwater.depth, - terrain_elevation.elevation, - ) - - # re-run the extraction and see if we can find a better match by removing one - # item from the current match - groundwater_info_lines_without_depth = [ - line - for line in groundwater_info_lines - if str(extracted_gw.groundwater.depth) not in line.text - ] - groundwater_info_lines_without_elevation = [ - line - for line in groundwater_info_lines - if str(extracted_gw.groundwater.elevation) not in line.text - ] - extracted_gw = self.get_groundwater_info_from_lines( - groundwater_info_lines_without_depth, page_number - ) - - if not extracted_gw.groundwater.depth: - extracted_gw = self.get_groundwater_info_from_lines( - groundwater_info_lines_without_elevation, page_number - ) - - if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: - extract_terrain_elevation = round( - extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 - ) - - if extract_terrain_elevation != terrain_elevation.elevation: - logger.warning( - "The extracted elevation and depth do not match the terrain elevation." - ) - logger.warning( - "Elevation: %s, Depth: %s, Terrain Elevation: %s", - extracted_gw.groundwater.elevation, - extracted_gw.groundwater.depth, - terrain_elevation.elevation, - ) - continue - - # Only if the groundwater information is not already in the list - if extracted_gw not in extracted_groundwater_list and extracted_gw.groundwater.date: - extracted_groundwater_list.append(extracted_gw) - confidence_list.append(confidence) - - # Remove the extracted groundwater information from the lines to avoid double extraction - for line in groundwater_info_lines: - # if the rectangle of the line is in contact with the rectangle of the extracted - # groundwater information, remove the line - if line.rect.intersects(extracted_gw.rect): - lines.remove(line) - - except ValueError as error: - logger.warning("ValueError: %s", error) - continue - - # TODO: Maybe we could stop the search if we found a good match with one of the templates - - return extracted_groundwater_list, confidence_list - def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[GroundwaterInformationOnPage]: """Extracts the groundwater information from a borehole profile. @@ -491,9 +309,13 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Groun found_groundwater = self.get_groundwater_near_key(lines, page_number) if not found_groundwater and self.is_searching_groundwater_illustration: + from stratigraphy.groundwater.gw_illustration_template_matching import ( + get_groundwater_from_illustration, + ) + # Extract groundwater from illustration - found_groundwater, confidence_list = self.get_groundwater_from_illustration( - lines, page_number, terrain_elevation + found_groundwater, confidence_list = get_groundwater_from_illustration( + self, lines, page_number, terrain_elevation ) if found_groundwater: logger.info("Confidence list: %s", confidence_list) diff --git a/src/stratigraphy/groundwater/gw_illustration_template_matching.py b/src/stratigraphy/groundwater/gw_illustration_template_matching.py new file mode 100644 index 00000000..29eed3c0 --- /dev/null +++ b/src/stratigraphy/groundwater/gw_illustration_template_matching.py @@ -0,0 +1,212 @@ +"""This module contains the template matching code. + +The code in this file aims to extract groundwater information based on the location where +the groundwater illustration was found in the document of interest. +""" + +import logging +import math +import os +from pathlib import Path + +import fitz +import numpy as np +import skimage as ski +from stratigraphy.data_extractor.utility import get_lines_near_rect +from stratigraphy.groundwater.groundwater_extraction import GroundwaterInformationOnPage, GroundwaterLevelExtractor +from stratigraphy.lines.line import TextLine +from stratigraphy.metadata.elevation_extraction import Elevation + +logger = logging.getLogger(__name__) + + +def load_templates() -> list[np.ndarray]: + """Load the templates for the groundwater information. + + Returns: + list[np.ndarray]: the loaded templates + """ + templates = [] + template_dir = os.path.join(os.path.dirname(__file__), "assets") + for template in os.listdir(template_dir): + if template.endswith(".npy"): + templates.append(np.load(os.path.join(template_dir, template))) + return templates + + +def get_groundwater_from_illustration( + groundwater_extractor: GroundwaterLevelExtractor, + lines: list[TextLine], + page_number: int, + terrain_elevation: Elevation | None, +) -> list[GroundwaterInformationOnPage]: + """Extracts the groundwater information from an illustration. + + Args: + groundwater_extractor (GroundwaterLevelExtractor): the groundwater level extractor + lines (list[TextLine]): the lines of text to extract the groundwater information from + page_number (int): the page number (1-based) of the PDF document + terrain_elevation (Elevation | None): The elevation of the terrain. + + Returns: + list[GroundwaterInformationOnPage]: the extracted groundwater information + """ + extracted_groundwater_list = [] + confidence_list = [] + + # convert the doc to an image + page = groundwater_extractor.doc.load_page(page_number - 1) + filename = Path(groundwater_extractor.doc.name).stem + png_filename = f"{filename}-{page_number + 1}.png" + png_path = f"/tmp/{png_filename}" # Local path to save the PNG + fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(png_path) + + # load the image + img = ski.io.imread(png_path) + N_BEST_MATCHES = 5 + TEMPLATE_MATCH_THRESHOLD = 0.66 + + # extract the groundwater information from the image + for template in load_templates(): + # Compute the match of the template and the image (correlation coef) + result = ski.feature.match_template(img, template) + + for _ in range(N_BEST_MATCHES): + ij = np.unravel_index(np.argmax(result), result.shape) + confidence = np.max(result) # TODO - use confidence to filter out bad matches + if confidence < TEMPLATE_MATCH_THRESHOLD: + # skip this template if the confidence is too low to avoid false positives + continue + top_left = (ij[1], ij[0]) + illustration_rect = fitz.Rect( + top_left[0], top_left[1], top_left[0] + template.shape[1], top_left[1] + template.shape[0] + ) + + # Remove the matched area from the template matching result to avoid finding the same area again + # for the same template + x_area_to_remove = int(0.75 * template.shape[1]) + y_area_to_remove = int(0.75 * template.shape[0]) + result[ + int(illustration_rect.y0) - y_area_to_remove : int(illustration_rect.y1) + y_area_to_remove, + int(illustration_rect.x0) - x_area_to_remove : int(illustration_rect.x1) + x_area_to_remove, + ] = float("-inf") + + # convert the illustration_rect to the coordinate system of the PDF + horizontal_scaling = page.rect.width / img.shape[1] + vertical_scaling = page.rect.height / img.shape[0] + pdf_illustration_rect = fitz.Rect( + illustration_rect.x0 * horizontal_scaling, + illustration_rect.y0 * vertical_scaling, + illustration_rect.x1 * horizontal_scaling, + illustration_rect.y1 * vertical_scaling, + ) + + # extract the groundwater information from the image using the text + groundwater_info_lines = get_lines_near_rect( + groundwater_extractor.search_left_factor, + groundwater_extractor.search_right_factor, + groundwater_extractor.search_above_factor, + groundwater_extractor.search_below_factor, + lines, + pdf_illustration_rect, + ) + + # sort the lines by their proximity to the key line center, compute the distance to the key line center + def distance_to_key_center(line_rect: fitz.Rect, illustration_rect: fitz.Rect) -> float: + key_center_x = (illustration_rect.x0 + illustration_rect.x1) / 2 + key_center_y = (illustration_rect.y0 + illustration_rect.y1) / 2 + line_center_x = (line_rect.x0 + line_rect.x1) / 2 + line_center_y = (line_rect.y0 + line_rect.y1) / 2 + return math.sqrt((line_center_x - key_center_x) ** 2 + (line_center_y - key_center_y) ** 2) + + groundwater_info_lines.sort(key=lambda line: distance_to_key_center(line.rect, pdf_illustration_rect)) + try: + extracted_gw = groundwater_extractor.get_groundwater_info_from_lines( + groundwater_info_lines, page_number + ) + if extracted_gw.groundwater.depth or extracted_gw.groundwater.elevation: + # Fill in the depth and elevation if they are not already filled in based on the terrain + if terrain_elevation: + if not extracted_gw.groundwater.depth and extracted_gw.groundwater.elevation: + extracted_gw.groundwater.depth = round( + terrain_elevation.elevation - extracted_gw.groundwater.elevation, 2 + ) + if not extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extracted_gw.groundwater.elevation = round( + terrain_elevation.elevation - extracted_gw.groundwater.depth, 2 + ) + + # Make a sanity check to see if elevation and depth make sense (i.e., they add up: + # elevation + depth = terrain elevation) + if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extract_terrain_elevation = round( + extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 + ) + if extract_terrain_elevation != terrain_elevation.elevation: + # If the extracted elevation and depth do not match the terrain elevation, we try + # to remove one of the items from the match and see if we can find a better match. + logger.warning("The extracted elevation and depth do not match the terrain elevation.") + logger.warning( + "Elevation: %s, Depth: %s, Terrain Elevation: %s", + extracted_gw.groundwater.elevation, + extracted_gw.groundwater.depth, + terrain_elevation.elevation, + ) + + # re-run the extraction and see if we can find a better match by removing one + # item from the current match + groundwater_info_lines_without_depth = [ + line + for line in groundwater_info_lines + if str(extracted_gw.groundwater.depth) not in line.text + ] + groundwater_info_lines_without_elevation = [ + line + for line in groundwater_info_lines + if str(extracted_gw.groundwater.elevation) not in line.text + ] + extracted_gw = groundwater_extractor.get_groundwater_info_from_lines( + groundwater_info_lines_without_depth, page_number + ) + + if not extracted_gw.groundwater.depth: + extracted_gw = groundwater_extractor.get_groundwater_info_from_lines( + groundwater_info_lines_without_elevation, page_number + ) + + if extracted_gw.groundwater.elevation and extracted_gw.groundwater.depth: + extract_terrain_elevation = round( + extracted_gw.groundwater.elevation + extracted_gw.groundwater.depth, 2 + ) + + if extract_terrain_elevation != terrain_elevation.elevation: + logger.warning( + "The extracted elevation and depth do not match the terrain elevation." + ) + logger.warning( + "Elevation: %s, Depth: %s, Terrain Elevation: %s", + extracted_gw.groundwater.elevation, + extracted_gw.groundwater.depth, + terrain_elevation.elevation, + ) + continue + + # Only if the groundwater information is not already in the list + if extracted_gw not in extracted_groundwater_list and extracted_gw.groundwater.date: + extracted_groundwater_list.append(extracted_gw) + confidence_list.append(confidence) + + # Remove the extracted groundwater information from the lines to avoid double extraction + for line in groundwater_info_lines: + # if the rectangle of the line is in contact with the rectangle of the extracted + # groundwater information, remove the line + if line.rect.intersects(extracted_gw.rect): + lines.remove(line) + + except ValueError as error: + logger.warning("ValueError: %s", error) + continue + + # TODO: Maybe we could stop the search if we found a good match with one of the templates + + return extracted_groundwater_list, confidence_list From f0c68380057069a42807a343c2a996a5fc072019 Mon Sep 17 00:00:00 2001 From: dcleres Date: Thu, 17 Oct 2024 11:30:32 +0200 Subject: [PATCH 13/14] Address suggestion made during PR --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c2e2b4d0..dca84095 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ groundwater_illustration_matching = [ "scikit-image==0.24.0" ] -all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools]"] +all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools, groundwater_illustration_matching]"] [project.scripts] boreholes-extract-all = "stratigraphy.main:click_pipeline" From ed6b6fb24467f5fc0e6029af8200336c7048ca15 Mon Sep 17 00:00:00 2001 From: dcleres Date: Thu, 17 Oct 2024 13:16:39 +0200 Subject: [PATCH 14/14] Changed underscore with hyphens in the toml file --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dca84095..684fab22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,11 +54,11 @@ visualize = [ devtools = [ "tqdm" ] -groundwater_illustration_matching = [ +groundwater-illustration-matching = [ "scikit-image==0.24.0" ] -all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools, groundwater_illustration_matching]"] +all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools, groundwater-illustration-matching]"] [project.scripts] boreholes-extract-all = "stratigraphy.main:click_pipeline"