From 9997ead43860bc504c48bdb5f98a2003edef7b84 Mon Sep 17 00:00:00 2001 From: gcappaert Date: Thu, 4 May 2023 21:32:11 -0400 Subject: [PATCH 1/3] Add regex pattern to eliminate extra citation text --- scripts/03-parse-inspection-pdfs.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/03-parse-inspection-pdfs.py b/scripts/03-parse-inspection-pdfs.py index 89a9d1d840c..0d2a2ff96e0 100644 --- a/scripts/03-parse-inspection-pdfs.py +++ b/scripts/03-parse-inspection-pdfs.py @@ -184,6 +184,7 @@ def add_bolded(self, text: str) -> None: def add_narrative(self, text: str) -> None: assert self.heading assert self.desc + self.narrative += "\n" + text def to_dict(self) -> dict[str, typing.Union[str, bool]]: @@ -210,6 +211,8 @@ def get_report_body( for line_words in cluster_objects(words, "top", tolerance=0): first = line_words[0] text = " ".join(x["text"] for x in line_words) + + addl = text.lower().strip(":") in [ "additional inspectors", # Generic edge-case "direct", # Specific edge-case from hash_id:0db69ec135a5b244 @@ -229,6 +232,11 @@ def get_report_body( full_text.append(cropped.extract_text().strip()) + if len(citations): + conclusion_patterns = r"(.*(exit interview|exit.*conducted|additional.*inspectors|end section|exit.*[facility representative | licensee]))" + res = re.search(conclusion_patterns, citations[-1].narrative.lower()) + if res: + citations[-1].narrative = citations[-1].narrative[0:res.start()] return ([v.to_dict() for v in citations], "\n\n".join(full_text)) @@ -359,13 +367,13 @@ def parse(pdf: pdfplumber.pdf.PDF) -> dict[str, typing.Any]: def parse_all(overwrite: bool = False, start: typing.Optional[int] = 0) -> None: - paths = sorted(Path("pdfs/inspections/").glob("*.pdf")) + paths = sorted(Path("../pdfs/inspections/").glob("*.pdf")) start_int = start or 0 for i, path in enumerate(paths): if i < start_int: continue - dest = Path(f"data/parsed/inspections/{path.stem}.json") + dest = Path(f"../data/parsed/inspections/{path.stem}.json") if dest.exists() and not overwrite: continue From c158e614b533bde809856963c7296c14c1637352 Mon Sep 17 00:00:00 2001 From: gcappaert Date: Thu, 4 May 2023 21:41:39 -0400 Subject: [PATCH 2/3] Tested and seems to be working --- scripts/03-parse-inspection-pdfs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/03-parse-inspection-pdfs.py b/scripts/03-parse-inspection-pdfs.py index 0d2a2ff96e0..aaaa7274bc5 100644 --- a/scripts/03-parse-inspection-pdfs.py +++ b/scripts/03-parse-inspection-pdfs.py @@ -367,13 +367,13 @@ def parse(pdf: pdfplumber.pdf.PDF) -> dict[str, typing.Any]: def parse_all(overwrite: bool = False, start: typing.Optional[int] = 0) -> None: - paths = sorted(Path("../pdfs/inspections/").glob("*.pdf")) + paths = sorted(Path("/pdfs/inspections/").glob("*.pdf")) start_int = start or 0 for i, path in enumerate(paths): if i < start_int: continue - dest = Path(f"../data/parsed/inspections/{path.stem}.json") + dest = Path(f"/data/parsed/inspections/{path.stem}.json") if dest.exists() and not overwrite: continue From d3dffc034e2bfad5b7d72679c65248c759247620 Mon Sep 17 00:00:00 2001 From: gcappaert Date: Thu, 4 May 2023 21:43:51 -0400 Subject: [PATCH 3/3] Fixed filepaths --- scripts/03-parse-inspection-pdfs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/03-parse-inspection-pdfs.py b/scripts/03-parse-inspection-pdfs.py index aaaa7274bc5..460917ba4e5 100644 --- a/scripts/03-parse-inspection-pdfs.py +++ b/scripts/03-parse-inspection-pdfs.py @@ -367,13 +367,13 @@ def parse(pdf: pdfplumber.pdf.PDF) -> dict[str, typing.Any]: def parse_all(overwrite: bool = False, start: typing.Optional[int] = 0) -> None: - paths = sorted(Path("/pdfs/inspections/").glob("*.pdf")) + paths = sorted(Path("pdfs/inspections/").glob("*.pdf")) start_int = start or 0 for i, path in enumerate(paths): if i < start_int: continue - dest = Path(f"/data/parsed/inspections/{path.stem}.json") + dest = Path(f"data/parsed/inspections/{path.stem}.json") if dest.exists() and not overwrite: continue