diff --git a/scripts/03-parse-inspection-pdfs.py b/scripts/03-parse-inspection-pdfs.py index 89a9d1d840c..460917ba4e5 100644 --- a/scripts/03-parse-inspection-pdfs.py +++ b/scripts/03-parse-inspection-pdfs.py @@ -184,6 +184,7 @@ def add_bolded(self, text: str) -> None: def add_narrative(self, text: str) -> None: assert self.heading assert self.desc + self.narrative += "\n" + text def to_dict(self) -> dict[str, typing.Union[str, bool]]: @@ -210,6 +211,8 @@ def get_report_body( for line_words in cluster_objects(words, "top", tolerance=0): first = line_words[0] text = " ".join(x["text"] for x in line_words) + + addl = text.lower().strip(":") in [ "additional inspectors", # Generic edge-case "direct", # Specific edge-case from hash_id:0db69ec135a5b244 @@ -229,6 +232,11 @@ def get_report_body( full_text.append(cropped.extract_text().strip()) + if len(citations): + conclusion_patterns = r"(.*(exit interview|exit.*conducted|additional.*inspectors|end section|exit.*(?:facility representative|licensee)))" + res = re.search(conclusion_patterns, citations[-1].narrative.lower()) + if res: + citations[-1].narrative = citations[-1].narrative[0:res.start()] return ([v.to_dict() for v in citations], "\n\n".join(full_text))