From e22d2b76d2fe6f3d1b0b27f65684c4053c1c5910 Mon Sep 17 00:00:00 2001 From: Sanjin <102841251+duckduckgrayduck@users.noreply.github.com> Date: Wed, 20 Sep 2023 10:53:58 -0500 Subject: [PATCH] First pass at updating positions with docTR --- main.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index fb94783..6b8c2aa 100644 --- a/main.py +++ b/main.py @@ -19,17 +19,45 @@ def main(self): doc = DocumentFile.from_pdf(pdf_name) result = model(doc) json_export = result.export() + pages = [] for page in json_export['pages']: page_idx = page['page_idx'] + text = '' + dc_page = { + "page_number": page_idx, + "text": text, + "ocr": "docTR", + "positions": [], + } print(f"Page {page_idx}:") for block in page['blocks']: for line in block['lines']: + line_text = "" for word in line['words']: + line_text += word['value'] + ' ' word_value = word['value'] word_bounding_box = word['geometry'] print(f"Word: {word_value}") print(f"Bounding Box: {word_bounding_box}") - + + x1 = word_bounding_box[0][0] + y1 = word_bounding_box[0][1] + x2 = word_bounding_box[1][0] + y2 = word_bounding_box[1][1] + position_info = { + "text": word_value, + "x1": x1, + "x2": x2, + "y1": y1, + "y2": y2, + } + dc_page["positions"].append(position_info) + text += line_text.strip() + '\n' + text += '\n' + dc_page['text'] = text + pages.append(dc_page) + resp = self.client.patch(f"documents/{document.id}/", json={"pages": pages}) + resp.raise_for_status() if __name__ == "__main__": docTR().main()