Skip to content

Commit

Permalink
First pass at updating positions with docTR
Browse files Browse the repository at this point in the history
  • Loading branch information
duckduckgrayduck authored Sep 20, 2023
1 parent f56eae7 commit e22d2b7
Showing 1 changed file with 29 additions and 1 deletion.
30 changes: 29 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,45 @@ def main(self):
doc = DocumentFile.from_pdf(pdf_name)
result = model(doc)
json_export = result.export()
pages = []
for page in json_export['pages']:
page_idx = page['page_idx']
text = ''
dc_page = {
"page_number": page_idx,
"text": text,
"ocr": "docTR",
"positions": [],
}
print(f"Page {page_idx}:")
for block in page['blocks']:
for line in block['lines']:
line_text = ""
for word in line['words']:
line_text += word['value'] + ' '
word_value = word['value']
word_bounding_box = word['geometry']
print(f"Word: {word_value}")
print(f"Bounding Box: {word_bounding_box}")


x1 = word_bounding_box[0][0]
y1 = word_bounding_box[0][1]
x2 = word_bounding_box[1][0]
y2 = word_bounding_box[1][1]
position_info = {
"text": word_value,
"x1": x1,
"x2": x2,
"y1": y1,
"y2": y2,
}
dc_page["positions"].append(position_info)
text += line_text.strip() + '\n'
text += '\n'
dc_page['text'] = text
pages.append(dc_page)
resp = self.client.patch(f"documents/{document.id}/", json={"pages": pages})
resp.raise_for_status()

# Script entry point: runs only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    runner = docTR()
    runner.main()

0 comments on commit e22d2b7

Please sign in to comment.