generated from MuckRock/documentcloud-hello-world-addon
-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
60 lines (56 loc) · 2.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
This is an Add-On that uses docTR (https://github.com/mindee/doctr) to OCR documents for DocumentCloud.
"""
import tempfile
from pathlib import Path

from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from documentcloud.addon import AddOn
class docTR(AddOn):
    """Add-On that OCRs DocumentCloud documents with docTR and uploads the result."""

    def main(self):
        """OCR every selected document and PATCH its page text and word positions.

        For each document returned by ``self.get_documents()`` the PDF bytes
        are written to a temporary file, run through the docTR predictor, and
        the exported pages are converted and sent to ``documents/<id>/``.

        Raises:
            Whatever ``resp.raise_for_status()`` raises when the PATCH request
            is not successful.
        """
        # Build the predictor once; it is reused for every document.
        model = ocr_predictor(
            'db_resnet50_rotation',
            'crnn_vgg16_bn',
            pretrained=True,
            assume_straight_pages=False,
            export_as_straight_boxes=True,
        )
        for document in self.get_documents():
            # A temporary directory keeps the download out of the working
            # directory, avoids the stray quote characters the old file name
            # contained, and guarantees the PDF is removed after OCR.
            with tempfile.TemporaryDirectory() as tmp_dir:
                pdf_path = Path(tmp_dir) / f"{document.id}.pdf"
                pdf_path.write_bytes(document.pdf)
                doc = DocumentFile.from_pdf(str(pdf_path))
                result = model(doc)
            json_export = result.export()
            pages = [self._convert_page(page) for page in json_export['pages']]
            resp = self.client.patch(f"documents/{document.id}/", json={"pages": pages})
            resp.raise_for_status()

    @staticmethod
    def _convert_page(page):
        """Convert one page of a docTR export into a DocumentCloud page payload.

        Args:
            page: One entry of ``result.export()['pages']``; read keys are
                ``page_idx`` and ``blocks`` -> ``lines`` -> ``words``, where
                each word has ``value`` and ``geometry``.

        Returns:
            A dict with ``page_number``, ``text``, ``ocr`` and ``positions``.
            Text has one line per OCR line and a blank line after each block.
        """
        positions = []
        text_parts = []
        for block in page['blocks']:
            for line in block['lines']:
                words = line['words']
                for word in words:
                    value = word['value']
                    if not value:
                        # Empty words contribute no position entry.
                        continue
                    # assumes geometry is ((x1, y1), (x2, y2)), as expected
                    # with export_as_straight_boxes=True -- TODO confirm
                    geometry = word['geometry']
                    top_left, bottom_right = geometry[0], geometry[1]
                    positions.append(
                        {
                            "text": value,
                            "x1": float(top_left[0]),
                            "x2": float(bottom_right[0]),
                            "y1": float(top_left[1]),
                            "y2": float(bottom_right[1]),
                        }
                    )
                line_text = ' '.join(word['value'] for word in words)
                text_parts.append(line_text.strip() + '\n')
            # Blank line separates consecutive blocks in the page text.
            text_parts.append('\n')
        return {
            "page_number": page['page_idx'],
            "text": ''.join(text_parts),
            "ocr": "doctr",
            "positions": positions,
        }
if __name__ == "__main__":
    # Script entry point: instantiate the Add-On and run it.
    addon = docTR()
    addon.main()