-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathextraction.py
59 lines (46 loc) · 1.71 KB
/
extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""This module consists of:
* Collect the split PDF files.
* Extract their text and save it as text files in the output dir.
"""
import os
from io import StringIO
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def pdf_to_text(path):
    """Extract the text of a PDF document.

    Every page returned by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into a single in-memory buffer.

    Args:
        path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The accumulated text of all processed pages as one string.

    Raises:
        OSError: If ``path`` cannot be opened.
        Errors raised by pdfminer while parsing propagate unchanged.
    """
    manager = PDFResourceManager()
    layout = LAParams(all_texts=False, detect_vertical=True)
    # The buffer and the converter are released even when parsing fails;
    # previously an exception skipped both close() calls.
    with StringIO() as retstr:
        device = TextConverter(manager, retstr, laparams=layout)
        try:
            interpreter = PDFPageInterpreter(manager, device)
            with open(path, 'rb') as pdf_file:
                for page in PDFPage.get_pages(pdf_file,
                                              check_extractable=True):
                    interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
def _page_sort_key(file_name):
    """Return a sort key ordering file names by the digits they contain."""
    digits = re.sub(r'\D', '', file_name)
    if digits:
        return (0, int(digits))
    # Names without any digit (e.g. hidden system files) used to raise
    # ValueError in int(''); they are now sorted after the numbered files.
    return (1, 0)


def extraction(split_path, text_path):
    """Extract text from the split PDF files and save it to the output dir.

    Each sub-directory of ``split_path`` is treated as one entry. The files
    inside an entry are processed in numeric order of the digits in their
    names, and the extracted text is appended to
    ``<text_path>/<entry>.txt``.

    Args:
        split_path: Directory containing one sub-directory per entry.
        text_path: Existing directory that receives one ``.txt`` file per
            entry.
    """
    for entry in os.listdir(split_path):
        entry_dir = os.path.join(split_path, entry)
        # Skip stray regular files: os.listdir() on them would raise
        # NotADirectoryError.
        if not os.path.isdir(entry_dir):
            continue

        # Files must be processed in page order, not directory order.
        file_names = sorted(os.listdir(entry_dir), key=_page_sort_key)
        if not file_names:
            # Nothing to extract; do not create an empty output file.
            continue

        # Open the entry's output once instead of once per page. Append mode
        # keeps whatever a previous run already wrote to the file.
        with open(os.path.join(text_path, f"{entry}.txt"),
                  "a",
                  encoding="utf-8") as text_file:
            for file_name in file_names:
                text_file.write(
                    pdf_to_text(os.path.join(entry_dir, file_name)))