-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprepare.py
68 lines (59 loc) · 2.32 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import argparse
import subprocess
import os
import shutil
import re
def cut_doc_outline(file_):
    """Truncate an HTML file at its "Document Outline" heading.

    The file is copied line by line into ``<file_>.tmp``. When a line equal
    to the outline heading is met, closing ``</body>`` and ``</html>`` tags
    are written in its place and the remainder of the input is dropped. The
    temporary file then replaces ``file_``. If the heading never appears,
    the content is copied through unchanged.

    Args:
        file_: Path of the HTML file to rewrite in place.
    """
    outline_heading = "<a name=\"outline\"></a><h1>Document Outline</h1>\n"
    aux_file = "%s.tmp" % file_
    # Both handles live in one ``with`` so neither leaks if reading or
    # writing raises part-way through. The input is opened first so that a
    # missing input file does not leave an empty ``.tmp`` file behind.
    with open(file_, 'r') as src, open(aux_file, 'w') as aux:
        # Iterate the file object directly rather than ``readlines()`` so
        # the whole document is never held in memory at once.
        for line in src:
            if line == outline_heading:
                aux.write("</body>\n")
                aux.write("</html>\n")
                break
            aux.write(line)
    shutil.move(aux_file, file_)
def process_pdf(party, pdf):
    """Split a PDF into single pages and convert the PDFs to HTML.

    The PDF is copied into ``build/<party>`` and split with ``pdfseparate``
    into ``<basename>-<n>.pdf`` files. Every ``.pdf`` file found in that
    directory is then converted: names matching the one-page pattern go
    through ``pdf2htmlEX``; any other PDF goes through ``pdftohtml`` and has
    its "Document Outline" section removed with ``cut_doc_outline``.

    The exit status of the external tools is not inspected; the only check
    is that the expected HTML file exists after each conversion.

    Args:
        party: Party name, used as the sub-directory under ``build/``.
        pdf: Path of the PDF file to process.

    Raises:
        AssertionError: If the HTML file expected for a PDF does not exist
            after its conversion.
    """
    path = 'build/' + party
    if not os.path.exists(path):
        os.makedirs(path)
    filename = os.path.basename(pdf)
    basename = os.path.splitext(filename)[0]
    copied_pdf = path + '/' + filename
    # Copy the file named by the ``pdf`` parameter; reading the module-level
    # ``args`` here made the function unusable with any other argument.
    shutil.copy(pdf, copied_pdf)
    subprocess.call(['pdfseparate', copied_pdf,
                     path + "/" + basename + "-%d.pdf"])
    one_page = re.compile(r'.*-\d+')
    # os.listdir returns a list, so files created by the converters during
    # the loop are not visited.
    for f in os.listdir(path):
        b, ext = os.path.splitext(f)
        if ext != ".pdf":
            continue
        html = "%s/%s.html" % (path, b)
        pdf_path = "%s/%s" % (path, f)
        if one_page.match(b):
            # NOTE(review): any name containing "-<digits>" matches, not
            # only the pages produced by pdfseparate above -- confirm that
            # input names never look like that.
            subprocess.call(['pdf2htmlEX', "--tounicode", "1",
                             "--decompose-ligature", "1",
                             "--embed-external-font", "0",
                             "--printing", "0",
                             "--process-outline", "0",
                             "--process-nontext", "0",
                             "--optimize-text", "1",
                             "--fit-width", "680",
                             pdf_path, html])
        else:
            subprocess.call(['pdftohtml', '-s', '-i', '-noframes',
                             '-nomerge', '-enc', 'UTF-8',
                             pdf_path])
            cut_doc_outline(html)
        assert os.path.exists(html), "No exists: %s" % html
def parse_args():
    """Parse the command-line arguments and echo them to stdout.

    Two positional arguments are expected: the party name and the PDF file.

    Returns:
        The ``argparse`` namespace with string attributes ``party`` and
        ``pdf``.
    """
    desc = ("Split a PDF in pages and transform the pages and the "
            "complete PDF in HTML")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('party', type=str, help='The Party name')
    parser.add_argument('pdf', type=str, help='The PDF file')
    args = parser.parse_args()
    # A single parenthesised expression prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("party: %s pdf: %s" % (args.party, args.pdf))
    return args
# Script entry: read the command line into the module-level ``args``
# namespace, then run the conversion for the requested party and PDF.
args = parse_args()
process_pdf(party=args.party, pdf=args.pdf)