-
Notifications
You must be signed in to change notification settings - Fork 12
/
headings.py
executable file
·147 lines (128 loc) · 4.27 KB
/
headings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#! /usr/bin/env python
# -*- coding: utf-8 -*-
""" Extract and tag References from a PDF.
Created on Mar 1, 2010
@author: John Harrison
Usage: headings.py OPTIONS FILEPATH
OPTIONS:
--help, -h Print help and exit
--noxml Do not tag individual headings with XML tags.
Default is to include tagging.
--title Only print title then exit
--author Only print author then exit
"""
import sys, getopt
from lxml import etree
from utils import UsageError, ConfigError, mean, median
from pdf2xml import pdf2etree
def pdf2heads(opts, args):
    """Print the heading-like text of a PDF to stdout, one block per line.

    The PDF is converted to an XML tree with ``pdf2etree``.  Within every
    BLOCK element, TOKEN elements are treated as heading text when their
    ``font-size`` exceeds 1.05 times the mean font size of all tokens, when
    ``bold`` is ``'yes'``, or when their ``font-color`` differs from the most
    frequent colour in the document.  The selected tokens of a block are
    joined with single spaces and emitted as one line.

    The title is taken to be the first BLOCK of page 1 (falling back to
    page 2), and the author the first of BLOCK 2, 3 or 4 on page 1 that
    exists.

    Args:
        opts: iterable of ``(option, value)`` pairs.  Recognised options are
            ``--noxml`` (no XML tags), ``--highlight`` (wrap the tags in
            green ANSI escapes), ``--title`` (stop after the title block) and
            ``--author`` (skip the title block, stop after the author block).
            Any other option is ignored.
        args: passed unchanged to ``pdf2etree``.

    Returns:
        None.  Output is written to ``sys.stdout``.
    """
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for opt, _ in opts:
        if opt == '--noxml':
            xmltag = False
        elif opt == '--highlight':
            highlight = True
        elif opt == '--title':
            titleonly = True
        elif opt == '--author':
            authonly = True

    tree = pdf2etree(args)

    # Title: first block of page 1; if that page has none, try page 2 and
    # then give up.
    title_node = None
    for page in (1, 2):
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, 1))
        if found:
            title_node = found[0]
            break

    # Author: first of blocks 2..4 on page 1 that exists.
    auth_node = None
    for block in (2, 3, 4):
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(1, block))
        if found:
            auth_node = found[0]
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    # Most frequent font colour; ties go to the largest colour string, which
    # is what sorting (count, colour) pairs in reverse order yields.
    color_counts = {}
    for color in tree.xpath('//TOKEN/@font-color'):
        color_counts[color] = color_counts.get(color, 0) + 1
    if color_counts:
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        # No token carries a colour, so the colour test below can never
        # match; any placeholder value works.
        main_font_color = ''

    # The predicate is the same for every block, so build it once.
    token_query = (
        ".//TOKEN[@font-size > {0} or @bold = 'yes' "
        "or @font-color != '{1}']"
    ).format(mean_font_size * 1.05, main_font_color)

    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st = "<title>"
                    et = "</title>"
                elif block_node == auth_node:
                    st = "<author>"
                    et = "</author>"
                else:
                    st = "<heading>"
                    et = "</heading>"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            pieces = []
            for el in block_node.xpath(token_query):
                text = etree.tostring(el, method='text', encoding="UTF-8")
                # Where tostring() returns bytes distinct from str, decode so
                # the join below works; otherwise leave the value untouched.
                if not isinstance(text, str):
                    text = text.decode("UTF-8")
                pieces.append(text)
            head_txt = ' '.join(pieces)
            if head_txt:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            # Stop once the requested block has been processed, whether or
            # not it produced any text.
            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break

    for txt in head_txts:
        sys.stdout.write(txt + '\n')
def main(argv=None):
    """Command-line entry point: parse options and run ``pdf2heads``.

    Args:
        argv: list of command-line arguments without the program name.
            Defaults to ``sys.argv[1:]`` when None.

    Returns:
        0 after printing the help text, 2 on a usage error, 1 on a
        configuration error, and None after a normal run (which
        ``sys.exit`` treats as success).
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(
                argv, "ht",
                ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            raise UsageError(msg)
        for opt, _ in opts:
            if opt in ('-h', '--help'):
                # __doc__ is None when docstrings are stripped (python -OO);
                # fall back to a one-line usage string instead of crashing.
                sys.stdout.write(
                    __doc__ or "Usage: headings.py OPTIONS FILEPATH\n")
                sys.stdout.flush()
                return 0
        pdf2heads(opts, args)
    except UsageError as err:
        sys.stderr.write("{0}\n".format(err.msg))
        sys.stderr.write("for help use --help\n")
        return 2
    except ConfigError as err:
        sys.stderr.write("{0}\n".format(err.msg))
        sys.stderr.flush()
        return 1
if __name__ == '__main__':
    # Executed as a script: run the command-line entry point and use its
    # return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)