-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathpdf.py
306 lines (281 loc) · 13.6 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# -*- coding: utf8 -*-
import os
import re
import sqlite3
import argparse
import sys
import glob
from collections import OrderedDict
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTTextLineHorizontal, LTChar, LTLine, \
LTText
def main(argv):
def create_database(output_folder):
    """Create (or reset) the SQLite database that receives the parsed data.

    Despite its name, ``output_folder`` is the path of the database *file*.
    The directory containing it is created when missing. Every table of the
    schema is dropped and re-created, so rows already stored in an existing
    file are discarded.

    Returns the open ``sqlite3.Connection``; committing and closing it is
    left to the caller.
    """
    # (table name, column definitions), in creation order.
    schema = (
        ('Title', 'TitleId INTEGER PRIMARY KEY NOT NULL,Title VARCHAR'),
        ('Person', 'PersonId INTEGER PRIMARY KEY NOT NULL,Name VARCHAR,'
                   'Surname VARCHAR'),
        ('Experience', 'ExperienceID INTEGER PRIMARY KEY NOT NULL,'
                       'PersonID INTEGER,CompanyID INTEGER,TitleID INTEGER,'
                       'StartMonth VARCHAR,StartYear VARCHAR,'
                       'EndMonth VARCHAR,EndYear VARCHAR,Ongoing INTEGER'),
        ('Company', 'CompanyId INTEGER PRIMARY KEY NOT NULL,'
                    'CompanyName VARCHAR'),
        ('Major', 'MajorId INTEGER PRIMARY KEY NOT NULL,Major VARCHAR'),
        ('School', 'SchoolId INTEGER PRIMARY KEY NOT NULL,School VARCHAR'),
        ('Education', 'EducationId INTEGER PRIMARY KEY NOT NULL,'
                      'PersonID INTEGER,DegreeId INTEGER,SchoolId INTEGER,'
                      'MajorId INTEGER,StartMonth VARCHAR,StartYear VARCHAR,'
                      'EndMonth VARCHAR,EndYear VARCHAR,Ongoing INTEGER'),
        ('Degree', 'DegreeId INTEGER PRIMARY KEY NOT NULL,Degree VARCHAR'),
    )

    # Resolve the path first: a bare file name has an empty dirname, and
    # os.makedirs('') raises instead of being a no-op.
    db_path = os.path.abspath(output_folder)
    parent = os.path.dirname(db_path)
    if not os.path.isdir(parent):
        os.makedirs(parent)

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        for table, columns in schema:
            c.execute('DROP TABLE IF EXISTS {}'.format(table))
            c.execute('CREATE TABLE {}\n({})'.format(table, columns))
    except sqlite3.Error:
        # Do not leak the connection when the schema cannot be created.
        conn.close()
        raise
    return conn
def getfilelist(path, extension=None):
    """Return the paths of the files located directly inside ``path``.

    Sub-directories are not descended into. When ``extension`` is truthy,
    only names ending with it are kept; otherwise every file is returned.
    An empty list is returned when ``os.walk`` yields nothing for ``path``
    (for instance because the directory does not exist).
    """
    if isinstance(path, bytes):
        # Byte-string path (the Python 2 ``str``): decode it once so that the
        # listing and the joined results are both unicode. Joining the raw
        # byte path with unicode names would trigger an implicit ASCII decode.
        path = path.decode('utf-8')

    # The first os.walk() item describes ``path`` itself as
    # (dirpath, dirnames, filenames); the default avoids StopIteration.
    _, _, names = next(os.walk(path), (path, [], []))
    return [os.path.join(path, fname) for fname in names
            if not extension or fname.endswith(extension)]
def insert(table, column, row_value, c):
    """Return the id of the ``table`` row whose ``column`` equals ``row_value``.

    The lookup is executed through the database cursor ``c``. When a row
    matches, the first column of the first matching row is returned. When
    none does, a new row holding ``row_value`` is inserted and the id
    assigned to it is returned instead.
    """
    lookup = "SELECT * FROM {} WHERE {} = ?".format(table, column)
    existing = c.execute(lookup, [row_value]).fetchone()
    if existing is not None:
        return existing[0]
    c.execute("INSERT INTO {} VALUES (NULL,?)".format(table), [row_value])
    return c.lastrowid
def parse_date(dates):
    """Parse a date range such as ``'January 2010 - Present'``.

    Takes a string and returns a dictionary:
    ``{'from_month': '', 'from_year': '', 'to_month': '', 'to_year': ''}``.

    The string is split on ``'-'``. Unless that gives exactly two sides,
    every field stays empty. A side made of one word is stored as the year,
    a side made of two words as month and year, and anything else leaves the
    fields of that side empty. Words may be separated by any run of
    whitespace.
    """
    ret = {'from_month': '', 'from_year': '', 'to_month': '', 'to_year': ''}
    sides = dates.strip().split('-')
    if len(sides) != 2:
        return ret
    for prefix, side in zip(('from', 'to'), sides):
        # split() without argument collapses repeated blanks, which
        # split(' ') turned into empty tokens that broke the word count.
        words = side.split()
        if len(words) == 1:
            ret[prefix + '_year'] = words[0]
        elif len(words) == 2:
            ret[prefix + '_month'], ret[prefix + '_year'] = words
    return ret
def get_chars(line, with_anno=True):
    """Return the items obtained by iterating ``line`` as a list.

    ``line`` is expected to be a pdfminer text line (LTTextLine). With
    ``with_anno`` left to True every item is kept; with ``with_anno=False``
    the ``LTAnno`` items are left out.
    """
    if with_anno:
        return list(line)
    # Imported locally: the module imports selected names from
    # pdfminer.layout but never binds the ``pdfminer`` package itself, so the
    # previous ``pdfminer.layout.LTAnno`` lookup raised NameError.
    from pdfminer.layout import LTAnno
    return [char for char in line if not isinstance(char, LTAnno)]
def isiterable(obj):
    """Tell whether ``obj`` is a string or exposes an ``__iter__`` attribute.

    Returns a real boolean; callers only rely on its truthiness.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # the basestring builtin does not exist
    return isinstance(obj, string_types) or bool(getattr(obj, '__iter__', False))
def get_objects(layout):
    """Flatten ``layout`` by exactly one level and return the result as a list.

    Items that ``isiterable`` reports as iterable are replaced by their own
    elements; every other item is kept as is. Nested elements are not
    expanded any further.
    """
    flattened = []
    for item in layout:
        if not isiterable(item):
            flattened.append(item)
            continue
        flattened.extend(item)
    return flattened
def get_data(objs, name):
    """Collect the objects belonging to the section whose heading holds ``name``.

    Takes a list of LTObjects, returns a list of LTObjects.

    The section starts after the first LTTextLineHorizontal whose text
    contains ``name`` and whose first character is larger than ``FONTSIZE``.
    It stops before the first LTLine that follows that heading. An empty
    list is returned when no such heading exists or when no LTLine comes
    after it.
    """
    FONTSIZE = 17  # heading's font size is 17.85
    start = None  # None rather than 0, so a heading at index 0 still counts
    for idx, obj in enumerate(objs):
        if start is None:
            if (isinstance(obj, LTTextLineHorizontal)
                    and name in obj.get_text()
                    and get_chars(obj)[0].size > FONTSIZE):
                start = idx
        elif isinstance(obj, LTLine):
            # First line object after the heading closes the section.
            return objs[start + 1:idx]
    return []
def get_name(objs):
    """Extract the person's name from a list of LTObjects.

    A text line qualifies when it contains a character larger than
    ``NAME_FONTSIZE``. If several lines qualify, the last one is used.

    Returns ``[name, surname]`` as UTF-8 encoded byte strings: the first
    word of the line and the remaining words joined by single spaces. The
    surname is an empty byte string (never ``False``) when the line holds a
    single word, and both items are empty when no line qualifies.
    """
    NAME_FONTSIZE = 23
    name = u''
    for obj in objs:
        if not isinstance(obj, LTTextLine):
            continue
        if any(isinstance(char, LTChar) and char.size > NAME_FONTSIZE
               for char in obj):
            name = obj.get_text()
    # Byte-string separators keep the split/join valid on the encoded value
    # whether ``str`` is a byte string or not.
    words = name.encode('utf-8').strip().split(b' ')
    return [words[0], b' '.join(words[1:])]
def get_experience_info(objs):
    """Extract the positions listed in the experience section.

    Takes a list of LTObjects and returns a list of
    ``[title, company, {'from_month': '', 'from_year': '', 'to_month': '',
    'to_year': ''}]`` entries.

    An entry is produced for every object that directly follows a bold
    header (first character larger than ``FONTSIZE``) and whose text holds a
    parenthesised part. The text before the parentheses is parsed as the
    dates. The header is split on ``' at '`` into title and company; both
    stay empty unless the split gives exactly two parts.
    """
    FONTSIZE = 13.4  # fontsize of bold headers
    entries = []
    for idx in range(1, len(objs)):
        header_obj, dates_obj = objs[idx - 1], objs[idx]
        if not get_chars(header_obj)[0].size > FONTSIZE:
            continue
        text = dates_obj.get_text()
        brackets = re.search('([(]+(.)*[)]+)', text)
        if not brackets:
            continue
        title = company = ''
        header = header_obj.get_text().split(' at ')
        if len(header) == 2:
            title, company = header[0].strip(), header[1].strip()
        entries.append([title, company, parse_date(text[:brackets.start()])])
    return entries
def get_education_info(objs):
    """Extract the schools listed in the education section.

    Takes a list of LTObjects and returns a list of
    ``[school, degree, major, {'from_month': '', 'from_year': '',
    'to_month': '', 'to_year': ''}]`` entries.

    Every object whose first character is larger than ``FONTSIZE`` is taken
    as a school header. The text of the object right after it is split on
    commas and read as follows:

    * three fields or more: degree, major (the middle fields), dates;
    * two fields: major, dates;
    * one field: dates.
    """
    FONTSIZE = 13.4  # fontsize of bold headers
    ret = []
    for idx, obj in enumerate(objs):
        if not get_chars(obj)[0].size > FONTSIZE:
            continue
        # Start every school from a clean slate so that nothing leaks in from
        # the previous entry when the detail line has fewer fields.
        degree = major = ''
        dates = {'from_month': '', 'from_year': '', 'to_month': '', 'to_year': ''}
        school = obj.get_text()
        try:
            next_object = objs[idx + 1].get_text()
        except (IndexError, AttributeError):
            # Header is the last object, or is followed by a non-text object.
            next_object = ''
        if next_object:
            second_line = next_object.split(',')
            if len(second_line) >= 3:
                degree = second_line[0].strip()
                major = ' '.join(second_line[1:-1]).strip()
                dates = parse_date(second_line[-1])
            elif len(second_line) == 2:
                major = second_line[0].strip()
                dates = parse_date(second_line[1])
            else:
                dates = parse_date(second_line[0])
        ret.append([school, degree, major, dates])
    return ret
# Body of main(argv). ``argv`` is the parsed command line: ``argv.input`` is
# the folder holding the PDF files, ``argv.output`` the SQLite file to write.
# Each PDF is read as one person: the name, the 'Experience' section and the
# 'Education' section are extracted and stored in the database.
output_file = os.path.abspath(argv.output)
input_folder = os.path.abspath(argv.input)
print('Input folder: %s, output file: %s ' % (input_folder, output_file))
conn = create_database(output_file)
conn.commit()
c = conn.cursor()
filelist = getfilelist(input_folder, '.pdf')
if not filelist:
    print('No pdf files found in the provided folder.')
    conn.close()
    sys.exit(2)
try:
    for f in filelist:
        # The file stays open for the whole extraction and is closed as soon
        # as the pages have been processed.
        with open(f, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure.
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            # Create a PDF page aggregator object.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            objs = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                # objects of every page, sorted by their Y coordinate
                # (descending), appended page after page
                objs.extend(sorted(get_objects(layout), key=lambda x: x.y0, reverse=True))

        # getting objects from the corresponding sections
        exp = get_data(objs, 'Experience')
        ed = get_data(objs, 'Education')

        full_name = get_name(objs)
        name = full_name[0].decode('utf8')
        # get_name may hand back a falsy surname for single-word names.
        surname = full_name[1].decode('utf8') if full_name[1] else u''
        person = c.execute('SELECT * FROM Person WHERE Name=? AND Surname=?', [name, surname]).fetchone()
        if not person:
            c.execute("INSERT INTO Person VALUES (NULL,?,?)", [name, surname])
            personId = c.lastrowid
        else:
            personId = person[0]

        for place in get_experience_info(exp):
            title, company, dates = place
            to_year = dates.get('to_year', '')
            # Values in the column order of the Experience table. Blank
            # company/title are stored as '' instead of a lookup-table id.
            c.execute("INSERT INTO Experience VALUES (NULL,?,?,?,?,?,?,?,?)", [
                personId,
                company.strip() and insert('Company', 'CompanyName', company, c),
                title.strip() and insert('Title', 'Title', title, c),
                dates.get('from_month', ''),
                dates.get('from_year', ''),
                dates.get('to_month', ''),
                to_year,
                1 if to_year == 'Present' else 0,
            ])

        for place in get_education_info(ed):
            school, degree, major, dates = place
            # The end year of *this* education entry decides 'ongoing'.
            to_year = dates.get('to_year', '')
            # Values in the column order of the Education table.
            c.execute("INSERT INTO Education VALUES (NULL,?,?,?,?,?,?,?,?,?)", [
                personId,
                degree.strip() and insert('Degree', 'Degree', degree, c),
                school.strip() and insert('School', 'School', school, c),
                major.strip() and insert('Major', 'Major', major, c),
                dates.get('from_month', ''),
                dates.get('from_year', ''),
                dates.get('to_month', ''),
                to_year,
                1 if to_year == 'Present' else 0,
            ])
        print(full_name)
    conn.commit()
finally:
    conn.close()
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, check that the input
    # directory exists, then hand the parsed arguments over to main().
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help="Directory with pdf files")
    parser.add_argument('-o', '--output', required=True, help="Path of the SQLite database file to create")
    args = parser.parse_args()
    # isdir rather than exists: an existing regular file is not a valid input.
    if not os.path.isdir(args.input):
        # sys.exit is always available, unlike the exit() helper that only
        # exists when the site module has been loaded.
        sys.exit("Please specify an existing directory using the -i parameter.")
    main(args)