forked from SavinaRoja/OpenAccess_EPUB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·305 lines (282 loc) · 12.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#! /usr/bin/python
__version__ = '0.0.15'
#Standard Library Modules
import argparse
import sys
import os.path
import shutil
import urllib2
import urlparse
import logging
#OpenAccess_EPUB Modules
import utils
import metadata
import opf
import tocncx
import content
from settings import Settings
from article import Article
settings = Settings()
def initCache(cache_loc):
    '''
    Initiates the cache directory tree if it does not exist.

    Creates the top-level cache directory, per-publisher subdirectories
    ('PLoS', 'Frontiers'), and the nested image-category directories under
    'model/images' used for cached article images.
    '''
    #os.makedirs creates all intermediate directories, so listing only the
    #leaf paths builds the entire tree with far fewer calls than chained
    #os.mkdir invocations
    leaf_dirs = [
        'PLoS',
        'Frontiers',
        os.path.join('model', 'images', 'figures'),
        os.path.join('model', 'images', 'tables'),
        os.path.join('model', 'images', 'equations'),
        os.path.join('model', 'images', 'supplementary'),
    ]
    os.mkdir(cache_loc)
    for leaf in leaf_dirs:
        os.makedirs(os.path.join(cache_loc, leaf))
def urlInput(input, xml_dir):
    '''
    Handles input in URL form to instantiate the document.

    Recognizes PLoS article URLs, converts them to the XML download address,
    saves the fetched XML into xml_dir, and returns a tuple of the
    instantiated Article and the local XML filename. Exits the program for
    unsupported publishers or malformed/unreachable URLs.
    '''
    try:
        if '%2F10.1371%2F' in input:  # This is a PLoS page
            address = urlparse.urlparse(input)
            _fetch = '/article/fetchObjectAttachment.action?uri='
            print(address.path)
            _id = address.path.split('/')[2]
            _rep = '&representation=XML'
            access = '{0}://{1}{2}{3}{4}'.format(address.scheme,
                                                 address.netloc,
                                                 _fetch, _id, _rep)
            print('Opening {0}'.format(access))
            open_xml = urllib2.urlopen(access)
        elif '/10.3389/' in input:  # This is a Frontiers page
            print('OpenAccess_EPUB does not yet support Frontiers')
            sys.exit(0)
        else:  # We don't know how to handle this input
            print('Invalid Link: Bad URL or unsupported publisher')
            sys.exit(1)
    except Exception:
        #Was a bare except, which also caught the SystemExit raised by the
        #sys.exit() calls above: the Frontiers exit(0) and the invalid-link
        #exit(1) were both misreported as 'Invalid Link: Enter a corrected
        #link...'. Exception excludes SystemExit/KeyboardInterrupt.
        print('Invalid Link: Enter a corrected link or use local file')
        sys.exit(1)
    else:
        #PLoS supplies the canonical filename in the response header
        filename = open_xml.headers['Content-disposition'].split('\"')[1]
        filename = os.path.join(xml_dir, filename)
        with open(filename, 'wb') as xml_file:
            xml_file.write(open_xml.read())
        document = Article(filename)
        return(document, filename)
def doiInput(input, xml_dir):
    '''
    Handles input in DOI form to instantiate the document.

    Resolves a "doi:10.1371/..." string through dx.doi.org to the article
    page, derives the XML download address from the redirected URL, saves
    the XML into xml_dir, and returns a tuple of the instantiated Article
    and the local XML filename. Exits the program when the DOI cannot be
    resolved or fetched.
    '''
    try:
        #input is expected to look like "doi:10.1371/..."; strip the
        #4-character "doi:" prefix before handing it to the resolver
        doi_url = 'http://dx.doi.org/' + input[4:]
        page = urllib2.urlopen(doi_url)
        address = urlparse.urlparse(page.geturl())
        path = address.path.replace(':', '%3A').replace('/', '%2F')
        _fetch = '/article/fetchObjectAttachment.action?uri='
        _id = path.split('article%2F')[1]
        _rep = '&representation=XML'
        access = '{0}://{1}{2}{3}{4}'.format(address.scheme, address.netloc,
                                             _fetch, _id, _rep)
        open_xml = urllib2.urlopen(access)
    except Exception:
        #Narrowed from a bare except so KeyboardInterrupt and SystemExit
        #are no longer swallowed and misreported as a bad DOI
        print('Invalid DOI Link: Check for correct address and format')
        print('A valid entry looks like: \"doi:10.1371/journal.pcbi.1002222\"')
        sys.exit(1)
    else:
        #PLoS supplies the canonical filename in the response header
        filename = open_xml.headers['Content-Disposition'].split('\"')[1]
        filename = os.path.join(xml_dir, filename)
        with open(filename, 'wb') as xml_file:
            xml_file.write(open_xml.read())
        document = Article(filename)
        return(document, filename)
def localInput(input):
    '''
    Instantiates the document from an xml file that already exists on the
    local filesystem. Returns a tuple of the Article and the file path.
    '''
    return Article(input), input
def dirExists(outdirect, batch):
    '''
    Resolves a collision with an already-existing output directory.

    In batch mode the directory is removed without asking. Otherwise the
    user is prompted; answering 'y', 'Y', or pressing enter replaces the
    directory, anything else aborts the program.
    '''
    if batch:
        shutil.rmtree(outdirect)
        return
    print(u'The directory {0} already exists.'.format(outdirect))
    answer = raw_input('Replace? [y/n]')
    if answer not in ('y', 'Y', ''):
        print('Aborting process.')
        sys.exit()
    shutil.rmtree(outdirect)
def makeEPUB(document, xml_local, cache_dir, outdirect, log_to):
    '''
    Encapsulates the primary processing work-flow. Before this method is
    called, pre-processing has occurred to define important directory and file
    locations. The document has been processed for metadata and now it is time
    to generate the ePub content.

    document  -- Article instance parsed from the local xml file
    xml_local -- path to the local xml file backing the Article
    cache_dir -- image cache directory (see initCache)
    outdirect -- output directory; also the base name of the .epub file
    log_to    -- log directory; NOTE(review): not referenced in this body
    '''
    print(u'Processing output to {0}.epub'.format(outdirect))
    #Build the base ePub skeleton once if missing, then copy it for this
    #article's output directory
    if not os.path.isdir(settings.base_epub):
        utils.makeEPUBBase(settings.base_epub, settings.css_location)
    shutil.copytree(settings.base_epub, outdirect)
    #The DOI prefix selects the publisher-specific image fetcher:
    #10.1371 is PLoS, 10.3389 is Frontiers
    DOI = document.getDOI()
    if DOI.split('/')[0] == '10.1371':
        document.fetchPLoSImages(cache_dir, outdirect, settings.caching)
    elif DOI.split('/')[0] == '10.3389':
        document.fetchFrontiersImages(cache_dir, outdirect, settings.caching)
    #Generate the OPS content, the NCX table of contents, and the OPF
    #manifest, then zip the directory into the .epub file
    content.OPSContent(xml_local, DOI, outdirect, document)
    toc = tocncx.TocNCX()
    toc.takeArticle(document)
    toc.write(outdirect)
    myopf = opf.ContentOPF(outdirect)
    myopf.takeArticle(document)
    myopf.write()
    utils.epubZip(outdirect)
    #WARNING: shutil.rmtree() is a recursive deletion function, care should be
    #taken whenever modifying this code
    if settings.cleanup:
        shutil.rmtree(outdirect)
def makeCollectionEPUB(documents, cache_dir, outdirect, log_to):
    '''Encapsulates the processing work-flow for the creation of
    "collection", or "omnibus" ePubs from multiple PLoS journal articles.
    Article objects have been instantiated and tupled to their local xml files
    and now we may generate the file.

    documents -- iterable of (Article, xml_path) tuples
    cache_dir -- image cache directory (see initCache)
    outdirect -- output directory; also the base name of the .epub file
    log_to    -- log directory; NOTE(review): not referenced in this body
    '''
    print(u'Processing output to {0}.epub'.format(outdirect))
    shutil.copytree(settings.base_epub, outdirect)
    #One toc and one opf in collection mode accumulate entries from every
    #article before being written out
    mytoc = tocncx.TocNCX(collection_mode=True)
    myopf = opf.ContentOPF(outdirect, collection_mode=True)
    for (doc, xml) in documents:
        DOI = doc.getDOI()
        #Only the PLoS image fetcher is used; per the docstring, collection
        #ePubs are built from PLoS journal articles
        doc.fetchPLoSImages(cache_dir, outdirect, settings.caching)
        content.OPSContent(xml, DOI, outdirect, doc)
        mytoc.takeArticle(doc)
        myopf.takeArticle(doc)
    mytoc.write(outdirect)
    myopf.write()
    utils.epubZip(outdirect)
    #WARNING: shutil.rmtree() is a recursive deletion function, care should be
    #taken whenever modifying this code
    if settings.cleanup:
        shutil.rmtree(outdirect)
def epubcheck(epubname):
    '''
    Runs epubcheck validation on the named epub file.

    Replaces the current process (via os.execlp) with a java execution of
    the locally installed epubcheck .jar, whose location is configured in
    settings.py. A missing or non-'.epub' filename extension is corrected
    with a warning before the jar is invoked.
    '''
    root, ext = os.path.splitext(epubname)
    if ext == '':
        print('Warning: Filename extension is empty, appending \'.epub\'...')
        epubname = root + '.epub'
    elif ext != '.epub':
        print('Warning: Filename extension is not \'.epub\', appending it...')
        epubname = epubname + '.epub'
    os.execlp('java', 'OpenAccess_EPUB', '-jar', settings.epubcheck, epubname)
def main():
    '''
    Main Script.

    Parses the command line, ensures the log/cache/xml/output directories
    exist, configures logging, then dispatches to one of three mutually
    exclusive modes: batch (-b), collection (-C), or single input (-i).
    '''
    parser = argparse.ArgumentParser(description='OpenAccess_EPUB Parser')
    parser.add_argument('--version', action='version',
                        version='OpenAccess_EPUB {0}'.format(__version__))
    #parser.add_argument('-q', '--quiet', action='store_true', default=False)
    parser.add_argument('-o', '--output', action='store',
                        default=settings.default_output,
                        help='Use to specify a desired output directory')
    parser.add_argument('-s', '--save-xml', action='store',
                        default=settings.xml_location,
                        help='''Use to specify a directory for storing \
downloaded xml files''')
    parser.add_argument('-l', '--log-to', action='store',
                        default=settings.log_location,
                        help='Use to specify a non-default log directory')
    parser.add_argument('-c', '--cache', action='store',
                        default=settings.cache_location,
                        help='Use to specify a non-default cache directory')
    #The three input modes may not be combined on one command line
    modes = parser.add_mutually_exclusive_group()
    modes.add_argument('-i', '--input', action='store',
                       help='''Input may be a path to a local directory, a \
URL to a PLoS journal article, or a PLoS DOI \
string''')
    modes.add_argument('-b', '--batch', action='store', default=False,
                       help='''Use to specify a batch directory; each \
article inside will be processed.''')
    modes.add_argument('-C', '--collection', action='store', default=False,
                       help='''Use to create an ePub file containing \
multiple resources.''')
    args = parser.parse_args()
    #Check for directory existence, create if not found
    #This will break if the path has no immediate parent directory, this could
    #be fixed but I am not sure if it should
    if not os.path.isdir(args.log_to):
        os.mkdir(args.log_to)
    if not os.path.isdir(args.cache):
        initCache(args.cache)
    if not os.path.isdir(args.save_xml):
        os.mkdir(args.save_xml)
    if not os.path.isdir(args.output):
        os.mkdir(args.output)
    #Initiate logging settings
    logname = os.path.join(args.log_to, 'temp.log')
    logging.basicConfig(filename=logname, level=logging.DEBUG)
    logging.info('OpenAccess_EPUB Log v.{0}'.format(__version__))
    #Batch Mode: every .xml file in the batch directory becomes its own epub
    if args.batch:
        download = False
        files = os.listdir(args.batch)
        for file in files:
            if not os.path.splitext(file)[1] == '.xml':
                #Non-xml directory entries are silently skipped
                pass
            else:
                filename = os.path.join(args.batch, file)
                doc, xml_local = localInput(filename)
                input_name = os.path.splitext(os.path.split(xml_local)[1])[0]
                output_name = os.path.join(args.output, input_name)
                if os.path.isdir(output_name):
                    #args.batch is truthy here, so dirExists removes the
                    #existing directory without prompting
                    dirExists(output_name, args.batch)
                makeEPUB(doc, xml_local, args.cache, output_name, args.log_to)
        sys.exit(0)
    #Collection Mode: args.collection names a file listing one input per line;
    #the collection filename (sans extension) becomes the output name
    if args.collection:
        t = os.path.splitext(os.path.split(args.collection)[1])[0]
        output_name = os.path.join(args.output, t)
        if os.path.isdir(output_name):
            dirExists(output_name, args.batch)
        with open(args.collection, 'r') as collection:
            inputs = collection.readlines()
        documents = []
        for i in inputs:
            #Each line may be a URL, a doi: string, or a local file path
            if 'http://www' in i:
                download = True
                document, xml_local = urlInput(i.rstrip('\n'), args.save_xml)
            elif i[:4] == 'doi:':
                download = True
                document, xml_local = doiInput(i.rstrip('\n'), args.save_xml)
            else:
                if not i:
                    #NOTE(review): readlines() keeps the trailing '\n', so
                    #even blank lines are truthy and this branch appears
                    #unreachable — confirm intent
                    pass
                else:
                    download = False
                    document, xml_local = localInput(i.rstrip('\n'))
            documents += [(document, xml_local)]
        makeCollectionEPUB(documents, args.cache, output_name, args.log_to)
        epubcheck('{0}.epub'.format(output_name))
        sys.exit(0)
    #Single Input Mode
    else:
        #Determination of input type and processing
        #NOTE(review): if no mode flag was given, args.input is None and the
        #membership test below raises TypeError — confirm intended usage
        if 'http://www' in args.input:
            download = True
            document, xml_local = urlInput(args.input, args.save_xml)
        elif args.input[:4] == 'doi:':
            download = True
            document, xml_local = doiInput(args.input, args.save_xml)
        else:
            download = False
            document, xml_local = localInput(args.input)
        #For now PloS naming will be maintained
        #The name of the directory and the .epub should be a string like
        #journal.pcbi.1002211
        #or if already re-named, it will assume the xml name
        input_name = os.path.splitext(os.path.split(xml_local)[1])[0]
        output_name = os.path.join(args.output, input_name)
        if os.path.isdir(output_name):
            dirExists(output_name, args.batch)
        makeEPUB(document, xml_local, args.cache, output_name, args.log_to)
        #Downloaded xml files are discarded unless the user opted to keep them
        if download and not settings.save_xml:
            os.remove(xml_local)
        #Rename the generic temp.log after the article that was processed
        newname = u'{0}.log'.format(input_name)
        newname = os.path.join(args.log_to, newname)
        os.rename(logname, newname)
        epubcheck('{0}.epub'.format(output_name))
#Script entry point when run directly (not imported)
if __name__ == '__main__':
    main()