forked from akkana/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathepubtag.py
executable file
·235 lines (207 loc) · 8.03 KB
/
epubtag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#! /usr/bin/env python
import os, sys
import zipfile
import xml.dom.minidom
def _element_text(el):
    """Return the text of el's first child node as a native str.

    Under Python 2 minidom hands back unicode, which is encoded to UTF-8
    here so that printing works even when stdout is a pipe with no
    encoding (e.g. "epubtag.py book.epub | cat").  See
    http://stackoverflow.com/questions/492483/setting-the-correct-encoding-when-piping-stdout-in-python
    Under Python 3 the text is already a str and is returned unchanged.
    """
    text = el.childNodes[0].wholeText
    if not isinstance(text, str):
        text = text.encode('utf-8', 'backslashreplace')
    return text


def _get_matches(dom, elname, delete_tags=False):
    """Collect (and optionally delete) every elname element in dom.

    Returns a tuple (matches, elements, parent):
      matches  -- text of each non-empty element (empty if delete_tags)
      elements -- every element found, including ones just deleted
      parent   -- the node that contained them, or None if none were found
    """
    elements = dom.getElementsByTagName(elname)
    parent = None
    matches = []
    for el in elements:
        # Sanity check: all the elements are expected to share one parent
        # (the <metadata> node).  Must run before any deletion, since
        # removeChild() clears el.parentNode.
        if parent is None:
            parent = el.parentNode
        else:
            assert parent == el.parentNode

        if delete_tags:
            if el.childNodes:
                print("Deleting: %s" % _element_text(el))
            else:
                print("Deleting empty %s tag" % elname)
            el.parentNode.removeChild(el)
        elif el.childNodes:
            matches.append(_element_text(el))
        else:
            print("Empty %s tag" % elname)
    return matches, elements, parent


def tag_epub_file(filename, new_tag_list=None, delete_tags=False, brief=False):
    """Print an epub's title, author(s) and subject tags; optionally retag it.

    Tags live inside the OPF file's <metadata> element and look like:
        <dc:subject>Presidents -- United States -- Biography</dc:subject>
    Author (dc:creator) and Title (dc:title) are stored similarly.

    filename     -- path to the epub (a zip archive containing a .opf file)
    new_tag_list -- tags to add as new dc:subject elements, in order.
                    The epub is only rewritten when this is non-empty.
    delete_tags  -- remove the existing dc:subject elements first.  Since
                    the file is only rewritten when new tags are given,
                    deletion alone is reported but not saved.
    brief        -- print everything for the book on a single line,
                    fields separated by " | ".

    Returns None.  Raises RuntimeError if the archive has no .opf member,
    and IOError (prefixed with the filename) if the .opf can't be read.
    """
    subject_tag = 'dc:subject'

    if brief:
        # No newline: the remaining fields are appended to this line.
        sys.stdout.write('%s | ' % filename)
    else:
        print(filename)

    if not zipfile.is_zipfile(filename):
        print("%s isn't an epub file (not zipped)" % filename)
        return

    zf = zipfile.ZipFile(filename)
    try:
        # Find the first .opf member: that's the metadata file.
        opf_name = None
        for name in zf.namelist():
            if os.path.basename(name).endswith('.opf'):
                opf_name = name
                break
        if opf_name is None:
            raise RuntimeError('No content.opf in %s' % filename)

        content = zf.open(opf_name)
        try:
            dom = xml.dom.minidom.parse(content)
        except IOError as e:
            raise IOError(filename + ': ' + str(e))
        finally:
            content.close()

        # But first, grab the title and author
        titles, elements, parent = _get_matches(dom, 'dc:title')
        if titles:
            if brief:
                sys.stdout.write(', '.join(titles) + ' | ')
            else:
                for t in titles:
                    print("Title: %s" % t)

        authors, elements, parent = _get_matches(dom, 'dc:creator')
        if brief:
            sys.stdout.write(', '.join(authors) + ' | ')
        else:
            label = 'Authors:' if len(authors) > 1 else 'Author:'
            print('%s %s' % (label, ', '.join(authors)))

        # Now get the subject tags, deleting them if appropriate.
        tags, elements, parent = _get_matches(dom, subject_tag, delete_tags)
        if brief:
            print(', '.join(tags))
        elif tags:
            print("Tags:")
            for tag in tags:
                print('  %s' % tag)

        # Now add new tags, if any
        if not new_tag_list:
            return

        # There are new tags to add.  If we didn't see a dc:subject,
        # we still need a parent, the <metadata> tag.
        if parent is None:
            print("Warning: didn't see any subject tags previously")
            metadata = dom.getElementsByTagName("metadata")
            # If there's no metadata tag, maybe we should add one,
            # but it might be better to throw an error.
            if not metadata:
                print("No metadata tag! Bailing.")
                return
            parent = metadata[0]

        # New subject tags go right after the last existing one, i.e.
        # before whatever followed it.  Tags that were just deleted are
        # detached from the tree (no parent, no sibling), and there may
        # have been no tags at all; in those cases append to the parent.
        insert_before = None
        if elements:
            last_tag_el = elements[-1]
            if last_tag_el.parentNode is parent:
                insert_before = last_tag_el.nextSibling

        for new_tag in new_tag_list:
            newnode = dom.createElement(subject_tag)
            newnode.appendChild(dom.createTextNode(new_tag))
            # Also add a newline after each new node
            newline = dom.createTextNode('\n')

            # Inserting before one fixed reference node keeps the new
            # tags in the order they were given.
            if insert_before is not None:
                parent.insertBefore(newnode, insert_before)
                parent.insertBefore(newline, insert_before)
            else:
                parent.appendChild(newnode)
                parent.appendChild(newline)

            print("Adding: %s" % new_tag)

        # Open a new zip file to write to, and copy everything
        # but change the .opf we parsed to the new one.
        new_epub_file = filename + '.tmp'
        ozf = zipfile.ZipFile(new_epub_file, 'w')
        try:
            for info in zf.infolist():
                if info.filename == opf_name:
                    # minidom gives us no way to find out the encoding
                    # of the XML file we just parsed, so the best we can
                    # do is force it to UTF-8.  toxml() rather than
                    # toprettyxml() so that no extra indentation is
                    # injected each time the file is rewritten.
                    ozf.writestr(info, dom.toxml(encoding='UTF-8'))
                else:
                    ozf.writestr(info, zf.read(info.filename))
        finally:
            ozf.close()
    finally:
        zf.close()

    # Now we have the new file in new_epub_file, old in filename.
    # Rename appropriately:
    bakfile = filename + ".bak"
    os.rename(filename, bakfile)
    try:
        os.rename(new_epub_file, filename)
    except OSError:
        # Don't leave the book stranded under its .bak name.
        os.rename(bakfile, filename)
        raise
    print("Wrote %s" % filename)
    os.remove(bakfile)
def Usage():
    """Print the command-line help and exit with status 1.

    The program name shown is the basename of sys.argv[0].
    Never returns: always raises SystemExit(1).
    """
    # Each print takes a single pre-formatted string so the output is
    # the same whether print is a statement or a function.
    print("Usage: %s file.epub [file.epub...] [-b] [-d] [-t tag1 [tag2...]]"
          % os.path.basename(sys.argv[0]))
    print("Display, add or remove tags in epub ebooks.")
    print("Copyright 2012 by Akkana Peck -- share and enjoy under the GPL.")
    print("Options:")
    print(" -t: add tags (otherwise, just print existing tags)")
    print(" -d: delete existing tags before adding new ones")
    print(" -b: print only one line for each book (useful with grep)")
    sys.exit(1)
# main
if __name__ == "__main__":
    # optparse can't handle multiple arguments of the same type
    # (e.g. multiple tags), and the argparse doc is impenetrable.
    # So let's just do this: any argument corresponding to a readable
    # file must be an epub filename to be read/modified;
    # any argument following a -t is a tag to be added;
    # if there's a -d anywhere, we'll delete existing tags first;
    # -b anywhere selects one-line-per-book output;
    # any other flag, print a usage statement.
    epubfiles = []
    tags = []
    add_tags = False
    delete_tags = False
    brief = False

    for arg in sys.argv[1:]:
        if arg == '-d':
            delete_tags = True
        elif arg == '-t':
            add_tags = True
        elif arg == '-b':
            brief = True
        elif arg.startswith('-'):
            # startswith() rather than arg[0] so that an empty-string
            # argument doesn't raise IndexError.  Usage() exits.
            Usage()
        elif add_tags:
            # done adding files, adding tags now
            tags.append(arg)
        elif os.access(arg, os.R_OK):
            # still adding files
            epubfiles.append(arg)
        else:
            print("Can't read %s -- skipping" % arg)

    if not epubfiles:
        Usage()

    for f in epubfiles:
        if not brief:
            print("=======")
        tag_epub_file(f, tags, delete_tags, brief)