Skip to content

Commit

Permalink
ebook finetuning to satisfy https://validator.w3.org
Browse files Browse the repository at this point in the history
  • Loading branch information
entorb committed Mar 26, 2024
1 parent d7f6205 commit 59bd038
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 4 deletions.
39 changes: 39 additions & 0 deletions scripts/ebook/3.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
Modify flattened .tex file.
"""

import datetime as dt
import os
import re
Expand Down Expand Up @@ -87,25 +88,63 @@
r"\\section{Lord of the Rationality}\n",
cont,
flags=re.DOTALL,
count=1,
)
# Replace the omake entry tagged {NarniaBLL} with a plain \section heading.
# The match runs from the \OmakeIVspecialsection macro through the next blank
# line (lazy ".*?" with DOTALL); only the first occurrence is rewritten.
cont = re.sub(
r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n",
r"\\section{The Witch and the Wardrobe}\n",
cont,
flags=re.DOTALL,
count=1,
)
# Same rewrite for the entry tagged {Thundercats}.
cont = re.sub(
r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n",
r"\\section{ThunderSmarts}\n",
cont,
flags=re.DOTALL,
count=1,
)

# Same rewrite for the entry tagged {Twilight}.
cont = re.sub(
r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n",
r"\\section{Utilitarian Twilight}\n",
cont,
flags=re.DOTALL,
count=1,
)

# # remove Deathly_Hallows_Sign.pdf and other pdf images
# # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf}
# cont = re.sub(
# # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}",
# r"\\includegraphics.*?\.pdf\}",
# "",
# cont,
# )

# remove all images
# NOTE: the pattern requires an optional-argument part "[...]", so an
# \includegraphics{...} without brackets is not matched by this substitution.
cont = re.sub(
r"\\includegraphics\[.*?\]\{.*?\}",
"",
cont,
flags=re.DOTALL,
)

# remove empty envs
# The backreference \1 forces \end{...} to name the same environment as the
# \begin{...}; only whitespace may sit between the two.
cont = re.sub(
r"\\begin\{([^\}]*)\}\s*\\end\{\1}",
"",
cont,
flags=re.DOTALL,
)

# remove end stuff
# The greedy "(.*)" anchors on the last \end{chapterOpeningAuthorNote} that is
# still followed by \end{document}; everything between those two markers is
# dropped and replaced by a single newline.
cont = re.sub(
r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}",
r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}",
cont,
flags=re.DOTALL,
count=1,
)

with open(target_file, mode="w", encoding="utf-8", newline="\n") as fhOut:
Expand Down
36 changes: 32 additions & 4 deletions scripts/ebook/6.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
HTML modifications.
"""

import os
import re
import sys
Expand All @@ -24,21 +25,48 @@
r"\1",
cont,
flags=re.DOTALL | re.IGNORECASE,
count=1,
)

# cleanup hp-intro leftovers
cont = cont.replace(
cont = re.sub(
"""<p>Fanfiction based on the characters of</p>
<p>J. K. ROWLING</p>
<p>and her books:</p>""",
"<p>Fanfiction based on the characters of J. K. Rowling and her books:</p>",
cont,
count=1,
)
cont = cont.replace("<p>Year at Hogwarts</p>\n", "")
cont = cont.replace(

cont = re.sub("<p>Year at Hogwarts</p>\n", "", cont, count=7)
cont = re.sub(
"</em></p>\n<p><em>Harry Potter and the",
"<br/>\nHarry Potter and the",
"<br>\nHarry Potter and the",
cont,
count=7,
)

# set language
cont = re.sub(
r'(<html [^>]*) lang="" xml:lang=""',
r'\1 lang="en" xml:lang="en"',
cont,
count=1,
)

# remove trailing slashes to satisfy https://validator.w3.org
cont = cont.replace("<br />", "<br>")
cont = cont.replace("<hr />", "<hr>")

cont = re.sub(
r"(<meta [^>]*) />",
r"\1>",
cont,
)

# remove bad span ids (containing spaces) from newspaper spans
cont = re.sub(r'<span id="[^"]+" label="[^"]+">', r"<span>", cont, count=5)

# doc structure (not needed any more, using calibre --level1-toc flag instead)
# sed -i 's/<h1 /<h1 class="part"/g' $target_file
# sed -i 's/<h2 /<h2 class="chapter"/g' $target_file
Expand Down

0 comments on commit 59bd038

Please sign in to comment.