ebook finetuning to satisfy https://validator.w3.org

rrthomas · Mar 26, 2024 · 59bd038 · 59bd038
1 parent d7f6205
commit 59bd038
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 4 deletions.
diff --git a/scripts/ebook/3.py b/scripts/ebook/3.py
@@ -3,6 +3,7 @@
 """
 Modify flattened .tex file.
 """
+
 import datetime as dt
 import os
 import re
@@ -87,25 +88,63 @@
     r"\\section{Lord of the Rationality}\n",
     cont,
     flags=re.DOTALL,
+    count=1,
 )
 cont = re.sub(
     r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n",
     r"\\section{The Witch and the Wardrobe}\n",
     cont,
     flags=re.DOTALL,
+    count=1,
 )
 cont = re.sub(
     r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n",
     r"\\section{ThunderSmarts}\n",
     cont,
     flags=re.DOTALL,
+    count=1,
 )
 
 cont = re.sub(
     r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n",
     r"\\section{Utilitarian Twilight}\n",
     cont,
     flags=re.DOTALL,
+    count=1,
+)
+
+# # remove Deathly_Hallows_Sign.pdf and other pdf images
+# # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf}
+# cont = re.sub(
+#     # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}",
+#     r"\\includegraphics.*?\.pdf\}",
+#     "",
+#     cont,
+# )
+
+# remove all images
+cont = re.sub(
+    r"\\includegraphics\[.*?\]\{.*?\}",
+    "",
+    cont,
+    flags=re.DOTALL,
+)
+
+# remove empty envs
+cont = re.sub(
+    r"\\begin\{([^\}]*)\}\s*\\end\{\1}",
+    "",
+    cont,
+    flags=re.DOTALL,
+)
+
+# remove end stuff
+cont = re.sub(
+    r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}",
+    r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}",
+    cont,
+    flags=re.DOTALL,
+    count=1,
 )
 
 with open(target_file, mode="w", encoding="utf-8", newline="\n") as fhOut:

diff --git a/scripts/ebook/6.py b/scripts/ebook/6.py
@@ -3,6 +3,7 @@
 """
 HTML modifications.
 """
+
 import os
 import re
 import sys
@@ -24,21 +25,48 @@
     r"\1",
     cont,
     flags=re.DOTALL | re.IGNORECASE,
+    count=1,
 )
 
 # cleanup hp-intro leftovers
-cont = cont.replace(
+cont = re.sub(
     """<p>Fanfiction based on the characters of</p>
 <p>J. K. ROWLING</p>
 <p>and her books:</p>""",
     "<p>Fanfiction based on the characters of J. K. Rowling and her books:</p>",
+    cont,
+    count=1,
 )
-cont = cont.replace("<p>Year at Hogwarts</p>\n", "")
-cont = cont.replace(
+
+cont = re.sub("<p>Year at Hogwarts</p>\n", "", cont, count=7)
+cont = re.sub(
     "</em></p>\n<p><em>Harry Potter and the",
-    "<br/>\nHarry Potter and the",
+    "<br>\nHarry Potter and the",
+    cont,
+    count=7,
+)
+
+# set language
+cont = re.sub(
+    r'(<html [^>]*) lang="" xml:lang=""',
+    r'\1 lang="en" xml:lang="en"',
+    cont,
+    count=1,
+)
+
+# remove trailing slashes to satisfy https://validator.w3.org
+cont = cont.replace("<br />", "<br>")
+cont = cont.replace("<hr />", "<hr>")
+
+cont = re.sub(
+    r"(<meta [^>]*) />",
+    r"\1>",
+    cont,
 )
 
+# remove bad span ids (containing spaces) from newspaper spans
+cont = re.sub(r'<span id="[^"]+" label="[^"]+">', r"<span>", cont, count=5)
+
 # doc structure (not needed any more, using calibre --level1-toc flag instead)
 # sed -i 's/<h1 /<h1 class="part"/g' $target_file
 # sed -i 's/<h2 /<h2 class="chapter"/g' $target_file