From a5533c25d7e1e4861f07e52dcd2f106138a0ed39 Mon Sep 17 00:00:00 2001 From: Torben <59419684+entorb@users.noreply.github.com> Date: Sat, 27 Apr 2024 02:18:01 +0200 Subject: [PATCH] ebook script renaming and unit tests --- scripts/ebook/3.py | 156 ------------------------------ scripts/ebook/6.py | 131 ------------------------- scripts/ebook/{1.sh => step_1.sh} | 0 scripts/ebook/{2.sh => step_2.sh} | 0 scripts/ebook/step_3.py | 155 +++++++++++++++++++++++++++++ scripts/ebook/{4.py => step_4.py} | 24 ++--- scripts/ebook/step_4_test.py | 23 +++++ scripts/ebook/{5.sh => step_5.sh} | 0 scripts/ebook/step_6.py | 131 +++++++++++++++++++++++++ scripts/ebook/{7.sh => step_7.sh} | 0 scripts/make_ebooks.sh | 14 +-- 11 files changed, 328 insertions(+), 306 deletions(-) delete mode 100755 scripts/ebook/3.py delete mode 100755 scripts/ebook/6.py rename scripts/ebook/{1.sh => step_1.sh} (100%) rename scripts/ebook/{2.sh => step_2.sh} (100%) create mode 100755 scripts/ebook/step_3.py rename scripts/ebook/{4.py => step_4.py} (63%) create mode 100644 scripts/ebook/step_4_test.py rename scripts/ebook/{5.sh => step_5.sh} (100%) create mode 100755 scripts/ebook/step_6.py rename scripts/ebook/{7.sh => step_7.sh} (100%) diff --git a/scripts/ebook/3.py b/scripts/ebook/3.py deleted file mode 100755 index 861d2c82c..000000000 --- a/scripts/ebook/3.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# by Torben Menke https://entorb.net - -""" -Modify flattened .tex file. -""" - -import datetime as dt -import os -import re -from pathlib import Path - -os.chdir(Path(__file__).parent.parent.parent) - -source_file = Path("tmp/hpmor-epub-2-flatten.tex") -target_file = Path("tmp/hpmor-epub-3-flatten-mod.tex") - -print("=== 3. 
modify flattened file ===") - - -with source_file.open(encoding="utf-8", newline="\n") as fh_in: - cont = fh_in.read() - -# \today -date_str = dt.datetime.now(dt.timezone.utc).date().strftime("%d.%m.%Y") -cont = cont.replace("\\today{}", date_str) - -# writtenNote env -> \writtenNoteA -cont = re.sub( - r"\s*\\begin\{writtenNote\}\s*(.*?)\s*\\end\{writtenNote\}", - r"\\writtenNoteA{\1}", - cont, - flags=re.DOTALL, -) - -# fix chapterOpeningAuthorNote -cont = re.sub( - r"(\\begin\{chapterOpeningAuthorNote\}\n)(.*?\n)(\\end\{chapterOpeningAuthorNote\}\n)", - r"\1E.~Y.:~\2\\newline\\rule[1ex]{\\textwidth}{.1pt}\\newline%\n\3", - cont, - flags=re.DOTALL, -) - -# some cleanup -# TODO: removed when switching to Ubuntu >= 23.04, -# since it let to a problem -# in line 31 of tmp/hpmor-epub-3-flatten-mod.tex -# cont = cont.replace("\\hplettrineextrapara", "") - -# additional linebreaks in verses of chapter 64 -cont = cont.replace("\\\\\n\n", "\n\n") - -# manual pagebreaks -cont = re.sub(r"\\clearpage(\{\}|)\n?", "", cont) - -# \vskip 1\baselineskip plus .5\textheight minus 1\baselineskip -cont = re.sub(r"\\vskip .*?\\baselineskip", "", cont) - -# remove \settowidth{\versewidth}... 
\begin{verse}[\versewidth] -cont = re.sub( - r"\n[^\n]*?\\settowidth\{\\versewidth\}[^\n]*?\n(\\begin\{verse\}\[\\versewidth\])", - r"\n\\begin{verse}", - cont, -) - -# remove \settowidth -cont = re.sub( - r"\\settowidth\{[^\}]*\}\{([^\}]*)\}", - r"\1", - cont, - flags=re.DOTALL, -) - -# fix „ at start of chapter -# \lettrine[ante=„] -> „\lettrine -# \lettrinepara[ante=„] -> „\lettrine -cont = re.sub( - r"\\(lettrine|lettrinepara)\[ante=(.)\]", - r"\2\\lettrine", - cont, -) - -# OMakeIV sections -# \OmakeIVsection{My Little Pony: Friendship is Science} -cont = re.sub(r"\\OmakeIVsection(\[[^\]]*\]|)\{(.*)\}\n+", r"\\section{\2}\n", cont) - -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{RingBearer\}.*?\n\n", - r"\\section{Lord of the Rationality}\n", - cont, - flags=re.DOTALL, - count=1, -) -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n", - r"\\section{The Witch and the Wardrobe}\n", - cont, - flags=re.DOTALL, - count=1, -) -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n", - r"\\section{ThunderSmarts}\n", - cont, - flags=re.DOTALL, - count=1, -) - -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n", - r"\\section{Utilitarian Twilight}\n", - cont, - flags=re.DOTALL, - count=1, -) - -# \censor -cont = re.sub(r"\\censor\{.*?\}", r"xxxxxx", cont) - - -# # remove Deathly_Hallows_Sign.pdf and other pdf images -# # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf} -# cont = re.sub( -# # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}", -# r"\\includegraphics.*?\.pdf\}", -# "", -# cont, -# ) - -# remove all images -cont = re.sub( - r"\\includegraphics\[.*?\]\{.*?\}", - "", - cont, - flags=re.DOTALL, -) - -# remove empty envs -cont = re.sub( - r"\\begin\{([^\}]*)\}\s*\\end\{\1}", - "", - cont, - flags=re.DOTALL, -) - -# remove end stuff -cont = re.sub( - r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}", - r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}", - cont, - 
flags=re.DOTALL, - count=1, -) - -with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: - fh_out.write(cont) diff --git a/scripts/ebook/6.py b/scripts/ebook/6.py deleted file mode 100755 index 6c03d53b5..000000000 --- a/scripts/ebook/6.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# by Torben Menke https://entorb.net -# ruff: noqa: RUF001 - -""" -HTML modifications. -""" - -import os -import re -from pathlib import Path - -os.chdir(Path(__file__).parent.parent.parent) - -source_file = Path("tmp/hpmor-epub-5-html-unmod.html") -target_file = Path("hpmor.html") - -print("=== 6. HTML modifications ===") - - -with source_file.open(encoding="utf-8", newline="\n") as fh_in: - cont = fh_in.read() - -# remove strange leftovers from tex -> html conversion -cont = re.sub( - r"().*?
Book :
\n", - r"\1", - cont, - flags=re.DOTALL | re.IGNORECASE, - count=1, -) - -# cleanup hp-intro leftovers -cont = re.sub( - """Fanfiction based on the characters of
-J. K. ROWLING
-and her books:
""", - "Fanfiction based on the characters of J. K. Rowling and her books:
", - cont, - count=1, -) - -cont = re.sub("Year at Hogwarts
\n", "", cont, count=7) -cont = re.sub( - "\nHarry Potter and the",
- " E. Y.: ", " E.Y.: ")
-
-# converting "color-marked" styles of 1.sh back to proper style classes
-cont = re.sub(
- r'<(div|span) style="color: (parsel|writtenNote|McGonagallWhiteBoard|headline)"',
- r'<\1 class="\2"',
- cont,
-)
-
-# add css style file format for \emph in \emph
-with Path("scripts/ebook/html.css").open(encoding="utf-8", newline="\n") as fh_in:
- css = fh_in.read()
-cont = cont.replace("\n", css + "\n\n")
-
-
-with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
- fh_out.write(cont)
diff --git a/scripts/ebook/1.sh b/scripts/ebook/step_1.sh
similarity index 100%
rename from scripts/ebook/1.sh
rename to scripts/ebook/step_1.sh
diff --git a/scripts/ebook/2.sh b/scripts/ebook/step_2.sh
similarity index 100%
rename from scripts/ebook/2.sh
rename to scripts/ebook/step_2.sh
diff --git a/scripts/ebook/step_3.py b/scripts/ebook/step_3.py
new file mode 100755
index 000000000..21da654a1
--- /dev/null
+++ b/scripts/ebook/step_3.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# by Torben Menke https://entorb.net
+
+"""
+Modify flattened .tex file.
+"""
+
+import datetime as dt
+import os
+import re
+from pathlib import Path
+
+os.chdir(Path(__file__).parent.parent.parent)
+
+source_file = Path("tmp/hpmor-epub-2-flatten.tex")
+target_file = Path("tmp/hpmor-epub-3-flatten-mod.tex")
+
+if __name__ == "__main__":
+ print("=== 3. modify flattened file ===")
+
+ with source_file.open(encoding="utf-8", newline="\n") as fh_in:
+ cont = fh_in.read()
+
+ # \today
+ date_str = dt.datetime.now(dt.timezone.utc).date().strftime("%d.%m.%Y")
+ cont = cont.replace("\\today{}", date_str)
+
+ # writtenNote env -> \writtenNoteA
+ cont = re.sub(
+ r"\s*\\begin\{writtenNote\}\s*(.*?)\s*\\end\{writtenNote\}",
+ r"\\writtenNoteA{\1}",
+ cont,
+ flags=re.DOTALL,
+ )
+
+ # fix chapterOpeningAuthorNote
+ cont = re.sub(
+ r"(\\begin\{chapterOpeningAuthorNote\}\n)(.*?\n)(\\end\{chapterOpeningAuthorNote\}\n)",
+ r"\1E.~Y.:~\2\\newline\\rule[1ex]{\\textwidth}{.1pt}\\newline%\n\3",
+ cont,
+ flags=re.DOTALL,
+ )
+
+ # some cleanup
+ # TODO: removed when switching to Ubuntu >= 23.04,
+ # since it led to a problem
+ # in line 31 of tmp/hpmor-epub-3-flatten-mod.tex
+ # cont = cont.replace("\\hplettrineextrapara", "")
+
+ # additional linebreaks in verses of chapter 64
+ cont = cont.replace("\\\\\n\n", "\n\n")
+
+ # manual pagebreaks
+ cont = re.sub(r"\\clearpage(\{\}|)\n?", "", cont)
+
+ # \vskip 1\baselineskip plus .5\textheight minus 1\baselineskip
+ cont = re.sub(r"\\vskip .*?\\baselineskip", "", cont)
+
+ # remove \settowidth{\versewidth}... \begin{verse}[\versewidth]
+ cont = re.sub(
+ r"\n[^\n]*?\\settowidth\{\\versewidth\}[^\n]*?\n(\\begin\{verse\}\[\\versewidth\])",
+ r"\n\\begin{verse}",
+ cont,
+ )
+
+ # remove \settowidth
+ cont = re.sub(
+ r"\\settowidth\{[^\}]*\}\{([^\}]*)\}",
+ r"\1",
+ cont,
+ flags=re.DOTALL,
+ )
+
+ # fix „ at start of chapter
+ # \lettrine[ante=„] -> „\lettrine
+ # \lettrinepara[ante=„] -> „\lettrine
+ cont = re.sub(
+ r"\\(lettrine|lettrinepara)\[ante=(.)\]",
+ r"\2\\lettrine",
+ cont,
+ )
+
+ # OMakeIV sections
+ # \OmakeIVsection{My Little Pony: Friendship is Science}
+ cont = re.sub(r"\\OmakeIVsection(\[[^\]]*\]|)\{(.*)\}\n+", r"\\section{\2}\n", cont)
+
+ cont = re.sub(
+ r"\\OmakeIVspecialsection[^\n]+\{RingBearer\}.*?\n\n",
+ r"\\section{Lord of the Rationality}\n",
+ cont,
+ flags=re.DOTALL,
+ count=1,
+ )
+ cont = re.sub(
+ r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n",
+ r"\\section{The Witch and the Wardrobe}\n",
+ cont,
+ flags=re.DOTALL,
+ count=1,
+ )
+ cont = re.sub(
+ r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n",
+ r"\\section{ThunderSmarts}\n",
+ cont,
+ flags=re.DOTALL,
+ count=1,
+ )
+
+ cont = re.sub(
+ r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n",
+ r"\\section{Utilitarian Twilight}\n",
+ cont,
+ flags=re.DOTALL,
+ count=1,
+ )
+
+ # \censor
+ cont = re.sub(r"\\censor\{.*?\}", r"xxxxxx", cont)
+
+ # # remove Deathly_Hallows_Sign.pdf and other pdf images
+ # # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf}
+ # cont = re.sub(
+ # # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}",
+ # r"\\includegraphics.*?\.pdf\}",
+ # "",
+ # cont,
+ # )
+
+ # remove all images
+ cont = re.sub(
+ r"\\includegraphics\[.*?\]\{.*?\}",
+ "",
+ cont,
+ flags=re.DOTALL,
+ )
+
+ # remove empty envs
+ cont = re.sub(
+ r"\\begin\{([^\}]*)\}\s*\\end\{\1}",
+ "",
+ cont,
+ flags=re.DOTALL,
+ )
+
+ # remove end stuff
+ cont = re.sub(
+ r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}",
+ r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}",
+ cont,
+ flags=re.DOTALL,
+ count=1,
+ )
+
+ with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
+ fh_out.write(cont)
diff --git a/scripts/ebook/4.py b/scripts/ebook/step_4.py
similarity index 63%
rename from scripts/ebook/4.py
rename to scripts/ebook/step_4.py
index b1f35d2e1..e4c9ccd50 100755
--- a/scripts/ebook/4.py
+++ b/scripts/ebook/step_4.py
@@ -14,8 +14,6 @@
source_file = Path("tmp/hpmor-epub-3-flatten-mod.tex")
target_file = Path("tmp/hpmor-epub-4-flatten-parsel.tex")
-print("=== 4. parselify flattened file in python ===")
-
def convert_parsel(s: str) -> str:
"""Convert text to Parsel."""
@@ -34,16 +32,18 @@ def convert_parsel(s: str) -> str:
return s
-with source_file.open(encoding="utf-8", newline="\n") as fh_in:
- cont = fh_in.read()
+if __name__ == "__main__":
+ print("=== 4. parselify flattened file in python ===")
+ with source_file.open(encoding="utf-8", newline="\n") as fh_in:
+ cont = fh_in.read()
-# \parsel
-my_matches = re.finditer(r"(\\parsel\{([^\}\\]+)\})", cont)
-for my_match in my_matches:
- was = my_match.group(1)
- womit = convert_parsel(my_match.group(2))
- cont = cont.replace(was, "\\parsel{" + womit + "}")
+ # \parsel
+ my_matches = re.finditer(r"(\\parsel\{([^\}\\]+)\})", cont)
+ for my_match in my_matches:
+ was = my_match.group(1)
+ womit = convert_parsel(my_match.group(2))
+ cont = cont.replace(was, "\\parsel{" + womit + "}")
-with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
- fh_out.write(cont)
+ with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
+ fh_out.write(cont)
diff --git a/scripts/ebook/step_4_test.py b/scripts/ebook/step_4_test.py
new file mode 100644
index 000000000..c2b027112
--- /dev/null
+++ b/scripts/ebook/step_4_test.py
@@ -0,0 +1,23 @@
+"""Unit Tests.""" # noqa: INP001
+# ruff: noqa: S101
+
+from step_4 import convert_parsel
+
+assert convert_parsel("foo") == "foo"
+# s
+assert convert_parsel("house") == "housse"
+assert convert_parsel("Special") == "Sspecial"
+# ss and ß
+assert convert_parsel("Professor") == "Professsor"
+assert convert_parsel("muß") == "musss"
+# z
+assert convert_parsel("zero") == "zzero"
+assert convert_parsel("Zero") == "Zzero"
+# zz
+assert convert_parsel("puzzled") == "puzzzled"
+# x -> xs
+assert convert_parsel("Bellatrix") == "Bellatrixs"
+
+# combined
+assert convert_parsel("expression") == "exspresssion"
+assert convert_parsel("Salazar") == "Ssalazzar"
diff --git a/scripts/ebook/5.sh b/scripts/ebook/step_5.sh
similarity index 100%
rename from scripts/ebook/5.sh
rename to scripts/ebook/step_5.sh
diff --git a/scripts/ebook/step_6.py b/scripts/ebook/step_6.py
new file mode 100755
index 000000000..6f1edcae0
--- /dev/null
+++ b/scripts/ebook/step_6.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# by Torben Menke https://entorb.net
+# ruff: noqa: RUF001
+
+"""
+HTML modifications.
+"""
+
+import os
+import re
+from pathlib import Path
+
+os.chdir(Path(__file__).parent.parent.parent)
+
+source_file = Path("tmp/hpmor-epub-5-html-unmod.html")
+target_file = Path("hpmor.html")
+
+
+if __name__ == "__main__":
+ print("=== 6. HTML modifications ===")
+
+ with source_file.open(encoding="utf-8", newline="\n") as fh_in:
+ cont = fh_in.read()
+
+ # remove strange leftovers from tex -> html conversion
+ cont = re.sub(
+ r"().*? Book : Fanfiction based on the characters of J. K. ROWLING and her books: Fanfiction based on the characters of J. K. Rowling and her books: Year at Hogwarts
\nHarry Potter and the",
- cont,
- count=7,
-)
-
-# now done via pandoc -V lang=en in 5.sh
-# # set language
-# cont = re.sub(
-# r'(]*) lang="" xml:lang=""',
-# r'\1 lang="en" xml:lang="en"',
-# cont,
-# count=1,
-# )
-
-# remove training slashes to satisfy https://validator.w3.org
-cont = cont.replace("
", "
")
-cont = cont.replace("
", "
")
-
-cont = re.sub(
- r"(]*) />",
- r"\1>",
- cont,
-)
-
-# remove bad span ids (containing spaces) from newspaper spans
-cont = re.sub(r'', r"", cont, count=5)
-
-# doc structure (not needed any more, using calibi --level1-toc flag instead)
-# sed -i 's/" in cont:
- part_no += 1
- cont = cont.replace("
", f"
")
-
-# add chapter numbers
-chapter_no = 0
-while "
" in cont:
- chapter_no += 1
- cont = cont.replace("
", f"
")
-
-# fix double rules
-# cont = cont.replace("
\n
", "
")
-cont = re.sub(
- r"
\n
",
- r"
",
- cont,
- flags=re.DOTALL | re.IGNORECASE,
-)
-# fixing linebreak at author's comment
-cont = cont.replace("
Harry Potter and the",
+ " E. Y.: ", " E.Y.: ")
+
+ # converting "color-marked" styles of step_1.sh back to proper style classes
+ cont = re.sub(
+ r'<(div|span) style="color: (parsel|writtenNote|McGonagallWhiteBoard|headline)"', # noqa: E501
+ r'<\1 class="\2"',
+ cont,
+ )
+
+ # add css style file format for \emph in \emph
+ with Path("scripts/ebook/html.css").open(encoding="utf-8", newline="\n") as fh_in:
+ css = fh_in.read()
+ cont = cont.replace("\n", css + "\n\n")
+
+ with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
+ fh_out.write(cont)
diff --git a/scripts/ebook/7.sh b/scripts/ebook/step_7.sh
similarity index 100%
rename from scripts/ebook/7.sh
rename to scripts/ebook/step_7.sh
diff --git a/scripts/make_ebooks.sh b/scripts/make_ebooks.sh
index a4e7d7572..eee9eb24e 100755
--- a/scripts/make_ebooks.sh
+++ b/scripts/make_ebooks.sh
@@ -7,10 +7,10 @@ cd $script_dir/..
# TODO:
# image on last page
-sh scripts/ebook/1.sh
-sh scripts/ebook/2.sh
-python3 scripts/ebook/3.py
-python3 scripts/ebook/4.py
-sh scripts/ebook/5.sh
-python3 scripts/ebook/6.py
-sh scripts/ebook/7.sh
+sh scripts/ebook/step_1.sh
+sh scripts/ebook/step_2.sh
+python3 scripts/ebook/step_3.py
+python3 scripts/ebook/step_4.py
+sh scripts/ebook/step_5.sh
+python3 scripts/ebook/step_6.py
+sh scripts/ebook/step_7.sh
\nHarry Potter and the",
+ cont,
+ count=7,
+ )
+
+ # now done via pandoc -V lang=en in step_5.sh
+ # # set language
+ # cont = re.sub(
+ # r'(]*) lang="" xml:lang=""',
+ # r'\1 lang="en" xml:lang="en"',
+ # cont,
+ # count=1,
+ # )
+
+ # remove trailing slashes to satisfy https://validator.w3.org
+ cont = cont.replace("
", "
")
+ cont = cont.replace("
", "
")
+
+ cont = re.sub(
+ r"(]*) />",
+ r"\1>",
+ cont,
+ )
+
+ # remove bad span ids (containing spaces) from newspaper spans
+ cont = re.sub(r'', r"", cont, count=5)
+
+ # doc structure (not needed any more, using calibre --level1-toc flag instead)
+ # sed -i 's/" in cont:
+ part_no += 1
+ cont = cont.replace("
", f"
")
+
+ # add chapter numbers
+ chapter_no = 0
+ while "
" in cont:
+ chapter_no += 1
+ cont = cont.replace("
", f"
")
+
+ # fix double rules
+ # cont = cont.replace("
\n
", "
")
+ cont = re.sub(
+ r"
\n
",
+ r"
",
+ cont,
+ flags=re.DOTALL | re.IGNORECASE,
+ )
+ # fixing linebreak at author's comment
+ cont = cont.replace("