From a5533c25d7e1e4861f07e52dcd2f106138a0ed39 Mon Sep 17 00:00:00 2001 From: Torben <59419684+entorb@users.noreply.github.com> Date: Sat, 27 Apr 2024 02:18:01 +0200 Subject: [PATCH] ebook script renaming and unit tests --- scripts/ebook/3.py | 156 ------------------------------ scripts/ebook/6.py | 131 ------------------------- scripts/ebook/{1.sh => step_1.sh} | 0 scripts/ebook/{2.sh => step_2.sh} | 0 scripts/ebook/step_3.py | 155 +++++++++++++++++++++++++++++ scripts/ebook/{4.py => step_4.py} | 24 ++--- scripts/ebook/step_4_test.py | 23 +++++ scripts/ebook/{5.sh => step_5.sh} | 0 scripts/ebook/step_6.py | 131 +++++++++++++++++++++++++ scripts/ebook/{7.sh => step_7.sh} | 0 scripts/make_ebooks.sh | 14 +-- 11 files changed, 328 insertions(+), 306 deletions(-) delete mode 100755 scripts/ebook/3.py delete mode 100755 scripts/ebook/6.py rename scripts/ebook/{1.sh => step_1.sh} (100%) rename scripts/ebook/{2.sh => step_2.sh} (100%) create mode 100755 scripts/ebook/step_3.py rename scripts/ebook/{4.py => step_4.py} (63%) create mode 100644 scripts/ebook/step_4_test.py rename scripts/ebook/{5.sh => step_5.sh} (100%) create mode 100755 scripts/ebook/step_6.py rename scripts/ebook/{7.sh => step_7.sh} (100%) diff --git a/scripts/ebook/3.py b/scripts/ebook/3.py deleted file mode 100755 index 861d2c82c..000000000 --- a/scripts/ebook/3.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# by Torben Menke https://entorb.net - -""" -Modify flattened .tex file. -""" - -import datetime as dt -import os -import re -from pathlib import Path - -os.chdir(Path(__file__).parent.parent.parent) - -source_file = Path("tmp/hpmor-epub-2-flatten.tex") -target_file = Path("tmp/hpmor-epub-3-flatten-mod.tex") - -print("=== 3. 
modify flattened file ===") - - -with source_file.open(encoding="utf-8", newline="\n") as fh_in: - cont = fh_in.read() - -# \today -date_str = dt.datetime.now(dt.timezone.utc).date().strftime("%d.%m.%Y") -cont = cont.replace("\\today{}", date_str) - -# writtenNote env -> \writtenNoteA -cont = re.sub( - r"\s*\\begin\{writtenNote\}\s*(.*?)\s*\\end\{writtenNote\}", - r"\\writtenNoteA{\1}", - cont, - flags=re.DOTALL, -) - -# fix chapterOpeningAuthorNote -cont = re.sub( - r"(\\begin\{chapterOpeningAuthorNote\}\n)(.*?\n)(\\end\{chapterOpeningAuthorNote\}\n)", - r"\1E.~Y.:~\2\\newline\\rule[1ex]{\\textwidth}{.1pt}\\newline%\n\3", - cont, - flags=re.DOTALL, -) - -# some cleanup -# TODO: removed when switching to Ubuntu >= 23.04, -# since it let to a problem -# in line 31 of tmp/hpmor-epub-3-flatten-mod.tex -# cont = cont.replace("\\hplettrineextrapara", "") - -# additional linebreaks in verses of chapter 64 -cont = cont.replace("\\\\\n\n", "\n\n") - -# manual pagebreaks -cont = re.sub(r"\\clearpage(\{\}|)\n?", "", cont) - -# \vskip 1\baselineskip plus .5\textheight minus 1\baselineskip -cont = re.sub(r"\\vskip .*?\\baselineskip", "", cont) - -# remove \settowidth{\versewidth}... 
\begin{verse}[\versewidth] -cont = re.sub( - r"\n[^\n]*?\\settowidth\{\\versewidth\}[^\n]*?\n(\\begin\{verse\}\[\\versewidth\])", - r"\n\\begin{verse}", - cont, -) - -# remove \settowidth -cont = re.sub( - r"\\settowidth\{[^\}]*\}\{([^\}]*)\}", - r"\1", - cont, - flags=re.DOTALL, -) - -# fix „ at start of chapter -# \lettrine[ante=„] -> „\lettrine -# \lettrinepara[ante=„] -> „\lettrine -cont = re.sub( - r"\\(lettrine|lettrinepara)\[ante=(.)\]", - r"\2\\lettrine", - cont, -) - -# OMakeIV sections -# \OmakeIVsection{My Little Pony: Friendship is Science} -cont = re.sub(r"\\OmakeIVsection(\[[^\]]*\]|)\{(.*)\}\n+", r"\\section{\2}\n", cont) - -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{RingBearer\}.*?\n\n", - r"\\section{Lord of the Rationality}\n", - cont, - flags=re.DOTALL, - count=1, -) -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n", - r"\\section{The Witch and the Wardrobe}\n", - cont, - flags=re.DOTALL, - count=1, -) -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n", - r"\\section{ThunderSmarts}\n", - cont, - flags=re.DOTALL, - count=1, -) - -cont = re.sub( - r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n", - r"\\section{Utilitarian Twilight}\n", - cont, - flags=re.DOTALL, - count=1, -) - -# \censor -cont = re.sub(r"\\censor\{.*?\}", r"xxxxxx", cont) - - -# # remove Deathly_Hallows_Sign.pdf and other pdf images -# # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf} -# cont = re.sub( -# # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}", -# r"\\includegraphics.*?\.pdf\}", -# "", -# cont, -# ) - -# remove all images -cont = re.sub( - r"\\includegraphics\[.*?\]\{.*?\}", - "", - cont, - flags=re.DOTALL, -) - -# remove empty envs -cont = re.sub( - r"\\begin\{([^\}]*)\}\s*\\end\{\1}", - "", - cont, - flags=re.DOTALL, -) - -# remove end stuff -cont = re.sub( - r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}", - r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}", - cont, - 
flags=re.DOTALL, - count=1, -) - -with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: - fh_out.write(cont) diff --git a/scripts/ebook/6.py b/scripts/ebook/6.py deleted file mode 100755 index 6c03d53b5..000000000 --- a/scripts/ebook/6.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# by Torben Menke https://entorb.net -# ruff: noqa: RUF001 - -""" -HTML modifications. -""" - -import os -import re -from pathlib import Path - -os.chdir(Path(__file__).parent.parent.parent) - -source_file = Path("tmp/hpmor-epub-5-html-unmod.html") -target_file = Path("hpmor.html") - -print("=== 6. HTML modifications ===") - - -with source_file.open(encoding="utf-8", newline="\n") as fh_in: - cont = fh_in.read() - -# remove strange leftovers from tex -> html conversion -cont = re.sub( - r"().*?

Book :

\n", - r"\1", - cont, - flags=re.DOTALL | re.IGNORECASE, - count=1, -) - -# cleanup hp-intro leftovers -cont = re.sub( - """

Fanfiction based on the characters of

-

J. K. ROWLING

-

and her books:

""", - "

Fanfiction based on the characters of J. K. Rowling and her books:

", - cont, - count=1, -) - -cont = re.sub("

Year at Hogwarts

\n", "", cont, count=7) -cont = re.sub( - "

\n

Harry Potter and the", - "
\nHarry Potter and the", - cont, - count=7, -) - -# now done via pandoc -V lang=en in 5.sh -# # set language -# cont = re.sub( -# r'(]*) lang="" xml:lang=""', -# r'\1 lang="en" xml:lang="en"', -# cont, -# count=1, -# ) - -# remove training slashes to satisfy https://validator.w3.org -cont = cont.replace("
", "
") -cont = cont.replace("


", "
") - -cont = re.sub( - r"(]*) />", - r"\1>", - cont, -) - -# remove bad span ids (containing spaces) from newspaper spans -cont = re.sub(r'', r"", cont, count=5) - -# doc structure (not needed any more, using calibi --level1-toc flag instead) -# sed -i 's/

" in cont: - part_no += 1 - cont = cont.replace("

", f"{part_no}. ", 1) -cont = cont.replace("", "

") - -# add chapter numbers -chapter_no = 0 -while "

" in cont: - chapter_no += 1 - cont = cont.replace("

", f"{chapter_no}. ", 1) -cont = cont.replace("", "

") - -# fix double rules -# cont = cont.replace("
\n
", "
") -cont = re.sub( - r"
\n
", - r"
", - cont, - flags=re.DOTALL | re.IGNORECASE, -) -# fixing linebreak at author's comment -cont = cont.replace("

E. Y.: 

\n

", "

E.Y.: ") - -# converting "color-marked" styles of 1.sh back to proper style classes -cont = re.sub( - r'<(div|span) style="color: (parsel|writtenNote|McGonagallWhiteBoard|headline)"', - r'<\1 class="\2"', - cont, -) - -# add css style file format for \emph in \emph -with Path("scripts/ebook/html.css").open(encoding="utf-8", newline="\n") as fh_in: - css = fh_in.read() -cont = cont.replace("\n", css + "\n\n") - - -with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: - fh_out.write(cont) diff --git a/scripts/ebook/1.sh b/scripts/ebook/step_1.sh similarity index 100% rename from scripts/ebook/1.sh rename to scripts/ebook/step_1.sh diff --git a/scripts/ebook/2.sh b/scripts/ebook/step_2.sh similarity index 100% rename from scripts/ebook/2.sh rename to scripts/ebook/step_2.sh diff --git a/scripts/ebook/step_3.py b/scripts/ebook/step_3.py new file mode 100755 index 000000000..21da654a1 --- /dev/null +++ b/scripts/ebook/step_3.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# by Torben Menke https://entorb.net + +""" +Modify flattened .tex file. +""" + +import datetime as dt +import os +import re +from pathlib import Path + +os.chdir(Path(__file__).parent.parent.parent) + +source_file = Path("tmp/hpmor-epub-2-flatten.tex") +target_file = Path("tmp/hpmor-epub-3-flatten-mod.tex") + +if __name__ == "__main__": + print("=== 3. 
modify flattened file ===") + + with source_file.open(encoding="utf-8", newline="\n") as fh_in: + cont = fh_in.read() + + # \today + date_str = dt.datetime.now(dt.timezone.utc).date().strftime("%d.%m.%Y") + cont = cont.replace("\\today{}", date_str) + + # writtenNote env -> \writtenNoteA + cont = re.sub( + r"\s*\\begin\{writtenNote\}\s*(.*?)\s*\\end\{writtenNote\}", + r"\\writtenNoteA{\1}", + cont, + flags=re.DOTALL, + ) + + # fix chapterOpeningAuthorNote + cont = re.sub( + r"(\\begin\{chapterOpeningAuthorNote\}\n)(.*?\n)(\\end\{chapterOpeningAuthorNote\}\n)", + r"\1E.~Y.:~\2\\newline\\rule[1ex]{\\textwidth}{.1pt}\\newline%\n\3", + cont, + flags=re.DOTALL, + ) + + # some cleanup + # TODO: removed when switching to Ubuntu >= 23.04, + # since it let to a problem + # in line 31 of tmp/hpmor-epub-3-flatten-mod.tex + # cont = cont.replace("\\hplettrineextrapara", "") + + # additional linebreaks in verses of chapter 64 + cont = cont.replace("\\\\\n\n", "\n\n") + + # manual pagebreaks + cont = re.sub(r"\\clearpage(\{\}|)\n?", "", cont) + + # \vskip 1\baselineskip plus .5\textheight minus 1\baselineskip + cont = re.sub(r"\\vskip .*?\\baselineskip", "", cont) + + # remove \settowidth{\versewidth}... 
\begin{verse}[\versewidth] + cont = re.sub( + r"\n[^\n]*?\\settowidth\{\\versewidth\}[^\n]*?\n(\\begin\{verse\}\[\\versewidth\])", + r"\n\\begin{verse}", + cont, + ) + + # remove \settowidth + cont = re.sub( + r"\\settowidth\{[^\}]*\}\{([^\}]*)\}", + r"\1", + cont, + flags=re.DOTALL, + ) + + # fix „ at start of chapter + # \lettrine[ante=„] -> „\lettrine + # \lettrinepara[ante=„] -> „\lettrine + cont = re.sub( + r"\\(lettrine|lettrinepara)\[ante=(.)\]", + r"\2\\lettrine", + cont, + ) + + # OMakeIV sections + # \OmakeIVsection{My Little Pony: Friendship is Science} + cont = re.sub(r"\\OmakeIVsection(\[[^\]]*\]|)\{(.*)\}\n+", r"\\section{\2}\n", cont) + + cont = re.sub( + r"\\OmakeIVspecialsection[^\n]+\{RingBearer\}.*?\n\n", + r"\\section{Lord of the Rationality}\n", + cont, + flags=re.DOTALL, + count=1, + ) + cont = re.sub( + r"\\OmakeIVspecialsection[^\n]+\{NarniaBLL\}.*?\n\n", + r"\\section{The Witch and the Wardrobe}\n", + cont, + flags=re.DOTALL, + count=1, + ) + cont = re.sub( + r"\\OmakeIVspecialsection[^\n]+\{Thundercats\}.*?\n\n", + r"\\section{ThunderSmarts}\n", + cont, + flags=re.DOTALL, + count=1, + ) + + cont = re.sub( + r"\\OmakeIVspecialsection[^\n]+\{Twilight\}.*?\n\n", + r"\\section{Utilitarian Twilight}\n", + cont, + flags=re.DOTALL, + count=1, + ) + + # \censor + cont = re.sub(r"\\censor\{.*?\}", r"xxxxxx", cont) + + # # remove Deathly_Hallows_Sign.pdf and other pdf images + # # \includegraphics[scale=0.125]{images/Deathly_Hallows_Sign.pdf} + # cont = re.sub( + # # r"\\includegraphics.*?\{images/Deathly_Hallows_Sign.*?\}", + # r"\\includegraphics.*?\.pdf\}", + # "", + # cont, + # ) + + # remove all images + cont = re.sub( + r"\\includegraphics\[.*?\]\{.*?\}", + "", + cont, + flags=re.DOTALL, + ) + + # remove empty envs + cont = re.sub( + r"\\begin\{([^\}]*)\}\s*\\end\{\1}", + "", + cont, + flags=re.DOTALL, + ) + + # remove end stuff + cont = re.sub( + r"(.*)\\end\{chapterOpeningAuthorNote\}.*?\\end\{document\}", + 
r"\1\\end{chapterOpeningAuthorNote}\n\\end{document}", + cont, + flags=re.DOTALL, + count=1, + ) + + with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: + fh_out.write(cont) diff --git a/scripts/ebook/4.py b/scripts/ebook/step_4.py similarity index 63% rename from scripts/ebook/4.py rename to scripts/ebook/step_4.py index b1f35d2e1..e4c9ccd50 100755 --- a/scripts/ebook/4.py +++ b/scripts/ebook/step_4.py @@ -14,8 +14,6 @@ source_file = Path("tmp/hpmor-epub-3-flatten-mod.tex") target_file = Path("tmp/hpmor-epub-4-flatten-parsel.tex") -print("=== 4. parselify flattened file in python ===") - def convert_parsel(s: str) -> str: """Convert text to Parsel.""" @@ -34,16 +32,18 @@ def convert_parsel(s: str) -> str: return s -with source_file.open(encoding="utf-8", newline="\n") as fh_in: - cont = fh_in.read() +if __name__ == "__main__": + print("=== 4. parselify flattened file in python ===") + with source_file.open(encoding="utf-8", newline="\n") as fh_in: + cont = fh_in.read() -# \parsel -my_matches = re.finditer(r"(\\parsel\{([^\}\\]+)\})", cont) -for my_match in my_matches: - was = my_match.group(1) - womit = convert_parsel(my_match.group(2)) - cont = cont.replace(was, "\\parsel{" + womit + "}") + # \parsel + my_matches = re.finditer(r"(\\parsel\{([^\}\\]+)\})", cont) + for my_match in my_matches: + was = my_match.group(1) + womit = convert_parsel(my_match.group(2)) + cont = cont.replace(was, "\\parsel{" + womit + "}") -with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: - fh_out.write(cont) + with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: + fh_out.write(cont) diff --git a/scripts/ebook/step_4_test.py b/scripts/ebook/step_4_test.py new file mode 100644 index 000000000..c2b027112 --- /dev/null +++ b/scripts/ebook/step_4_test.py @@ -0,0 +1,23 @@ +"""Unit Tests.""" # noqa: INP001 +# ruff: noqa: S101 + +from step_4 import convert_parsel + +assert convert_parsel("foo") == "foo" +# s +assert 
convert_parsel("house") == "housse" +assert convert_parsel("Special") == "Sspecial" +# ss and ß +assert convert_parsel("Professor") == "Professsor" +assert convert_parsel("muß") == "musss" +# z +assert convert_parsel("zero") == "zzero" +assert convert_parsel("Zero") == "Zzero" +# zz +assert convert_parsel("puzzled") == "puzzzled" +# x -> xs +assert convert_parsel("Bellatrix") == "Bellatrixs" + +# combined +assert convert_parsel("expression") == "exspresssion" +assert convert_parsel("Salazar") == "Ssalazzar" diff --git a/scripts/ebook/5.sh b/scripts/ebook/step_5.sh similarity index 100% rename from scripts/ebook/5.sh rename to scripts/ebook/step_5.sh diff --git a/scripts/ebook/step_6.py b/scripts/ebook/step_6.py new file mode 100755 index 000000000..6f1edcae0 --- /dev/null +++ b/scripts/ebook/step_6.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# by Torben Menke https://entorb.net +# ruff: noqa: RUF001 + +""" +HTML modifications. +""" + +import os +import re +from pathlib import Path + +os.chdir(Path(__file__).parent.parent.parent) + +source_file = Path("tmp/hpmor-epub-5-html-unmod.html") +target_file = Path("hpmor.html") + + +if __name__ == "__main__": + print("=== 6. HTML modifications ===") + + with source_file.open(encoding="utf-8", newline="\n") as fh_in: + cont = fh_in.read() + + # remove strange leftovers from tex -> html conversion + cont = re.sub( + r"().*?

Book :

\n", + r"\1", + cont, + flags=re.DOTALL | re.IGNORECASE, + count=1, + ) + + # cleanup hp-intro leftovers + cont = re.sub( + """

Fanfiction based on the characters of

+

J. K. ROWLING

+

and her books:

""", + "

Fanfiction based on the characters of J. K. Rowling and her books:

", + cont, + count=1, + ) + + cont = re.sub("

Year at Hogwarts

\n", "", cont, count=7) + cont = re.sub( + "

\n

Harry Potter and the", + "
\nHarry Potter and the", + cont, + count=7, + ) + + # now done via pandoc -V lang=en in step_5.sh + # # set language + # cont = re.sub( + # r'(]*) lang="" xml:lang=""', + # r'\1 lang="en" xml:lang="en"', + # cont, + # count=1, + # ) + + # remove trailing slashes to satisfy https://validator.w3.org + cont = cont.replace("
", "
") + cont = cont.replace("


", "
") + + cont = re.sub( + r"(]*) />", + r"\1>", + cont, + ) + + # remove bad span ids (containing spaces) from newspaper spans + cont = re.sub(r'', r"", cont, count=5) + + # doc structure (not needed any more, using calibi --level1-toc flag instead) + # sed -i 's/

" in cont: + part_no += 1 + cont = cont.replace("

", f"{part_no}. ", 1) + cont = cont.replace("", "

") + + # add chapter numbers + chapter_no = 0 + while "

" in cont: + chapter_no += 1 + cont = cont.replace("

", f"{chapter_no}. ", 1) + cont = cont.replace("", "

") + + # fix double rules + # cont = cont.replace("
\n
", "
") + cont = re.sub( + r"
\n
", + r"
", + cont, + flags=re.DOTALL | re.IGNORECASE, + ) + # fixing linebreak at author's comment + cont = cont.replace("

E. Y.: 

\n

", "

E.Y.: ") + + # converting "color-marked" styles of 1.sh back to proper style classes + cont = re.sub( + r'<(div|span) style="color: (parsel|writtenNote|McGonagallWhiteBoard|headline)"', # noqa: E501 + r'<\1 class="\2"', + cont, + ) + + # add css style file format for \emph in \emph + with Path("scripts/ebook/html.css").open(encoding="utf-8", newline="\n") as fh_in: + css = fh_in.read() + cont = cont.replace("\n", css + "\n\n") + + with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out: + fh_out.write(cont) diff --git a/scripts/ebook/7.sh b/scripts/ebook/step_7.sh similarity index 100% rename from scripts/ebook/7.sh rename to scripts/ebook/step_7.sh diff --git a/scripts/make_ebooks.sh b/scripts/make_ebooks.sh index a4e7d7572..eee9eb24e 100755 --- a/scripts/make_ebooks.sh +++ b/scripts/make_ebooks.sh @@ -7,10 +7,10 @@ cd $script_dir/.. # TODO: # image on last page -sh scripts/ebook/1.sh -sh scripts/ebook/2.sh -python3 scripts/ebook/3.py -python3 scripts/ebook/4.py -sh scripts/ebook/5.sh -python3 scripts/ebook/6.py -sh scripts/ebook/7.sh +sh scripts/ebook/step_1.sh +sh scripts/ebook/step_2.sh +python3 scripts/ebook/step_3.py +python3 scripts/ebook/step_4.py +sh scripts/ebook/step_5.sh +python3 scripts/ebook/step_6.py +sh scripts/ebook/step_7.sh