mdbook-pdf v0.1.7: Fix several bugs in table of contents generation

Signed-off-by: Hollow Man <[email protected]>
HollowMan6 · Jun 11, 2023 · 9199582 · 9199582
1 parent 9574a5a
commit 9199582
Show file tree

Hide file tree

Showing 4 changed files with 180 additions and 78 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,7 +9,7 @@ license = "GPL-3.0"
 name = "mdbook-pdf"
 readme = "README.md"
 repository = "https://github.com/HollowMan6/mdbook-pdf"
-version = "0.1.6"
+version = "0.1.7"
 include = [
     "**/*.rs",
     "Cargo.toml",

diff --git a/mdbook_pdf_outline/mdbook_pdf_outline.py b/mdbook_pdf_outline/mdbook_pdf_outline.py
@@ -1,16 +1,25 @@
 #!/usr/bin/env python3
 # vim: set fileencoding=utf-8 :
 # vim: set et ts=4 sw=4:
-'''
+"""
   mdbook-pdf-outline
+  An outline (Table of Contents) generator for mdBook-pdf.
+
   Author:  Hollow Man <[email protected]>
+  License: GPL-3.0
 
-  Copyright © 2022 Hollow Man(@HollowMan6). All rights reserved.
+  Copyright © 2022-2023 Hollow Man (@HollowMan6). All rights reserved.
 
   This document is free software; you can redistribute it and/or modify it under the terms of the GNU General
   Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option)
   any later version.
-'''
+
+  This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
+  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License along with this program.
+  If not, see <http://www.gnu.org/licenses/>.
+"""
 
 
 from pypdf import PdfReader, PdfWriter
@@ -19,15 +28,47 @@
 import lxml.html
 import re
 import json
+import sys
+
+
+buffer = []
 
 
-def update_parent_dict(parent_dict, level, node):
+def get_parent(level, parent_dict):
+    if level > 1:
+        temp = parent_dict
+        for _ in range(1, level - 1):
+            if temp["child"] and temp["child"]["node"]:
+                temp = temp["child"]
+        return temp["node"]
+    return None
+
+
+def update_parent_dict(
+    parent_dict, level, writer, text, page, fit=None, handle_buffer=False
+):
     temp = parent_dict
     for _ in range(1, level):
         if not temp["child"]:
             temp["child"] = {"node": None, "child": {}}
         temp = temp["child"]
-    temp["node"] = node
+
+    if not handle_buffer:
+        if page is None:
+            buffer.append((level, text))
+            return
+        else:
+            # Flush buffer so that ToC items without page destinations are
+            # added to the outline with the next page destination
+            for item in buffer:
+                update_parent_dict(
+                    parent_dict, item[0], writer, item[1], page, fit, True
+                )
+            buffer.clear()
+
+    temp["node"] = writer.add_outline_item(
+        text, page, get_parent(level, parent_dict), fit=fit
+    )
     temp["child"] = {}
 
 
@@ -46,24 +87,20 @@ def add_wkhtmltopdf_like_outline(html_page, reader, writer):
             if not results.tag[1:].isdigit():
                 continue
             level = int(results.tag[1:])
-            dest = reader.named_destinations["/{}".format(
-                urllib.parse.quote(id))]
-            parent = None
-            if level > 1:
-                temp = parent_dict
-                for _ in range(1, level - 1):
-                    if temp["child"] and temp["child"]["node"]:
-                        temp = temp["child"]
-                parent = temp["node"]
-
-            if dest.get('/Type') == '/Fit':
-                update_parent_dict(parent_dict, level, writer.add_outline_item(
-                    results.text_content(), None, parent))
-                continue
-            update_parent_dict(parent_dict, level, writer.add_outline_item(
-                results.text_content(), reader.get_destination_page_number(
-                    dest), parent, fit=Fit(
-                        dest.get('/Type'), (dest.get('/Left'), dest.get('/Top'), dest.get('/Zoom')))))
+            dest = reader.named_destinations["/{}".format(urllib.parse.quote(id))]
+
+            page = None
+            fit = None
+            if dest.get("/Type") != "/Fit":
+                page = reader.get_destination_page_number(dest)
+                fit = Fit(
+                    dest.get("/Type"),
+                    (dest.get("/Left"), dest.get("/Top"), dest.get("/Zoom")),
+                )
+
+            update_parent_dict(
+                parent_dict, level, writer, results.text_content(), page, fit
+            )
 
 
 def parse_toc(toc, reader, writer, parent_dict, level=1):
@@ -75,7 +112,11 @@ def parse_toc(toc, reader, writer, parent_dict, level=1):
             dest_name = ""
             target_element = None
             for element in head.iter():
-                if element.tag == "a" or element.tag == "div":
+                if (
+                    element.tag == "a"
+                    or element.tag == "div"
+                    or element.find_class("part-title")
+                ):
                     target_element = element
                     break
             to_remove = head.find_class("toggle")
@@ -84,39 +125,34 @@ def parse_toc(toc, reader, writer, parent_dict, level=1):
             if target_element is None:
                 continue
             dest = None
-            parent = None
-            if "href" in element.attrib:
+            if "href" in target_element.attrib:
                 for content in target_element.attrib["href"].split("#"):
-                    dest_name += content.rstrip(".html").replace("/",
-                                                                 "-") + "-"
+                    dest_name += content.removesuffix(".html").replace("/", "-") + "-"
                 dest_name = dest_name.rstrip("-")
                 dest_name = "/{}".format(urllib.parse.quote(dest_name.lower()))
-                dest_name = dest_name.replace(".", "")
+
                 if dest_name in reader.named_destinations:
                     dest = reader.named_destinations[dest_name]
                 else:
+                    print("Dest not found: {}".format(dest_name))
                     for d in reader.named_destinations.items():
                         if d[0].startswith(dest_name):
                             dest = d[1]
                             break
-                if not dest:
-                    continue
-                if level > 1:
-                    temp = parent_dict
-                    for _ in range(1, level - 1):
-                        if temp["child"] and temp["child"]["node"]:
-                            temp = temp["child"]
-                    parent = temp["node"]
-
-                if dest.get('/Type') == '/Fit':
-                    update_parent_dict(parent_dict, level, writer.add_outline_item(
-                        head.text_content(), None, parent))
-                    continue
+
             page = None
+            fit = None
             if dest:
-                page = reader.get_destination_page_number(dest)
-            update_parent_dict(parent_dict, level, writer.add_outline_item(
-                head.text_content(), page, parent))
+                if dest.get("/Type") != "/Fit":
+                    page = reader.get_destination_page_number(dest)
+                    fit = Fit(
+                        dest.get("/Type"),
+                        (dest.get("/Left"), dest.get("/Top"), dest.get("/Zoom")),
+                    )
+
+            update_parent_dict(
+                parent_dict, level, writer, head.text_content(), page, fit
+            )
 
 
 def add_toc_outline(html_page, reader, writer):
@@ -132,7 +168,10 @@ def add_toc_outline(html_page, reader, writer):
 
 
 def main():
-    conf = json.loads(input())["config"]["output"]["pdf-outline"]
+    sys.stdin.reconfigure(encoding="utf8")
+    context = json.loads(sys.stdin.read())
+
+    conf = context["config"]["output"]["pdf-outline"]
 
     reader = PdfReader("../pdf/output.pdf")
 
@@ -144,6 +183,23 @@ def main():
     else:
         add_toc_outline("../html/print.html", reader, writer)
 
+    meta = context["config"]["book"]
+    try:
+        writer.add_metadata(
+            {
+                "/DisplayDocTitle": True,
+                "/Title": meta.get("title") or "",
+                "/Author": ", ".join(meta["authors"]),
+                "/Subject": meta.get("description") or "",
+                "/CreationDate": reader.metadata["/CreationDate"],
+                "/ModDate": reader.metadata["/ModDate"],
+                "/Creator": "mdBook-pdf",
+                "/Lang": meta.get("language") or "",
+            }
+        )
+    except Exception:
+        pass
+
     with open("output.pdf", "wb") as f:
         writer.write(f)
 

diff --git a/setup.py b/setup.py
@@ -1,44 +1,58 @@
 #!/usr/bin/env python3
 # vim: set fileencoding=utf-8 :
 # vim: set et ts=4 sw=4:
-'''
+"""
   mdbook-pdf-outline
+  An outline (Table of Contents) generator for mdBook-pdf.
+
   Author:  Hollow Man <[email protected]>
+  License: GPL-3.0
 
-  Copyright © 2022 Hollow Man(@HollowMan6). All rights reserved.
+  Copyright © 2022-2023 Hollow Man (@HollowMan6). All rights reserved.
 
   This document is free software; you can redistribute it and/or modify it under the terms of the GNU General
   Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option)
   any later version.
-'''
+
+  This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
+  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License along with this program.
+  If not, see <http://www.gnu.org/licenses/>.
+"""
 
 from setuptools import setup
 
 # read the contents of README file
 from os import path
+
 this_directory = path.abspath(path.dirname(__file__))
-with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
-setup(name='mdbook-pdf-outline',
-      version='0.1.3',
-      description='Tool for generating outlines for PDF files generated by mdbook-pdf.',
-      url='https://github.com/HollowMan6/mdbook-pdf',
-      author='Hollow Man (Domain Address)',
-      author_email='[email protected]',
-      license='GPL-3.0-or-later',
-      install_requires=['lxml', 'pypdf'],
-      packages=['mdbook_pdf_outline'],
-      entry_points={'console_scripts': [
-          'mdbook-pdf-outline=mdbook_pdf_outline.mdbook_pdf_outline:main']},
-      long_description=long_description,
-      project_urls={
-          "Bug Tracker": "https://github.com/HollowMan6/mdbook-pdf/issues",
-      },
-      classifiers=[
-          "Programming Language :: Python :: 3",
-          "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
-          "Topic :: Text Processing :: Markup :: Markdown",
-      ],
-      long_description_content_type='text/markdown'
-      )
+setup(
+    name="mdbook-pdf-outline",
+    version="0.1.4",
+    description="Tool for generating outlines for PDF files generated by mdbook-pdf.",
+    url="https://github.com/HollowMan6/mdbook-pdf",
+    author="Hollow Man (Domain Address)",
+    author_email="[email protected]",
+    license="GPL-3.0-or-later",
+    install_requires=["lxml", "pypdf"],
+    packages=["mdbook_pdf_outline"],
+    entry_points={
+        "console_scripts": [
+            "mdbook-pdf-outline=mdbook_pdf_outline.mdbook_pdf_outline:main"
+        ]
+    },
+    long_description=long_description,
+    project_urls={
+        "Bug Tracker": "https://github.com/HollowMan6/mdbook-pdf/issues",
+    },
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+        "Topic :: Text Processing :: Markup :: Markdown",
+    ],
+    long_description_content_type="text/markdown",
+)
diff --git a/src/main.rs b/src/main.rs
@@ -1,6 +1,11 @@
 /**
  * mdbook-pdf
- * Copyright (C) 2022-2023 Hollow Man
+ * A PDF generator for mdBook using headless Chrome.
+ * 
+ * Author:  Hollow Man <[email protected]>
+ * License: GPL-3.0
+ * 
+ * Copyright (C) 2022-2023 Hollow Man (@HollowMan6)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,9 +64,31 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let mut buf_reader = BufReader::new(file);
     let mut contents = String::new();
     buf_reader.read_to_string(&mut contents)?;
-    contents = contents.replacen(
-        "</script>",
-        "</script>
+
+    // Insert a hidden link to each chapter's page div into print.html so that the generated
+    // PDF contains a destination the ToC can use to locate that chapter's page.
+    let mut toc_fix = "<div style=\"display: none\">".to_owned();
+    for item in ctx.book.iter() {
+        if let mdbook::book::BookItem::Chapter(chapter) = item {
+            let path = chapter.path.clone();
+            if let Some(path) = path {
+                let print_page_id = {
+                    let mut base = path.display().to_string();
+                    if base.ends_with(".md") {
+                        base.truncate(base.len() - 3);
+                    }
+                    &base
+                        .replace("/", "-")
+                        .replace("\\", "-")
+                        .to_ascii_lowercase()
+                };
+                toc_fix.push_str(&(format!(r##"<a href="#{print_page_id}">{print_page_id}</a>"##)));
+            }
+        }
+    }
+    toc_fix.push_str("</div>");
+
+    let script = "</script>
 
         <!-- Custom JS scripts for mdbook-pdf PDF generation -->
         <script type='text/javascript'>
@@ -86,7 +113,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                     markAllContentHasLoadedForPrinting();
                 }
             });
-        </script>",
+        </script>
+    ".to_owned();
+
+    contents = contents.replacen(
+        "</script>",
+        &(script + &toc_fix),
         1,
     );
     if !cfg.static_site_url.is_empty() {