Refactor number of pages alias (fix #1090) (#1203)

* refactor number of pages alias
* clean commented code
* ignore empty fragment. update tests
* rename "alias" to "text substitution" add test
* change type to isinstance for pylint
* fix tests
* change disable text shaping text now that {nb} in shaping is fixed
* add docstrings and changelog
* formatting
py-pdf · Oct 30, 2024 · 9df0cae · 9df0cae
1 parent 3fba534
commit 9df0cae
Show file tree

Hide file tree

Showing 11 changed files with 244 additions and 74 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,8 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 ### Fixed
 * `FPDF.set_text_shaping(False)` was broken since version 2.7.8 and is now working properly - [issue #1287](https://github.com/py-pdf/fpdf2/issues/1287)
 * fixed bug where cells with `rowspan`, `colspan` > 1 and null text were not displayed properly - [issue #1293](https://github.com/py-pdf/fpdf2/issues/1293)
+### Changed
+* improved logic for handling text substitution of the total number of pages, ensuring compatibility with text shaping - [issue #1090](https://github.com/py-pdf/fpdf2/issues/1090)
 
 ## [2.8.1] - 2024-10-04
 ### Added

diff --git a/fpdf/fpdf.py b/fpdf/fpdf.py
@@ -17,7 +17,7 @@
 from numbers import Number
 from os.path import splitext
 from pathlib import Path
-from typing import Callable, Iterator, NamedTuple, Optional, Union
+from typing import Callable, Dict, Iterator, NamedTuple, Optional, Union
 
 try:
     from endesive import signer
@@ -101,7 +101,12 @@ class Image:
     preload_image,
 )
 from .linearization import LinearizedOutputProducer
-from .line_break import Fragment, MultiLineBreak, TextLine
+from .line_break import (
+    Fragment,
+    MultiLineBreak,
+    TextLine,
+    TotalPagesSubstitutionFragment,
+)
 from .outline import OutlineSection
 from .output import (
     OutputProducer,
@@ -250,7 +255,8 @@ def __init__(
         but is less compatible with the PDF spec.
         """
         self.page = 0  # current page number
-        self.pages = {}  # array of PDFPage objects starting at index 1
+        # dict mapping page numbers (starting at 1) to PDFPage objects:
+        self.pages: Dict[int, PDFPage] = {}
         self.fonts = {}  # map font string keys to an instance of CoreFont or TTFFont
         # map page numbers to a set of font indices:
         self.fonts_used_per_page_number = defaultdict(set)
@@ -3175,6 +3181,8 @@ def _render_styled_text_line(
                 f"{(self.h - self.y - 0.5 * h - 0.3 * max_font_size) * k:.2f} Td"
             )
             for i, frag in enumerate(fragments):
+                if isinstance(frag, TotalPagesSubstitutionFragment):
+                    self.pages[self.page].add_text_substitution(frag)
                 if frag.graphics_state["text_color"] != last_used_color:
                     # allow to change color within the line of text.
                     last_used_color = frag.graphics_state["text_color"]
@@ -3427,6 +3435,22 @@ def get_fallback_font(self, char, style=""):
     def _parse_chars(self, text: str, markdown: bool) -> Iterator[Fragment]:
         "Split text into fragments"
         if not markdown and not self.text_shaping and not self._fallback_font_ids:
+            if self.str_alias_nb_pages:
+                for seq, fragment_text in enumerate(
+                    text.split(self.str_alias_nb_pages)
+                ):
+                    if seq > 0:
+                        yield TotalPagesSubstitutionFragment(
+                            self.str_alias_nb_pages,
+                            self._get_current_graphics_state(),
+                            self.k,
+                        )
+                    if fragment_text:
+                        yield Fragment(
+                            fragment_text, self._get_current_graphics_state(), self.k
+                        )
+                return
+
             yield Fragment(text, self._get_current_graphics_state(), self.k)
             return
         txt_frag, in_bold, in_italics, in_underline = (
@@ -3486,6 +3510,23 @@ def frag():
                     yield frag()
                 current_text_script = text_script
 
+            if self.str_alias_nb_pages:
+                if text[: len(self.str_alias_nb_pages)] == self.str_alias_nb_pages:
+                    if txt_frag:
+                        yield frag()
+                    gstate = self._get_current_graphics_state()
+                    gstate["font_style"] = ("B" if in_bold else "") + (
+                        "I" if in_italics else ""
+                    )
+                    gstate["underline"] = in_underline
+                    yield TotalPagesSubstitutionFragment(
+                        self.str_alias_nb_pages,
+                        gstate,
+                        self.k,
+                    )
+                    text = text[len(self.str_alias_nb_pages) :]
+                    continue
+
             # Check that previous & next characters are not identical to the marker:
             if markdown:
                 if (
@@ -4675,26 +4716,6 @@ def sign(
         )
         self.pages[self.page].annots.append(annotation)
 
-    def _substitute_page_number(self):
-        substituted = False
-        # Replace number of pages in fonts using subsets (unicode)
-        alias = self.str_alias_nb_pages.encode("utf-16-be")
-        encoded_nb = str(self.pages_count).encode("utf-16-be")
-        for page in self.pages.values():
-            substituted |= alias in page.contents
-            page.contents = page.contents.replace(alias, encoded_nb)
-        # Now repeat for no pages in non-subset fonts
-        alias = self.str_alias_nb_pages.encode("latin-1")
-        encoded_nb = str(self.pages_count).encode("latin-1")
-        for page in self.pages.values():
-            substituted |= alias in page.contents
-            page.contents = page.contents.replace(alias, encoded_nb)
-        if substituted:
-            LOGGER.debug(
-                "Substitution of '%s' was performed in the document",
-                self.str_alias_nb_pages,
-            )
-
     def _insert_table_of_contents(self):
         # Doc has been closed but we want to write to self.pages[self.page] instead of self.buffer:
         tocp = self._toc_placeholder
@@ -5252,7 +5273,16 @@ def output(
             if self._toc_placeholder:
                 self._insert_table_of_contents()
             if self.str_alias_nb_pages:
-                self._substitute_page_number()
+                for page in self.pages.values():
+                    for substitution_item in page.get_text_substitutions():
+                        page.contents = page.contents.replace(
+                            substitution_item.get_placeholder_string().encode(
+                                "latin-1"
+                            ),
+                            substitution_item.render_text_substitution(
+                                str(self.pages_count)
+                            ).encode("latin-1"),
+                        )
             if linearize:
                 output_producer_class = LinearizedOutputProducer
             output_producer = output_producer_class(self)

diff --git a/fpdf/line_break.py b/fpdf/line_break.py
@@ -9,6 +9,7 @@
 
 from numbers import Number
 from typing import NamedTuple, Any, List, Optional, Union, Sequence
+from uuid import uuid4
 
 from .enums import Align, CharVPos, TextDirection, WrapMode
 from .errors import FPDFException
@@ -234,6 +235,14 @@ def get_width(
                 w += char_spacing * (char_len - 1)
         return w / self.k
 
+    def has_same_style(self, other: "Fragment"):
+        """Return True if `other` has the same graphics state, scale factor `k` and class, regardless of its characters/string"""
+        return (
+            self.graphics_state == other.graphics_state
+            and self.k == other.k
+            and isinstance(other, self.__class__)
+        )
+
     def get_character_width(self, character: str, print_sh=False, initial_cs=True):
         """
         Return the width of a single character out of the stored text.
@@ -350,6 +359,50 @@ def render_pdf_text_core(self, frag_ws, current_ws):
         return ret
 
 
+class TotalPagesSubstitutionFragment(Fragment):
+    """
+    A special type of text fragment that represents a placeholder for the total number of pages
+    in a PDF document.
+
+    A placeholder will be generated during the initial content rendering phase of a PDF document.
+    This placeholder is later replaced by the total number of pages in the document when the final
+    output is being produced.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.uuid = uuid4()
+
+    def get_placeholder_string(self):
+        """
+        This method returns a placeholder string containing a universally unique identifier (UUID4),
+        ensuring that the placeholder is distinct and does not conflict with other placeholders
+        within the document.
+        """
+        return f"::placeholder:{self.uuid}::"
+
+    def render_pdf_text(self, *args, **kwargs):
+        """
+        This method is invoked during the page content rendering phase, which is common to all
+        `Fragment` instances. It stores the provided arguments and keyword arguments to preserve
+        the necessary information and graphic state for the final substitution rendering.
+
+        The method then returns the unique placeholder string.
+        """
+        self._render_args = args
+        self._render_kwargs = kwargs
+        return self.get_placeholder_string()
+
+    def render_text_substitution(self, replacement_text: str):
+        """
+        This method is invoked at the output phase. It calls `render_pdf_text()` from the superclass
+        to render the fragment with the preserved rendering state (stored in `_render_args` and `_render_kwargs`)
+        and returns the resulting text, which the caller inserts in place of the placeholder.
+        """
+        self.characters = list(replacement_text)
+        return super().render_pdf_text(*self._render_args, **self._render_kwargs)
+
+
 class TextLine(NamedTuple):
     fragments: tuple
     text_width: float
@@ -445,8 +498,7 @@ def add_character(
         self,
         character: str,
         character_width: float,
-        graphics_state: dict,
-        k: float,
+        original_fragment: Fragment,
         original_fragment_index: int,
         original_character_index: int,
         height: float,
@@ -455,16 +507,29 @@ def add_character(
         assert character != NEWLINE
         self.height = height
         if not self.fragments:
-            self.fragments.append(Fragment("", graphics_state, k, url))
+            self.fragments.append(
+                original_fragment.__class__(
+                    characters="",
+                    graphics_state=original_fragment.graphics_state,
+                    k=original_fragment.k,
+                    link=url,
+                )
+            )
 
         # characters are expected to be grouped into fragments by font and
         # character attributes. If the last existing fragment doesn't match
         # the properties of the pending character -> add a new fragment.
-        elif (
-            graphics_state != self.fragments[-1].graphics_state
-            or k != self.fragments[-1].k
-        ):
-            self.fragments.append(Fragment("", graphics_state, k, url))
+        elif isinstance(
+            original_fragment, Fragment
+        ) and not original_fragment.has_same_style(self.fragments[-1]):
+            self.fragments.append(
+                original_fragment.__class__(
+                    characters="",
+                    graphics_state=original_fragment.graphics_state,
+                    k=original_fragment.k,
+                    link=url,
+                )
+            )
         active_fragment = self.fragments[-1]
 
         if character in BREAKING_SPACE_SYMBOLS_STR:
@@ -491,8 +556,8 @@ def add_character(
                 self.number_of_spaces,
                 HYPHEN,
                 character_width,
-                graphics_state,
-                k,
+                original_fragment.graphics_state,
+                original_fragment.k,
             )
 
         if character != SOFT_HYPHEN or self.print_sh:
@@ -550,8 +615,7 @@ def automatic_break(self, align: Align):
             self.add_character(
                 self.hyphen_break_hint.curchar,
                 self.hyphen_break_hint.curchar_width,
-                self.hyphen_break_hint.graphics_state,
-                self.hyphen_break_hint.k,
+                self.hyphen_break_hint,
                 self.hyphen_break_hint.original_fragment_index,
                 self.hyphen_break_hint.original_character_index,
                 self.height,
@@ -716,8 +780,7 @@ def get_line(self):
             current_line.add_character(
                 character,
                 character_width,
-                current_fragment.graphics_state,
-                current_fragment.k,
+                current_fragment,
                 self.fragment_index,
                 self.character_index,
                 current_font_height * self.line_height,

diff --git a/fpdf/output.py b/fpdf/output.py
@@ -13,9 +13,11 @@
 from contextlib import contextmanager
 from io import BytesIO
 
+
 from .annotations import PDFAnnotation
 from .enums import SignatureFlag
 from .errors import FPDFException
+from .line_break import TotalPagesSubstitutionFragment
 from .image_datastructures import RasterImageInfo
 from .outline import build_outline_objs
 from .sign import Signature, sign_content
@@ -243,6 +245,7 @@ class PDFPage(PDFObject):
         "_index",
         "_width_pt",
         "_height_pt",
+        "_text_substitution_fragments",
     )
 
     def __init__(
@@ -265,6 +268,7 @@ def __init__(
         self.parent = None  # must always be set before calling .serialize()
         self._index = index
         self._width_pt, self._height_pt = None, None
+        self._text_substitution_fragments: list[TotalPagesSubstitutionFragment] = []
 
     def index(self):
         return self._index
@@ -277,6 +281,12 @@ def set_dimensions(self, width_pt, height_pt):
         "Accepts a pair (width, height) in the unit specified to FPDF constructor"
         self._width_pt, self._height_pt = width_pt, height_pt
 
+    def get_text_substitutions(self):
+        return self._text_substitution_fragments
+
+    def add_text_substitution(self, fragment):
+        self._text_substitution_fragments.append(fragment)
+
 
 class PDFPagesRoot(PDFObject):
     def __init__(self, count, media_box):

diff --git a/test/alias_nb_pages.pdf b/test/alias_nb_pages.pdf
diff --git a/test/alias_with_text_shaping.pdf b/test/alias_with_text_shaping.pdf
diff --git a/test/outline/toc_with_nb_and_footer.pdf b/test/outline/toc_with_nb_and_footer.pdf
diff --git a/test/test_alias.py b/test/test_alias.py
@@ -29,3 +29,24 @@ def test_custom_alias_nb_pages(tmp_path):
     pdf.add_page()
     pdf.cell(0, 10, f"Page {pdf.page_no()}/{alias}", align="C")
     assert_pdf_equal(pdf, HERE / "alias_nb_pages.pdf", tmp_path)
+
+
+def test_alias_with_shaping(tmp_path):
+    pdf = fpdf.FPDF()
+    pdf.add_font("Quicksand", style="", fname=HERE / "fonts" / "Quicksand-Regular.otf")
+    pdf.add_page()
+    pdf.set_font("Quicksand", "", 24)
+    pdf.set_text_shaping(True)
+    pdf.write(text="Pages {nb}")
+    pdf.ln()
+    pdf.cell(text="{nb}", new_x="left", new_y="next")
+    pdf.write_html("<h1>{nb}</h1>")
+    pdf.multi_cell(w=pdf.epw, text="Number of pages: {nb}\nAgain:{nb}")
+    pdf.add_page()
+    pdf.set_text_shaping(False)
+    pdf.write(text="Pages {nb}")
+    pdf.ln()
+    pdf.cell(text="{nb}", new_x="left", new_y="next")
+    pdf.write_html("<h1>{nb}</h1>")
+    pdf.multi_cell(w=pdf.epw, text="Number of pages: {nb}\nAgain:{nb}")
+    assert_pdf_equal(pdf, HERE / "alias_with_text_shaping.pdf", tmp_path)