ENH: add get_pages_from_field (#2494)

* DEV: add _get_page_number_from_indirect in writer create similar function to have same API as in reader used in future dev --------- Co-authored-by: Stefan <[email protected]>
py-pdf · Mar 2, 2024 · cb146e8 · cb146e8
1 parent f32a964
commit cb146e8
Show file tree

Hide file tree

Showing 4 changed files with 316 additions and 3 deletions.
diff --git a/docs/user/forms.md b/docs/user/forms.md
@@ -50,7 +50,7 @@ PDF forms have a dual-nature approach about the fields:
  Inside it you could find (optional):
 
  - some global elements (Fonts, Resources,...)
- - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
+ - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
  - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content
  - `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots)
 
@@ -99,3 +99,9 @@ However, it's also important to note that the two lists do not *always* refer to
 __Caution: Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead.__
 
 In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_.
+
+## Identify pages where fields are used
+
+ In order to ease locating page fields you can use the `get_pages_showing_field` method of `PdfReader` or `PdfWriter`. This method accepts a field object, i.e. a *PdfObject* that represents a field (as extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g. radio buttons or text displayed on multiple pages).
+
+The page numbers can then be retrieved as usual by using `page.page_number`.
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -667,6 +667,76 @@ def indexed_key(k: str, fields: Dict[Any, Any]) -> str:
  ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")
  return ff
 
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

        One entry is produced per widget, so the same page can appear
        several times in the list.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference``, or no "/FT" entry is found on the
            object or on any of its "/Parent" ancestors.
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object, then on its "/Parent" ancestors.
        # The walk is iterative and remembers the visited objects (by
        # identity) so that a malformed, cyclic "/Parent" chain ends the
        # lookup with None instead of recursing until RecursionError.
        visited: List[Any] = []
        node = obj
        while not any(node is seen for seen in visited):
            if key in node:
                return node[key]
            if "/Parent" not in node:
                return None
            visited.append(node)
            node = cast(DictionaryObject, node["/Parent"].get_object())
        return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Prefer the widget's own "/P" entry; when it is absent, scan every
        # page and keep those whose "/Annots" contains the widget reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field dictionary is itself the widget.
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)

    # Entries that are not PageObject instances are replaced by the matching
    # element of self.pages, looked up through their indirect reference.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]
+
  def _get_named_destinations(
  self,
  tree: Union[TreeObject, None] = None,
@@ -1813,7 +1883,9 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType:
  def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
  """Take the permissions as an integer, return the allowed access."""
  deprecate_with_replacement(
- old_name="decode_permissions", new_name="user_access_permissions", removed_in="5.0.0"
+ old_name="decode_permissions",
+ new_name="user_access_permissions",
+ removed_in="5.0.0",
  )
 
  permissions_mapping = {

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -96,6 +96,7 @@
  DecodedStreamObject,
  Destination,
  DictionaryObject,
+ Field,
  Fit,
  FloatObject,
  IndirectObject,
@@ -1003,6 +1004,76 @@ def reattach_fields(
  lst.append(ano)
  return lst
 
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).

        One entry is produced per widget, so the same page can appear
        several times in the list.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference``, or no "/FT" entry is found on the
            object or on any of its "/Parent" ancestors.
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object, then on its "/Parent" ancestors.
        # The walk is iterative and remembers the visited objects (by
        # identity) so that a malformed, cyclic "/Parent" chain ends the
        # lookup with None instead of recursing until RecursionError.
        visited: List[Any] = []
        node = obj
        while not any(node is seen for seen in visited):
            if key in node:
                return node[key]
            if "/Parent" not in node:
                return None
            visited.append(node)
            node = cast(DictionaryObject, node["/Parent"].get_object())
        return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Prefer the widget's own "/P" entry; when it is absent, scan every
        # page and keep those whose "/Annots" contains the widget reference.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", "")
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field dictionary is itself the widget.
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)

    # Entries that are not PageObject instances are replaced by the matching
    # element of self.pages, looked up through their indirect reference.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]
+
  def clone_reader_document_root(self, reader: PdfReader) -> None:
  """
  Copy the reader document root to the writer and all sub elements,

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -17,7 +17,14 @@
 from pypdf import PdfMerger, PdfReader, PdfWriter
 from pypdf.constants import PageAttributes as PG
 from pypdf.errors import PdfReadError, PdfReadWarning
-from pypdf.generic import ContentStream, NameObject, read_object
+from pypdf.generic import (
+ ArrayObject,
+ ContentStream,
+ DictionaryObject,
+ NameObject,
+ TextStringObject,
+ read_object,
+)
 
 from . import get_data_from_url, normalize_warnings
 
@@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode():
  reader = PdfReader(pdf_path)
  with pytest.raises(ValueError, match="Invalid text extraction mode"):
  reader.pages[0].extract_text(extraction_mode="foo") # type: ignore
+
+
@pytest.mark.enable_socket()
def test_get_page_showing_field():
    """
    Uses testfile from #2452 in order to get fields on multiple pages,
    choices boxes,...

    The same checks are run against a PdfReader and against a PdfWriter
    cloned from it.
    """
    url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf"
    name = "iss2452.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name)))
    writer = PdfWriter(clone_from=reader)
    reader_fields = reader.trailer["/Root"]["/AcroForm"]["/Fields"]
    writer_fields = writer._root_object["/AcroForm"]["/Fields"]

    def page_numbers(doc, field):
        # Page numbers of the pages reported for ``field`` by ``doc``.
        return [p.page_number for p in doc.get_pages_showing_field(field)]

    # validate with Field: only works on Reader (no get_fields on writer yet)
    fld = reader.get_fields()
    assert page_numbers(reader, fld["FormVersion"]) == [0]

    # validate with dictionary object
    # NRCategory field is a radio box
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # validate with IndirectObject
    # SiteID field is a textbox on multiple pages
    assert page_numbers(reader, reader_fields[99]) == [0, 1]
    assert page_numbers(writer, writer_fields[99]) == [0, 1]
    # test directly on the widget:
    assert page_numbers(reader, reader_fields[99]["/Kids"][1]) == [1]
    assert page_numbers(writer, writer_fields[99]["/Kids"][1]) == [1]

    # Exceptions:
    # Invalid Object
    # The message is checked for the reader and for the writer separately.
    with pytest.raises(ValueError, match="field type is invalid"):
        reader.get_pages_showing_field(None)
    with pytest.raises(ValueError, match="field type is invalid"):
        writer.get_pages_showing_field(None)

    # Damage Field
    del reader_fields[1].get_object()["/FT"]
    del writer_fields[1].get_object()["/FT"]
    with pytest.raises(ValueError, match="field is not valid"):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError, match="field is not valid"):
        writer.get_pages_showing_field(writer_fields[1])

    # missing Parent in field
    del reader_fields[99]["/Kids"][1].get_object()["/Parent"]
    del writer_fields[99]["/Kids"][1].get_object()["/Parent"]
    # NOTE(review): the two calls below pass field 1 (already damaged above),
    # not the kid whose "/Parent" was just removed - confirm which object
    # was meant to be exercised here.
    with pytest.raises(ValueError):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError):
        writer.get_pages_showing_field(writer_fields[1])

    # remove "/P" (optional)
    del reader_fields[8]["/Kids"][1].get_object()["/P"]
    del writer_fields[8]["/Kids"][1].get_object()["/P"]
    assert page_numbers(reader, reader_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(writer, writer_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # Grouping fields
    # Reader: turn the last field into a widget-less parent of field 0.
    reader_last = reader_fields[-1].get_object()
    reader_last[NameObject("/Kids")] = ArrayObject([reader_fields[0]])
    del reader_last["/T"]
    del reader_last["/P"]
    del reader_last["/Subtype"]
    # Writer: append a new grouping field whose only kid is field 0.
    writer_fields.append(
        writer._add_object(
            DictionaryObject(
                {
                    NameObject("/T"): TextStringObject("grouping"),
                    NameObject("/FT"): NameObject("/Tx"),
                    NameObject("/Kids"): ArrayObject([reader_fields[0]]),
                }
            )
        )
    )
    assert page_numbers(reader, reader_fields[-1]) == []
    assert page_numbers(writer, writer_fields[-1]) == []