Skip to content

Commit

Permalink
ENH: add get_pages_from_field (#2494)
Browse files Browse the repository at this point in the history
* DEV: add _get_page_number_from_indirect in writer

create similar function to have same API as in reader
used in future dev


---------

Co-authored-by: Stefan <[email protected]>
  • Loading branch information
pubpub-zz and stefan6419846 authored Mar 2, 2024
1 parent f32a964 commit cb146e8
Show file tree
Hide file tree
Showing 4 changed files with 316 additions and 3 deletions.
8 changes: 7 additions & 1 deletion docs/user/forms.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ PDF forms have a dual-nature approach about the fields:
Inside it you could find (optional):

- some global elements (Fonts, Resources,...)
- some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
- some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
- `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content
- `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots)

Expand Down Expand Up @@ -99,3 +99,9 @@ However, it's also important to note that the two lists do not *always* refer to
__Caution: Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead.__

In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_.

## Identify pages where fields are used

In order to ease locating page fields you can use `get_pages_showing_field`, which is available on both `PdfReader` and `PdfWriter`. This method accepts a field object, i.e. a *PdfObject* that represents a field (as extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g. radio buttons or text displayed on multiple pages).

The page numbers can then be retrieved as usual by using `page.page_number`.
74 changes: 73 additions & 1 deletion pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,76 @@ def indexed_key(k: str, fields: Dict[Any, Any]) -> str:
ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")
return ff

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).
            A page appears once per matching widget, so the same page
            can be listed several times.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference`` ("field type is invalid"), or no
            ``/FT`` entry is found on the object or its ``/Parent``
            chain ("field is not valid").
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then walk up "/Parent".
        if key in obj:
            return obj[key]
        elif "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        else:
            return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Prefer the widget's own "/P" entry; it is optional, so fall
        # back to scanning each page's annotations for the widget.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        # The default must be an empty *sequence*: with "" as default,
        # ``<IndirectObject> in ""`` raises TypeError for every page
        # that has no "/Annots" entry.
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field object is itself the widget.
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)

    # Entries obtained through "/P" may not be PageObject instances;
    # map those back to the matching entry of self.pages.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]

def _get_named_destinations(
self,
tree: Union[TreeObject, None] = None,
Expand Down Expand Up @@ -1813,7 +1883,9 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType:
def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
"""Take the permissions as an integer, return the allowed access."""
deprecate_with_replacement(
old_name="decode_permissions", new_name="user_access_permissions", removed_in="5.0.0"
old_name="decode_permissions",
new_name="user_access_permissions",
removed_in="5.0.0",
)

permissions_mapping = {
Expand Down
71 changes: 71 additions & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
DecodedStreamObject,
Destination,
DictionaryObject,
Field,
Fit,
FloatObject,
IndirectObject,
Expand Down Expand Up @@ -1003,6 +1004,76 @@ def reattach_fields(
lst.append(ano)
return lst

def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provides list of pages where the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).
            A page appears once per matching widget, so the same page
            can be listed several times.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference`` ("field type is invalid"), or no
            ``/FT`` entry is found on the object or its ``/Parent``
            chain ("field is not valid").
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then walk up "/Parent".
        if key in obj:
            return obj[key]
        elif "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        else:
            return None

    def _pages_of_widget(widget: Any) -> List[Any]:
        # Prefer the widget's own "/P" entry; it is optional, so fall
        # back to scanning each page's annotations for the widget.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        # The default must be an empty *sequence*: with "" as default,
        # ``<IndirectObject> in ""`` raises TypeError for every page
        # that has no "/Annots" entry.
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field object is itself the widget.
        ret = _pages_of_widget(field)
    else:
        for k in field.get("/Kids", ()):
            k = k.get_object()
            if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
                # Kid that is just a widget, not a field:
                ret += _pages_of_widget(k)

    # Entries obtained through "/P" may not be PageObject instances;
    # map those back to the matching entry of self.pages.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]

def clone_reader_document_root(self, reader: PdfReader) -> None:
"""
Copy the reader document root to the writer and all sub elements,
Expand Down
166 changes: 165 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@
from pypdf import PdfMerger, PdfReader, PdfWriter
from pypdf.constants import PageAttributes as PG
from pypdf.errors import PdfReadError, PdfReadWarning
from pypdf.generic import ContentStream, NameObject, read_object
from pypdf.generic import (
ArrayObject,
ContentStream,
DictionaryObject,
NameObject,
TextStringObject,
read_object,
)

from . import get_data_from_url, normalize_warnings

Expand Down Expand Up @@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode():
reader = PdfReader(pdf_path)
with pytest.raises(ValueError, match="Invalid text extraction mode"):
reader.pages[0].extract_text(extraction_mode="foo") # type: ignore


@pytest.mark.enable_socket()
def test_get_page_showing_field():
    """
    Uses testfile from #2452 in order to get fields on multiple pages,
    choices boxes,...

    The same checks are run against a reader and against a writer cloned
    from it. The objects are mutated step by step, so the order of the
    sections below matters.
    """
    url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf"
    name = "iss2452.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name)))
    writer = PdfWriter(clone_from=reader)

    # Root field arrays; they are mutated in place further down.
    reader_fields = reader.trailer["/Root"]["/AcroForm"]["/Fields"]
    writer_fields = writer._root_object["/AcroForm"]["/Fields"]

    def page_numbers(document, field):
        # Page numbers of the pages reported for the field by the document.
        return [p.page_number for p in document.get_pages_showing_field(field)]

    # validate with Field: only works on Reader (no get_fields on writer yet)
    fld = reader.get_fields()
    assert page_numbers(reader, fld["FormVersion"]) == [0]

    # validate with dictionary object
    # NRCategory field is a radio box
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # validate with IndirectObject
    # SiteID field is a textbox on multiple pages
    assert page_numbers(reader, reader_fields[99]) == [0, 1]
    assert page_numbers(writer, writer_fields[99]) == [0, 1]
    # test directly on the widget:
    assert page_numbers(reader, reader_fields[99]["/Kids"][1]) == [1]
    assert page_numbers(writer, writer_fields[99]["/Kids"][1]) == [1]

    # Exceptions:
    # Invalid Object
    # `match` is checked for both documents; previously the reader's
    # exception info was overwritten before its message was asserted.
    with pytest.raises(ValueError, match="field type is invalid"):
        reader.get_pages_showing_field(None)
    with pytest.raises(ValueError, match="field type is invalid"):
        writer.get_pages_showing_field(None)

    # Damage Field
    del reader_fields[1].get_object()["/FT"]
    del writer_fields[1].get_object()["/FT"]
    with pytest.raises(ValueError, match="field is not valid"):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError, match="field is not valid"):
        writer.get_pages_showing_field(writer_fields[1])

    # missing Parent in field
    del reader_fields[99]["/Kids"][1].get_object()["/Parent"]
    del writer_fields[99]["/Kids"][1].get_object()["/Parent"]
    # NOTE(review): the calls below query Fields[1] (already damaged above),
    # not the kid whose "/Parent" was just removed, so this section does not
    # exercise the missing-parent case - confirm the intended target.
    with pytest.raises(ValueError):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError):
        writer.get_pages_showing_field(writer_fields[1])

    # remove "/P" (optional)
    del reader_fields[8]["/Kids"][1].get_object()["/P"]
    del writer_fields[8]["/Kids"][1].get_object()["/P"]
    assert page_numbers(reader, reader_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(writer, writer_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # Grouping fields
    # Reader: turn the last root field into a pure grouping node whose
    # only kid is the first root field.
    reader_group = reader_fields[-1].get_object()
    reader_group[NameObject("/Kids")] = ArrayObject([reader_fields[0]])
    del reader_group["/T"]
    del reader_group["/P"]
    del reader_group["/Subtype"]
    # Writer: append a new grouping field; as in the original test, its kid
    # is the reference taken from the reader's field array.
    writer_fields.append(
        writer._add_object(
            DictionaryObject(
                {
                    NameObject("/T"): TextStringObject("grouping"),
                    NameObject("/FT"): NameObject("/Tx"),
                    NameObject("/Kids"): ArrayObject([reader_fields[0]]),
                }
            )
        )
    )
    assert page_numbers(reader, reader_fields[-1]) == []
    assert page_numbers(writer, writer_fields[-1]) == []

0 comments on commit cb146e8

Please sign in to comment.