import logging

logger = logging.getLogger(__name__)


def preprocess_signature_data(self):
    """
    Import the filled signature dictionaries of the source document ahead
    of the regular import pass, carrying over ``/Contents`` verbatim.

    Signature ``/Contents`` is never encrypted, so its raw bytes are read
    with :attr:`generic.EncryptedObjAccess.RAW` instead of going through
    the decrypting access path. All other entries of the signature
    dictionary are passed through ``self._ingest``. The resulting
    dictionary is added to ``self.target`` (outside of any object stream)
    and recorded in ``self.reference_map`` under the original reference.

    The copy operation is guaranteed to break the signatures, but there
    are valid use cases for stripping the encryption from such files
    (e.g. downstream processing); a warning is logged when filled
    signature fields are found.

    Signature values that are not dictionaries, or that have no
    ``/Contents`` entry, are skipped here and left to the regular import
    logic rather than aborting the copy.
    """
    # Imported locally, as in the original code (presumably to avoid an
    # import cycle between pdf_utils and sign -- not visible from here).
    from ..sign.fields import enumerate_sig_fields

    # dict.fromkeys() drops duplicates while preserving order: a signature
    # dictionary shared by several fields must only be imported once.
    signature_dict_refs = list(
        dict.fromkeys(
            field_value.reference
            for _, field_value, _ in enumerate_sig_fields(
                self.source, filled_status=True
            )
            # this is the case in all valid PDFs
            if isinstance(field_value, generic.IndirectObject)
        )
    )
    if not signature_dict_refs:
        return

    logger.warning(
        "Source document contains filled signature fields--the copy "
        "operation will invalidate them."
    )
    for ref in signature_dict_refs:
        sig_dict = ref.get_object()
        # Explicit check instead of a bare assert (asserts vanish under -O),
        # and tolerate signature dictionaries that lack /Contents.
        if (
            not isinstance(sig_dict, generic.DictionaryObject)
            or '/Contents' not in sig_dict
        ):
            continue
        raw_dict = {
            k: self._ingest(v) for k, v in sig_dict.items() if k != '/Contents'
        }
        # RAW access: take the bytes exactly as stored, without decryption
        raw_contents = sig_dict.raw_get(
            '/Contents', decrypt=generic.EncryptedObjAccess.RAW
        )
        raw_dict['/Contents'] = generic.ByteStringObject(
            raw_contents.original_bytes
        )
        self.reference_map[ref] = self.target.add_object(
            generic.DictionaryObject(raw_dict),
            obj_stream=None,
        )
def _assert_copy_strips_encryption_and_breaks_signature(source_reader):
    """
    Copy ``source_reader`` into a fresh writer, write the result out, and
    check that the output is not encrypted and that its first embedded
    signature no longer validates as intact.
    """
    copy_writer = copy_into_new_writer(source_reader)
    copied = BytesIO()
    copy_writer.write(copied)

    copied_reader = PdfFileReader(copied)
    assert not copied_reader.encrypted
    embedded_sig = copied_reader.embedded_signatures[0]
    embedded_sig.compute_integrity_info()
    status = validate_pdf_signature(
        embedded_sig, SIMPLE_V_CONTEXT(), skip_diff=True
    )
    assert not status.intact


def test_copy_encrypted_signed_file():
    """Copying a signed, password-encrypted file decrypts it and breaks
    the signature."""
    incremental_writer = IncrementalPdfFileWriter(
        BytesIO(MINIMAL_ONE_FIELD_AES256)
    )
    incremental_writer.encrypt("ownersecret")
    signed_output = signers.sign_pdf(
        incremental_writer,
        signers.PdfSignatureMetadata(),
        signer=FROM_CA,
        existing_fields_only=True,
    )

    signed_reader = PdfFileReader(signed_output)
    signed_reader.decrypt("ownersecret")
    _assert_copy_strips_encryption_and_breaks_signature(signed_reader)


def test_copy_file_with_mdp_signature_and_backref():
    """Copying also works when a signature reference dictionary points
    back to the document root."""
    # This file has /Data in a signature reference dictionary
    # pointing back to the root (which is sometimes still seen in
    # FieldMDP signatures generated by Acrobat, among others)
    fname = f"{PDF_DATA_DIR}/signed-encrypted-pubkey-with-catalog-ref.pdf"
    with open(fname, 'rb') as inf:
        pubkey_reader = PdfFileReader(inf)
        pubkey_reader.decrypt_pubkey(PUBKEY_SELFSIGNED_DECRYPTER)
        # the source stream must stay open while the copy is being made
        _assert_copy_strips_encryption_and_breaks_signature(pubkey_reader)