From 5164a7208c8fb6fdc187c201c20b1263d94ff062 Mon Sep 17 00:00:00 2001
From: Aurelien Gateau <aurelien.gateau@gitguardian.com>
Date: Tue, 11 Jul 2023 11:44:13 +0200
Subject: [PATCH] fix: do not make the encoded document larger than expected

The Unicode replacement character (U+FFFD) becomes 3 bytes in UTF-8
(0xEF 0xBF 0xBD). Replacing \0 with this character causes the encoded
string to grow by two bytes for each replaced byte, making it possible
for the encoded document to be longer than the maximum document size.

Use the ASCII substitute character instead: it's only 1 byte long in
UTF-8, so it does not make the encoded document grow.
---
 ...30712_172917_aurelien.gateau_fix_maximum_size_check.md | 3 +++
 pygitguardian/client.py                                   | 8 ++++----
 pygitguardian/models.py                                   | 8 ++++++--
 tests/test_models.py                                      | 2 +-
 4 files changed, 14 insertions(+), 7 deletions(-)
 create mode 100644 changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md

diff --git a/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md b/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md
new file mode 100644
index 00000000..25bf9992
--- /dev/null
+++ b/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md
@@ -0,0 +1,3 @@
+### Fixed
+
+- Do not make documents longer when preparing them to be sent to the API.
diff --git a/pygitguardian/client.py b/pygitguardian/client.py
index 5551b542..50ccc492 100644
--- a/pygitguardian/client.py
+++ b/pygitguardian/client.py
@@ -312,8 +312,8 @@ def content_scan(
         """
         content_scan handles the /scan endpoint of the API.
 
-        If document contains `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If document contains `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param filename: name of file, example: "intro.py"
         :param document: content of file
@@ -355,8 +355,8 @@ def multi_content_scan(
         """
         multi_content_scan handles the /multiscan endpoint of the API.
 
-        If documents contain `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If documents contain `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param documents: List of dictionaries containing the keys document
         and, optionally, filename.
diff --git a/pygitguardian/models.py b/pygitguardian/models.py
index 4e2e4c0c..573f377f 100644
--- a/pygitguardian/models.py
+++ b/pygitguardian/models.py
@@ -94,8 +94,12 @@ def validate_size(document: Dict[str, Any], maximum_size: int) -> None:
     @post_load
     def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         doc = in_data["document"]
-        # Our API does not accept 0 bytes in documents, so replace them with the replacement character
-        in_data["document"] = doc.replace("\0", "\uFFFD")
+        # Our API does not accept 0 bytes in documents so replace them with
+        # the ASCII substitute character.
+        # We no longer use the Unicode replacement character (U+FFFD) because
+        # it takes 3 bytes in UTF-8, making the encoded string two bytes longer
+        # per replaced byte, which could push it past the maximum size limit.
+        in_data["document"] = doc.replace("\0", "\x1a")
         return in_data
 
     @post_load
diff --git a/tests/test_models.py b/tests/test_models.py
index 1bdaad3c..8a1bd444 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):
         document = Document.SCHEMA.load(
             {"filename": "name", "document": "hello\0world"}
         )
-        assert document["document"] == "hello\uFFFDworld"
+        assert document["document"] == "hello\x1aworld"
 
     def test_document_handle_surrogates(self):
         document = Document.SCHEMA.load(