fix: do not make the encoded document larger than expected

The Unicode replacement character (U+FFFD) is encoded as 3 bytes in UTF-8 (0xEF 0xBF 0xBD). Replacing each 1-byte \0 with this character makes the encoded string two bytes longer per replacement, making it possible for the encoded document to exceed the maximum document size. Use the ASCII substitute character (U+001A) instead: it is only 1 byte long in UTF-8, so it does not make the encoded document grow.
GitGuardian · Jul 12, 2023 · 5164a72 · 5164a72
1 parent da5d95d
commit 5164a72
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 7 deletions.
diff --git a/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md b/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md
@@ -0,0 +1,3 @@
+### Fixed
+
+- Do not make documents longer when preparing them to be sent to the API.
diff --git a/pygitguardian/client.py b/pygitguardian/client.py
@@ -312,8 +312,8 @@ def content_scan(
         """
         content_scan handles the /scan endpoint of the API.
 
-        If document contains `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If document contains `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param filename: name of file, example: "intro.py"
         :param document: content of file
@@ -355,8 +355,8 @@ def multi_content_scan(
         """
         multi_content_scan handles the /multiscan endpoint of the API.
 
-        If documents contain `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If documents contain `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param documents: List of dictionaries containing the keys document
         and, optionally, filename.

diff --git a/pygitguardian/models.py b/pygitguardian/models.py
@@ -94,8 +94,12 @@ def validate_size(document: Dict[str, Any], maximum_size: int) -> None:
     @post_load
     def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         doc = in_data["document"]
-        # Our API does not accept 0 bytes in documents, so replace them with the replacement character
-        in_data["document"] = doc.replace("\0", "\uFFFD")
+        # Our API does not accept 0 bytes in documents so replace them with
+        # the ASCII substitute character.
+        # We no longer use the Unicode replacement character (U+FFFD) because
+        # it encodes to 3 bytes in UTF-8, growing the document by 2 bytes per
+        # replaced 0 byte and making it possible to hit the maximum size limit.
+        in_data["document"] = doc.replace("\0", "\x1a")
         return in_data
 
     @post_load

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):
         document = Document.SCHEMA.load(
             {"filename": "name", "document": "hello\0world"}
         )
-        assert document["document"] == "hello\uFFFDworld"
+        assert document["document"] == "hello\x1aworld"
 
     def test_document_handle_surrogates(self):
         document = Document.SCHEMA.load(