Merge pull request #67 from GitGuardian/agateau/fix-maximum-size-check

Do not make the encoded document larger than expected
GitGuardian · Jul 12, 2023 · ccd3e47 · ccd3e47
2 parents da5d95d + 5164a72
commit ccd3e47
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 7 deletions.
diff --git a/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md b/changelog.d/20230712_172917_aurelien.gateau_fix_maximum_size_check.md
@@ -0,0 +1,3 @@
+### Fixed
+
+- Do not make documents longer when preparing them to be sent to the API.
diff --git a/pygitguardian/client.py b/pygitguardian/client.py
@@ -312,8 +312,8 @@ def content_scan(
         """
         content_scan handles the /scan endpoint of the API.
 
-        If document contains `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If document contains `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param filename: name of file, example: "intro.py"
         :param document: content of file
@@ -355,8 +355,8 @@ def multi_content_scan(
         """
         multi_content_scan handles the /multiscan endpoint of the API.
 
-        If documents contain `0` bytes, they will be replaced with the Unicode
-        replacement character.
+        If documents contain `0` bytes, they will be replaced with the ASCII substitute
+        character.
 
         :param documents: List of dictionaries containing the keys document
         and, optionally, filename.

diff --git a/pygitguardian/models.py b/pygitguardian/models.py
@@ -94,8 +94,12 @@ def validate_size(document: Dict[str, Any], maximum_size: int) -> None:
     @post_load
     def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         doc = in_data["document"]
-        # Our API does not accept 0 bytes in documents, so replace them with the replacement character
-        in_data["document"] = doc.replace("\0", "\uFFFD")
+        # Our API does not accept 0 bytes in documents so replace them with
+        # the ASCII substitute character.
+        # We no longer use the Unicode replacement character (U+FFFD) because
+        # it makes the encoded string longer (3 bytes in UTF-8, versus 1 for
+        # the 0 byte), making it possible to hit the maximum size limit.
+        in_data["document"] = doc.replace("\0", "\x1a")
         return in_data
 
     @post_load

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):
         document = Document.SCHEMA.load(
             {"filename": "name", "document": "hello\0world"}
         )
-        assert document["document"] == "hello\uFFFDworld"
+        assert document["document"] == "hello\x1aworld"
 
     def test_document_handle_surrogates(self):
         document = Document.SCHEMA.load(