Skip to content

Commit

Permalink
Merge pull request #67 from GitGuardian/agateau/fix-maximum-size-check
Browse files Browse the repository at this point in the history
Do not make the encoded document larger than expected
  • Loading branch information
agateau-gg authored Jul 12, 2023
2 parents da5d95d + 5164a72 commit ccd3e47
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Fixed

- Do not make documents longer when preparing them to be sent to the API.
8 changes: 4 additions & 4 deletions pygitguardian/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,8 @@ def content_scan(
"""
content_scan handles the /scan endpoint of the API.
If document contains `0` bytes, they will be replaced with the Unicode
replacement character.
If document contains `0` bytes, they will be replaced with the ASCII substitute
character.
:param filename: name of file, example: "intro.py"
:param document: content of file
Expand Down Expand Up @@ -355,8 +355,8 @@ def multi_content_scan(
"""
multi_content_scan handles the /multiscan endpoint of the API.
If documents contain `0` bytes, they will be replaced with the Unicode
replacement character.
If documents contain `0` bytes, they will be replaced with the ASCII substitute
character.
:param documents: List of dictionaries containing the keys document
and, optionally, filename.
Expand Down
8 changes: 6 additions & 2 deletions pygitguardian/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@ def validate_size(document: Dict[str, Any], maximum_size: int) -> None:
@post_load
def replace_0_bytes(self, in_data: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
doc = in_data["document"]
# Our API does not accept 0 bytes in documents, so replace them with the replacement character
in_data["document"] = doc.replace("\0", "\uFFFD")
# Our API does not accept 0 bytes in documents, so replace them with
# the ASCII substitute character.
# We no longer use the Unicode replacement character (U+FFFD) because
# it makes the UTF-8 encoded string longer (three bytes instead of the
# single 0 byte it replaces), making it possible to hit the maximum size limit.
in_data["document"] = doc.replace("\0", "\x1a")
return in_data

@post_load
Expand Down
2 changes: 1 addition & 1 deletion tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_document_handle_0_bytes(self):
document = Document.SCHEMA.load(
{"filename": "name", "document": "hello\0world"}
)
assert document["document"] == "hello\uFFFDworld"
assert document["document"] == "hello\x1aworld"

def test_document_handle_surrogates(self):
document = Document.SCHEMA.load(
Expand Down

0 comments on commit ccd3e47

Please sign in to comment.