From ae6cfd6d102d885ac6b0873d31f0dac139b1ddae Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 22:13:09 -0400 Subject: [PATCH] [DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support (#1813) Co-authored-by: Steven Silvester Co-authored-by: Steven Silvester --- .evergreen/resync-specs.sh | 3 + bson/binary.py | 152 +++++++++++++++++++++++- doc/api/bson/binary.rst | 8 ++ doc/changelog.rst | 1 - test/bson_binary_vector/float32.json | 42 +++++++ test/bson_binary_vector/int8.json | 57 +++++++++ test/bson_binary_vector/packed_bit.json | 50 ++++++++ test/bson_corpus/binary.json | 30 +++++ test/test_bson.py | 81 ++++++++++++- test/test_bson_binary_vector.py | 105 ++++++++++++++++ 10 files changed, 519 insertions(+), 10 deletions(-) create mode 100644 test/bson_binary_vector/float32.json create mode 100644 test/bson_binary_vector/int8.json create mode 100644 test/bson_binary_vector/packed_bit.json create mode 100644 test/test_bson_binary_vector.py diff --git a/.evergreen/resync-specs.sh b/.evergreen/resync-specs.sh index ac69449729..dca116c2d3 100755 --- a/.evergreen/resync-specs.sh +++ b/.evergreen/resync-specs.sh @@ -76,6 +76,9 @@ do atlas-data-lake-testing|data_lake) cpjson atlas-data-lake-testing/tests/ data_lake ;; + bson-binary-vector|bson_binary_vector) + cpjson bson-binary-vector/tests/ bson_binary_vector + ;; bson-corpus|bson_corpus) cpjson bson-corpus/tests/ bson_corpus ;; diff --git a/bson/binary.py b/bson/binary.py index 5fe1bacd16..47c52d4892 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -13,7 +13,10 @@ # limitations under the License. from __future__ import annotations -from typing import TYPE_CHECKING, Any, Tuple, Type, Union +import struct +from dataclasses import dataclass +from enum import Enum +from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. 
@@ -191,21 +194,75 @@ class UuidRepresentation: """ +VECTOR_SUBTYPE = 9 +"""**(BETA)** BSON binary subtype for densely packed vector data. + +.. versionadded:: 4.10 +""" + + USER_DEFINED_SUBTYPE = 128 """BSON binary subtype for any user defined structure. """ +class BinaryVectorDtype(Enum): + """**(BETA)** Datatypes of vector subtype. + + :param FLOAT32: (0x27) Pack list of :class:`float` as float32 + :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8 + :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8 + + The `PACKED_BIT` value represents a special case where vector values themselves + can only be of two values (0 or 1) but these are packed together into groups of 8, + a byte. In Python, these are displayed as ints in range [0, 255] + + Each value is of type bytes with a length of one. + + .. versionadded:: 4.10 + """ + + INT8 = b"\x03" + FLOAT32 = b"\x27" + PACKED_BIT = b"\x10" + + +@dataclass +class BinaryVector: + """**(BETA)** Vector of numbers along with metadata for binary interoperability. + .. versionadded:: 4.10 + """ + + __slots__ = ("data", "dtype", "padding") + + def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0): + """ + :param data: Sequence of numbers representing the mathematical vector. + :param dtype: The data type stored in binary + :param padding: The number of bits in the final byte that are to be ignored + when a vector element's size is less than a byte + and the length of the vector is not a multiple of 8. + """ + self.data = data + self.dtype = dtype + self.padding = padding + + class Binary(bytes): """Representation of BSON binary data. - This is necessary because we want to represent Python strings as - the BSON string type. We need to wrap binary data so we can tell + We want to represent Python strings as the BSON string type. 
+ We need to wrap binary data so that we can tell the difference between what should be considered binary data and what should be considered a string when we encode to BSON. - Raises TypeError if `data` is not an instance of :class:`bytes` - or `subtype` is not an instance of :class:`int`. + **(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data. + Its data is prepended with two bytes of metadata. + The first (dtype) describes its data type, such as float32 or int8. + The second (padding) prescribes the number of bits to ignore in the final byte. + This is relevant when the element size of the dtype is not a multiple of 8. + + Raises TypeError if `subtype` is not an instance of :class:`int`. Raises ValueError if `subtype` is not in [0, 256). .. note:: @@ -218,7 +275,10 @@ class Binary(bytes): to use .. versionchanged:: 3.9 - Support any bytes-like type that implements the buffer protocol. + Support any bytes-like type that implements the buffer protocol. + + .. versionchanged:: 4.10 + **(BETA)** Addition of vector subtype. """ _type_marker = 5 @@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}" ) + @classmethod + def from_vector( + cls: Type[Binary], + vector: list[int, float], + dtype: BinaryVectorDtype, + padding: int = 0, + ) -> Binary: + """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. + + To interpret the representation of the numbers, a data type must be included. + See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions. + + The dtype and padding are prepended to the binary data's value. + + :param vector: List of values + :param dtype: Data type of the values + :param padding: For fractional bytes, number of bits to ignore at end of vector. + :return: Binary packed data identified by dtype and padding. 
+ + .. versionadded:: 4.10 + """ + if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8 + format_str = "b" + if padding: + raise ValueError(f"padding does not apply to {dtype=}") + elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 + format_str = "B" + elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 + format_str = "f" + if padding: + raise ValueError(f"padding does not apply to {dtype=}") + else: + raise NotImplementedError("%s not yet supported" % dtype) + + metadata = struct.pack(" BinaryVector: + """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding. + + :return: BinaryVector + + .. versionadded:: 4.10 + """ + + if self.subtype != VECTOR_SUBTYPE: + raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.") + + position = 0 + dtype, padding = struct.unpack_from(" int: """Subtype of this binary data.""" diff --git a/doc/api/bson/binary.rst b/doc/api/bson/binary.rst index c933a687b9..084fd02d50 100644 --- a/doc/api/bson/binary.rst +++ b/doc/api/bson/binary.rst @@ -21,6 +21,14 @@ .. autoclass:: UuidRepresentation :members: + .. autoclass:: BinaryVectorDtype + :members: + :show-inheritance: + + .. autoclass:: BinaryVector + :members: + + .. autoclass:: Binary(data, subtype=BINARY_SUBTYPE) :members: :show-inheritance: diff --git a/doc/changelog.rst b/doc/changelog.rst index 3b7ddd1553..6c8b8261ac 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -19,7 +19,6 @@ in this release. .. 
_PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553 - Changes in Version 4.9.0 ------------------------- diff --git a/test/bson_binary_vector/float32.json b/test/bson_binary_vector/float32.json new file mode 100644 index 0000000000..bbbe00b758 --- /dev/null +++ b/test/bson_binary_vector/float32.json @@ -0,0 +1,42 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector FLOAT32", + "valid": true, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000" + }, + { + "description": "Empty Vector FLOAT32", + "valid": true, + "vector": [], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009270000" + }, + { + "description": "Infinity Vector FLOAT32", + "valid": true, + "vector": ["-inf", 0.0, "inf"], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00" + }, + { + "description": "FLOAT32 with padding", + "valid": false, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 3 + } + ] +} + diff --git a/test/bson_binary_vector/int8.json b/test/bson_binary_vector/int8.json new file mode 100644 index 0000000000..7529721e5e --- /dev/null +++ b/test/bson_binary_vector/int8.json @@ -0,0 +1,57 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype INT8", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector INT8", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000903007F0700" + }, + { + "description": "Empty Vector INT8", + "valid": true, + "vector": [], + "dtype_hex": 
"0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009030000" + }, + { + "description": "Overflow Vector INT8", + "valid": false, + "vector": [128], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "Underflow Vector INT8", + "valid": false, + "vector": [-129], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "INT8 with padding", + "valid": false, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 3 + }, + { + "description": "INT8 with float inputs", + "valid": false, + "vector": [127.77, 7.77], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + } + ] +} + diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json new file mode 100644 index 0000000000..a41cd593f5 --- /dev/null +++ b/test/bson_binary_vector/packed_bit.json @@ -0,0 +1,50 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector PACKED_BIT", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000910007F0700" + }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000" + }, + { + "description": "PACKED_BIT with padding", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0700" + }, + { + "description": "Overflow Vector PACKED_BIT", + "valid": false, + "vector": [256], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Underflow Vector PACKED_BIT", + "valid": false, + "vector": 
[-1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + } + ] +} + diff --git a/test/bson_corpus/binary.json b/test/bson_corpus/binary.json index 20aaef743b..0e0056f3a2 100644 --- a/test/bson_corpus/binary.json +++ b/test/bson_corpus/binary.json @@ -74,6 +74,36 @@ "description": "$type query operator (conflicts with legacy $binary form with $type field)", "canonical_bson": "180000000378001000000010247479706500020000000000", "canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}" + }, + { + "description": "subtype 0x09 Vector FLOAT32", + "canonical_bson": "170000000578000A0000000927000000FE420000E04000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector INT8", + "canonical_bson": "11000000057800040000000903007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector PACKED_BIT", + "canonical_bson": "11000000057800040000000910007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) FLOAT32", + "canonical_bson": "0F0000000578000200000009270000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) INT8", + "canonical_bson": "0F0000000578000200000009030000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) PACKED_BIT", + "canonical_bson": "0F0000000578000200000009100000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" } ], "decodeErrors": [ diff --git a/test/test_bson.py b/test/test_bson.py index a0190ef2d8..96aa897d19 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -49,8 
+49,9 @@ decode_iter, encode, is_valid, + json_util, ) -from bson.binary import USER_DEFINED_SUBTYPE, Binary, UuidRepresentation +from bson.binary import USER_DEFINED_SUBTYPE, Binary, BinaryVectorDtype, UuidRepresentation from bson.code import Code from bson.codec_options import CodecOptions, DatetimeConversion from bson.datetime_ms import _DATETIME_ERROR_SUGGESTION @@ -148,6 +149,9 @@ def helper(doc): helper({"a binary": Binary(b"test", 128)}) helper({"a binary": Binary(b"test", 254)}) helper({"another binary": Binary(b"test", 2)}) + helper({"binary packed bit vector": Binary(b"\x10\x00\x7f\x07", 9)}) + helper({"binary int8 vector": Binary(b"\x03\x00\x7f\x07", 9)}) + helper({"binary float32 vector": Binary(b"'\x00\x00\x00\xfeB\x00\x00\xe0@", 9)}) helper(SON([("test dst", datetime.datetime(1993, 4, 4, 2))])) helper(SON([("test negative dst", datetime.datetime(1, 1, 1, 1, 1, 1))])) helper({"big float": float(10000000000)}) @@ -447,6 +451,20 @@ def test_basic_encode(self): encode({"test": Binary(b"test", 128)}), b"\x14\x00\x00\x00\x05\x74\x65\x73\x74\x00\x04\x00\x00\x00\x80\x74\x65\x73\x74\x00", ) + self.assertEqual( + encode({"vector_int8": Binary.from_vector([-128, -1, 127], BinaryVectorDtype.INT8)}), + b"\x1c\x00\x00\x00\x05vector_int8\x00\x05\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00", + ) + self.assertEqual( + encode({"vector_bool": Binary.from_vector([1, 127], BinaryVectorDtype.PACKED_BIT)}), + b"\x1b\x00\x00\x00\x05vector_bool\x00\x04\x00\x00\x00\t\x10\x00\x01\x7f\x00", + ) + self.assertEqual( + encode( + {"vector_float32": Binary.from_vector([-1.1, 1.1e10], BinaryVectorDtype.FLOAT32)} + ), + b"$\x00\x00\x00\x05vector_float32\x00\n\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00", + ) self.assertEqual(encode({"test": None}), b"\x0B\x00\x00\x00\x0A\x74\x65\x73\x74\x00\x00") self.assertEqual( encode({"date": datetime.datetime(2007, 1, 8, 0, 30, 11)}), @@ -711,9 +729,66 @@ def test_uuid_legacy(self): transformed = bin.as_uuid(UuidRepresentation.PYTHON_LEGACY) 
self.assertEqual(id, transformed) - # The C extension was segfaulting on unicode RegExs, so we have this test - # that doesn't really test anything but the lack of a segfault. + def test_vector(self): + """Tests of subtype 9""" + # We start with valid cases, across the 3 dtypes implemented. + # Work with a simple vector that can be interpreted as int8, float32, or ubyte + list_vector = [127, 7] + # As INT8, vector has length 2 + binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8) + vector = binary_vector.as_vector() + assert vector.data == list_vector + # test encoding roundtrip + assert {"vector": binary_vector} == decode(encode({"vector": binary_vector})) + # test json roundtrip + assert binary_vector == json_util.loads(json_util.dumps(binary_vector)) + + # For vectors of bits, aka PACKED_BIT type, vector has length 8 * 2 + packed_bit_binary = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT) + packed_bit_vec = packed_bit_binary.as_vector() + assert packed_bit_vec.data == list_vector + + # A padding parameter permits vectors of length that aren't divisible by 8 + # The following ignores the last 3 bits in list_vector, + # hence its length is 8 * len(list_vector) - padding + padding = 3 + padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding) + assert padded_vec.as_vector().data == list_vector + # To visualize how this looks as a binary vector: 
+ uncompressed = "" + for val in list_vector: + uncompressed += format(val, "08b") + assert uncompressed[:-padding] == "0111111100000" + + # It is worthwhile explicitly showing the values encoded to BSON + padded_doc = {"padded_vec": padded_vec} + assert ( + encode(padded_doc) + == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00" + ) + # and dumped to json + assert ( + json_util.dumps(padded_doc) + == '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}' + ) + + # FLOAT32 is also implemented + float_binary = Binary.from_vector(list_vector, BinaryVectorDtype.FLOAT32) + assert all(isinstance(d, float) for d in float_binary.as_vector().data) + + # Now some invalid cases + for x in [-1, 257]: + try: + Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT) + except Exception as exc: + self.assertTrue(isinstance(exc, struct.error)) + else: + self.fail("Failed to raise an exception.") + def test_unicode_regex(self): + """Tests we do not get a segfault for C extension on unicode RegExs. + This had been happening. + """ regex = re.compile("revisi\xf3n") decode(encode({"regex": regex})) diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py new file mode 100644 index 0000000000..00c82bbb65 --- /dev/null +++ b/test/test_bson_binary_vector.py @@ -0,0 +1,105 @@ +# Copyright 2024-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import binascii +import codecs +import json +import struct +from pathlib import Path +from test import unittest + +from bson import decode, encode +from bson.binary import Binary, BinaryVectorDtype + +_TEST_PATH = Path(__file__).parent / "bson_binary_vector" + + +class TestBSONBinaryVector(unittest.TestCase): + """Runs Binary Vector subtype tests. + + Follows the style of the BSON corpus specification tests. + Tests are automatically generated on import + from json files in _TEST_PATH via `create_tests`. + The actual tests are defined in the inner function `run_test` + of the test generator `create_test`.""" + + +def create_test(case_spec): + """Create standard test given specification in json. + + We use the naming convention expected (exp) and observed (obs) + to differentiate what is in the json (expected or suffix _exp) + from what is produced by the API (observed or suffix _obs). + """ + test_key = case_spec.get("test_key") + + def run_test(self): + for test_case in case_spec.get("tests", []): + description = test_case["description"] + vector_exp = test_case["vector"] + dtype_hex_exp = test_case["dtype_hex"] + dtype_alias_exp = test_case.get("dtype_alias") + padding_exp = test_case.get("padding", 0) + canonical_bson_exp = test_case.get("canonical_bson") + # Convert dtype hex string into bytes + dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) + + if test_case["valid"]: + # Convert bson string to bytes + cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) + decoded_doc = decode(cB_exp) + binary_obs = decoded_doc[test_key] + # Handle special float cases like '-inf' + if dtype_exp in [BinaryVectorDtype.FLOAT32]: + vector_exp = [float(x) for x in vector_exp] + + # Test round-tripping canonical bson. 
+ self.assertEqual(encode(decoded_doc), cB_exp, description) + + # Test BSON to Binary Vector + vector_obs = binary_obs.as_vector() + self.assertEqual(vector_obs.dtype, dtype_exp, description) + if dtype_alias_exp: + self.assertEqual( + vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp], description + ) + self.assertEqual(vector_obs.data, vector_exp, description) + self.assertEqual(vector_obs.padding, padding_exp, description) + + # Test Binary Vector to BSON + vector_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp) + cB_obs = binascii.hexlify(encode({test_key: vector_exp})).decode().upper() + self.assertEqual(cB_obs, canonical_bson_exp, description) + + else: + with self.assertRaises((struct.error, ValueError), msg=description): + Binary.from_vector(vector_exp, dtype_exp, padding_exp) + + return run_test + + +def create_tests(): + for filename in _TEST_PATH.glob("*.json"): + with codecs.open(str(filename), encoding="utf-8") as test_file: + test_method = create_test(json.load(test_file)) + setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method) + + +create_tests() + + +if __name__ == "__main__": + unittest.main()