From f3324cea847035dd84611c89deedb19b16418fee Mon Sep 17 00:00:00 2001 From: Justin Turner Arthur Date: Mon, 26 Jul 2021 02:04:08 -0500 Subject: [PATCH] Faster sets, empty set option, faster non-float serialize --- ddbcereal/__init__.py | 5 ++- ddbcereal/serializing.py | 76 ++++++++++++++++++++++++---------------- ddbcereal/types.py | 7 ++-- docs/changelog.rst | 7 ++++ docs/performance.rst | 41 +++++++++++----------- docs/usage.rst | 67 +++++++++++++++++++++-------------- 6 files changed, 122 insertions(+), 81 deletions(-) diff --git a/ddbcereal/__init__.py b/ddbcereal/__init__.py index 3a8286b..da8540b 100644 --- a/ddbcereal/__init__.py +++ b/ddbcereal/__init__.py @@ -17,14 +17,17 @@ from ddbcereal.serializing import Serializer from ddbcereal.types import DateFormat, DynamoDBType, PythonNumber -VERSION = 2, 0, 1 +VERSION = 2, 1, 0 ISO_8601 = DateFormat.ISO_8601 UNIX_MILLISECONDS = DateFormat.UNIX_MILLISECONDS UNIX_SECONDS = DateFormat.UNIX_SECONDS +BINARY_SET = DynamoDBType.BINARY_SET NUMBER = DynamoDBType.NUMBER +NUMBER_SET = DynamoDBType.NUMBER_SET STRING = DynamoDBType.STRING +STRING_SET = DynamoDBType.STRING_SET DECIMAL_ONLY = PythonNumber.DECIMAL_ONLY FRACTION_ONLY = PythonNumber.FRACTION_ONLY diff --git a/ddbcereal/serializing.py b/ddbcereal/serializing.py index 089c679..fae2200 100644 --- a/ddbcereal/serializing.py +++ b/ddbcereal/serializing.py @@ -13,7 +13,6 @@ # limitations under the License. 
import decimal -import fractions from base64 import b64encode from collections.abc import ByteString, Set from datetime import datetime @@ -25,13 +24,11 @@ NoneType = type(None) - DDB_NUMBER_EMIN = -128 DDB_NUMBER_EMAX = 126 DDB_NUMBER_PREC = 38 INFINITY = decimal.Decimal('Infinity') NAN = decimal.Decimal('NaN') -NUMBER_TYPES = decimal.Decimal, fractions.Fraction, int, float class Serializer: @@ -46,6 +43,7 @@ def __init__( raw_transport=False, datetime_format=DateFormat.ISO_8601, fraction_type=DynamoDBType.NUMBER, + empty_set_type=DynamoDBType.NUMBER_SET ): decimal_traps = [ decimal.Clamped, @@ -61,10 +59,17 @@ def __init__( _serialize_bytes = serialize_bytes if validate_numbers: - serialize_num = self._serialize_number_strict + _serialize_float = self._serialize_float_strict + _serialize_number = self._serialize_number_strict + else: + _serialize_float = serialize_number + _serialize_number = serialize_number + self._serialize_num = _serialize_number + + if fraction_type == DynamoDBType.NUMBER: + _serialize_fraction = self._serialize_fraction_as_number else: - serialize_num = serialize_number - self._serialize_num = serialize_num + _serialize_fraction = serialize_any_as_string self._type_methods: MutableMapping[type, Callable] = { bool: serialize_bool, @@ -72,10 +77,11 @@ def __init__( bytearray: _serialize_bytes, memoryview: _serialize_bytes, datetime: date_serializers[datetime_format], - decimal.Decimal: serialize_num, + decimal.Decimal: _serialize_number, dict: self._serialize_mapping, - float: serialize_num if allow_inexact else serialize_float_exact, - int: serialize_num, + float: _serialize_float, + Fraction: _serialize_fraction, + int: _serialize_number, list: self._serialize_listlike, Mapping: self._serialize_mapping, NoneType: serialize_none, @@ -85,11 +91,6 @@ def __init__( str: serialize_str, } - if fraction_type == DynamoDBType.NUMBER: - self._type_methods[Fraction] = self._serialize_fraction_as_number - else: - self._type_methods[Fraction] = 
serialize_any_as_string - decimal_ctx = decimal.Context( Emin=DDB_NUMBER_EMIN, Emax=DDB_NUMBER_EMAX, @@ -100,6 +101,8 @@ def __init__( self._create_decimal = decimal_ctx.create_decimal self._decimal_divide = decimal_ctx.divide + self._empty_set = {empty_set_type.value: []} + def serialize(self, value: Any) -> DynamoDBValue: value_type = type(value) try: @@ -130,6 +133,18 @@ def _serialize_fraction_as_number(self, value: Fraction): def _serialize_number_strict( self, value: Union[int, float, decimal.Decimal] + ): + try: + dec_value = self._create_decimal(value) + except decimal.Inexact: + raise NumberInexactError() + if dec_value in (INFINITY, NAN): + raise NumberNotAllowedError(f'{dec_value} not supported') + return {'N': str(dec_value)} + + def _serialize_float_strict( + self, + value: Union[int, float, decimal.Decimal] ): try: dec_value = self._create_decimal(str(value)) @@ -144,23 +159,22 @@ def _serialize_listlike(self, value: Union[list, tuple]): def _serialize_set(self, value: Set): if all(isinstance(element, str) for element in value): - return {'SS': [element for element in value]} - if all(isinstance(element, NUMBER_TYPES) for element in value): - return { - 'NS': [ - val - for element in value - for val in self.serialize(element).values() - ] - } - if all(isinstance(element, ByteString) for element in value): - return { - 'BS': [ - val - for element in value - for val in self.serialize(element).values() - ] - } + # Shortcut to faster string set: + return {'SS': list(value)} + if not value: + return self._empty_set + vals = [ + self.serialize(element) + for element in value + ] + first_type = next(iter(vals[0])) + if ( + first_type in {'N', 'S', 'B'} + and all(first_type in val for val in vals) + ): + return {first_type + 'S': [val[first_type] for val in vals]} + + raise ValueError('Invalid or mixed types in set.') def _serialize_mapping(self, value: Mapping): return { diff --git a/ddbcereal/types.py b/ddbcereal/types.py index 654cd36..a1e8ae6 100644 --- 
a/ddbcereal/types.py +++ b/ddbcereal/types.py @@ -31,8 +31,11 @@ class DateFormat(enum.Enum): class DynamoDBType(enum.Enum): - NUMBER = enum.auto() - STRING = enum.auto() + NUMBER = 'N' + NUMBER_SET = 'NS' + STRING = 'S' + STRING_SET = 'SS' + BINARY_SET = 'BS' class PythonNumber(enum.Enum): diff --git a/docs/changelog.rst b/docs/changelog.rst index 8eadb15..af1b131 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,5 +1,12 @@ Changelog ========= +2.1.0 +----- +* Empty Python set is now serialized to a configurable Set type. +* Faster serialization for Number Sets and Binary Sets. +* Behind the scenes, DynamoDBType enumerations are now ``str``\ s of + their type symbol. + 2.0.1 ----- * Fix exceptions from typing on pre-3.9 Python diff --git a/docs/performance.rst b/docs/performance.rst index 0fab0fe..c546aaf 100644 --- a/docs/performance.rst +++ b/docs/performance.rst @@ -7,10 +7,9 @@ happen once in the lifecycle of an application. Squeezing More Performance -------------------------- -By default, time is spent on validating that data being supplied to a -serializer will be allowed by DynamoDB once serialized. Significantly faster -serialization is possible when not validating input before sending it to -DynamoDB. +By default, time is spent validating that data being supplied to a serializer +will be allowed by DynamoDB once serialized. Significantly faster serialization +is possible when not validating input before sending it to DynamoDB. .. code-block:: python @@ -29,13 +28,15 @@ lookup every time ``serialize`` is called. Known Limitations ----------------- -ddbcereal is faster than boto3 at serializing everything *except* for Number -Sets (e.g. `set[Decimal]`, `frozenset[int]`) - +* Constructing a serializer or deserializer is slow. It should be done once and + the serializer or deserializer should be reused. 
+* Map serialization and deserialization uses recursion in its current + implementation, so deep Maps will use more memory and could take longer than + expected to process. boto3's Map processing has this same issue. Benchmarks ---------- -.. list-table:: Serializer Benchmarks (ddbcereal 1.0.0 cpython 3.9.4, 3.1 GHz +.. list-table:: Serializer Benchmarks (ddbcereal 2.1.0 cpython 3.9.4, 3.1 GHz Intel Core i7) :widths: 25 25 50 :header-rows: 1 @@ -47,26 +48,26 @@ Benchmarks - 60x Slower - 60x Slower * - Decimal to Number - - 1.4x faster - - 2.9x faster + - 1.9x faster + - 2.8x faster * - int to Number - - 1.4x faster - - 2.4x faster + - 2x faster + - 3x faster * - str to String - 3.6x faster - 3.6x faster * - Mixed number types Set to Number Set - - 1.2x slower - 1.1x faster + - 1.4x faster * - Set[str] to String Set - - 3.9x faster - - 3.9x faster + - 4.2x faster + - 4.2x faster * - List of mixed types to List - - 3.1x faster - - 4.1x faster + - 3.4x faster + - 4x faster * - dict of mixed types to Map - - 3.6x faster + - 4x faster - 4.6x faster * - dict of 2 levels to Map - - 3.4x faster - - 4.6x faster + - 4x faster + - 4.8x faster diff --git a/docs/usage.rst b/docs/usage.rst index 4e41c2c..6bc4e97 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -98,11 +98,12 @@ your needs. validate_numbers=True, \ raw_transport=False, \ datetime_format=ddbcereal.ISO_8601, \ - fraction_type=ddbcereal.NUMBER) + fraction_type=ddbcereal.NUMBER, \ + empty_set_type=ddbcereal.NUMBER_SET) :param bool allow_inexact: Whether to allow numbers whose exact value can't be represented in DynamoDB or Python. DynamoDB's Number type stores exact - numbers (fixed decimals). floats are considered inexact by their nature + numbers (fixed decimals). ``float``\ s are considered inexact by their nature and are only accepted with this option enabled. :param bool validate_numbers: Whether to check inputted numbers to determine @@ -110,28 +111,40 @@ your needs. to the ``allow_inexact`` parameter. 
When enabled, attempts to serialize invalid numbers will result in a - ``ValueError`` being raised. When disabled, serialization is faster, but - mistakes might only be caught after the serialized value has been sent - to DynamoDB. + :py:exc:`ValueError` being raised. When disabled, serialization is + faster, but mistakes might only be caught after the serialized value has + been sent to DynamoDB. :param bool raw_transport: Indicates that values have not been pre-processed. For example, Base 64 strings have not been converted to bytes. Use this when using the AWS HTTP API without an AWS SDK. - :param DateFormat datetime_format: Determines how Python datetimes should be - serialized. Possible enumerations are available on the ddbcereal top - level module and the DateFormat enum: - - .. autoclass:: ddbcereal.DateFormat - :members: - - :param DynamoDBType fraction_type: Determines how Python ``Fraction`` s should - be serialized. Possible enumerations are available on the ddbcereal top - level module and the DynamoDBType enum: - - .. autoclass:: ddbcereal.DynamoDBType - :members: - :undoc-members: + :param DateFormat datetime_format: Determines how Python + :py:class:`~datetime.datetime`\ s should be serialized. Possible + enumerations are available on the ddbcereal top level module and the + :py:class:`~ddbcereal.DateFormat` enum. + + :param DynamoDBType fraction_type: Determines how Python + :py:class:`~fractions.Fraction`\ s should be serialized. Must be + :py:attr:`~ddbcereal.DynamoDBType.NUMBER` or + :py:attr:`~ddbcereal.DynamoDBType.STRING`. Enumerations are available on + the ddbcereal top level module and the + :py:class:`~ddbcereal.DynamoDBType` enum. + + :param DynamoDBType empty_set_type: When an empty set is serialized, make + the set this DynamoDB type. Must be + :py:attr:`~ddbcereal.DynamoDBType.NUMBER_SET`, + :py:attr:`~ddbcereal.DynamoDBType.STRING_SET`, or + :py:attr:`~ddbcereal.DynamoDBType.BINARY_SET`. 
Enumerations are available + on the ddbcereal top level module and the + :py:class:`~ddbcereal.DynamoDBType` enum. + +.. autoclass:: ddbcereal.DateFormat + :members: + +.. autoclass:: ddbcereal.DynamoDBType + :members: + :undoc-members: Deserialize DynamoDB Data into Python ------------------------------------- @@ -173,8 +186,8 @@ Deserializer Options :param bool allow_inexact: Whether to allow conversion to a Python number that won't exactly convey the value stored in DynamoDB (e.g. rounding of - significant digits is required). Deserializing numbers to floats is only - possible when this is enabled. + significant digits is required). Deserializing numbers to ``float``\ s is + only possible when this is enabled. :param bool raw_transport: Indicates to deserialize values to be transported without additional processing. Bytes will be transported as Base 64 @@ -182,14 +195,14 @@ Deserializer Options :param PythonNumber python_number: Determines how DynamoDB Numbers should be serialized. Possible enumerations are available on the ddbcereal top - level module and the PythonNumber enum: + level module and the :py:class:`PythonNumber` enum: .. autoclass:: ddbcereal.PythonNumber :members: :param python_null_value: The Python value to convert DynamoDB Nulls to. - Defaults to ``None``. An immutable value is recommended. Ignored if - ``python_null_factory`` is supplied. + Defaults to :py:class:`None`. An immutable value is recommended. Ignored + if ``python_null_factory`` is supplied. :param Callable[[], Any] python_null_factory: A function invoked for every DynamoDB Null value. The Null is converted to the return value of the @@ -202,9 +215,9 @@ conform to. They find appropriate Python types for the few types of data that DynamoDB can store. If you want to deserialize values into more advanced types, consider using a marshalling library like marshmallow or Pydantic. 
-They can take the dict produced by deserialize_item and create an object -based on a schema, an object with fields of built-in types like dates, deques -and of custom types. +They can take the dict produced by deserialize_item and create an object based +on a schema, an object with fields of built-in types like dates, deques and of +custom types. See :py:meth:`marshmallow.Schema.load` and