Skip to content

Commit

Permalink
Faster sets, empty set option, faster non-float serialize
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinTArthur committed Jul 26, 2021
1 parent df92f1f commit f3324ce
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 81 deletions.
5 changes: 4 additions & 1 deletion ddbcereal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,17 @@
from ddbcereal.serializing import Serializer
from ddbcereal.types import DateFormat, DynamoDBType, PythonNumber

VERSION = 2, 0, 1
VERSION = 2, 1, 0

ISO_8601 = DateFormat.ISO_8601
UNIX_MILLISECONDS = DateFormat.UNIX_MILLISECONDS
UNIX_SECONDS = DateFormat.UNIX_SECONDS

BINARY_SET = DynamoDBType.BINARY_SET
NUMBER = DynamoDBType.NUMBER
NUMBER_SET = DynamoDBType.NUMBER_SET
STRING = DynamoDBType.STRING
STRING_SET = DynamoDBType.STRING_SET

DECIMAL_ONLY = PythonNumber.DECIMAL_ONLY
FRACTION_ONLY = PythonNumber.FRACTION_ONLY
Expand Down
76 changes: 45 additions & 31 deletions ddbcereal/serializing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.

import decimal
import fractions
from base64 import b64encode
from collections.abc import ByteString, Set
from datetime import datetime
Expand All @@ -25,13 +24,11 @@

NoneType = type(None)


DDB_NUMBER_EMIN = -128
DDB_NUMBER_EMAX = 126
DDB_NUMBER_PREC = 38
INFINITY = decimal.Decimal('Infinity')
NAN = decimal.Decimal('NaN')
NUMBER_TYPES = decimal.Decimal, fractions.Fraction, int, float


class Serializer:
Expand All @@ -46,6 +43,7 @@ def __init__(
raw_transport=False,
datetime_format=DateFormat.ISO_8601,
fraction_type=DynamoDBType.NUMBER,
empty_set_type=DynamoDBType.NUMBER_SET
):
decimal_traps = [
decimal.Clamped,
Expand All @@ -61,21 +59,29 @@ def __init__(
_serialize_bytes = serialize_bytes

if validate_numbers:
serialize_num = self._serialize_number_strict
_serialize_float = self._serialize_float_strict
_serialize_number = self._serialize_number_strict
else:
_serialize_float = serialize_number
_serialize_number = serialize_number
self._serialize_num = _serialize_number

if fraction_type == DynamoDBType.NUMBER:
_serialize_fraction = self._serialize_fraction_as_number
else:
serialize_num = serialize_number
self._serialize_num = serialize_num
_serialize_fraction = serialize_any_as_string

self._type_methods: MutableMapping[type, Callable] = {
bool: serialize_bool,
bytes: _serialize_bytes,
bytearray: _serialize_bytes,
memoryview: _serialize_bytes,
datetime: date_serializers[datetime_format],
decimal.Decimal: serialize_num,
decimal.Decimal: _serialize_number,
dict: self._serialize_mapping,
float: serialize_num if allow_inexact else serialize_float_exact,
int: serialize_num,
float: _serialize_float,
Fraction: _serialize_fraction,
int: _serialize_number,
list: self._serialize_listlike,
Mapping: self._serialize_mapping,
NoneType: serialize_none,
Expand All @@ -85,11 +91,6 @@ def __init__(
str: serialize_str,
}

if fraction_type == DynamoDBType.NUMBER:
self._type_methods[Fraction] = self._serialize_fraction_as_number
else:
self._type_methods[Fraction] = serialize_any_as_string

decimal_ctx = decimal.Context(
Emin=DDB_NUMBER_EMIN,
Emax=DDB_NUMBER_EMAX,
Expand All @@ -100,6 +101,8 @@ def __init__(
self._create_decimal = decimal_ctx.create_decimal
self._decimal_divide = decimal_ctx.divide

self._empty_set = {empty_set_type.value: []}

def serialize(self, value: Any) -> DynamoDBValue:
value_type = type(value)
try:
Expand Down Expand Up @@ -130,6 +133,18 @@ def _serialize_fraction_as_number(self, value: Fraction):
def _serialize_number_strict(
self,
value: Union[int, float, decimal.Decimal]
):
try:
dec_value = self._create_decimal(value)
except decimal.Inexact:
raise NumberInexactError()
if dec_value in (INFINITY, NAN):
raise NumberNotAllowedError(f'{dec_value} not supported')
return {'N': str(dec_value)}

def _serialize_float_strict(
self,
value: Union[int, float, decimal.Decimal]
):
try:
dec_value = self._create_decimal(str(value))
Expand All @@ -144,23 +159,22 @@ def _serialize_listlike(self, value: Union[list, tuple]):

def _serialize_set(self, value: Set):
if all(isinstance(element, str) for element in value):
return {'SS': [element for element in value]}
if all(isinstance(element, NUMBER_TYPES) for element in value):
return {
'NS': [
val
for element in value
for val in self.serialize(element).values()
]
}
if all(isinstance(element, ByteString) for element in value):
return {
'BS': [
val
for element in value
for val in self.serialize(element).values()
]
}
# Shortcut to faster string set:
return {'SS': list(value)}
if not value:
return self._empty_set
vals = [
self.serialize(element)
for element in value
]
first_type = next(iter(vals[0]))
if (
first_type in {'N', 'S', 'B'}
and all(first_type in val for val in vals)
):
return {first_type + 'S': [val[first_type] for val in vals]}

raise ValueError('Invalid or mixed types in set.')

def _serialize_mapping(self, value: Mapping):
return {
Expand Down
7 changes: 5 additions & 2 deletions ddbcereal/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ class DateFormat(enum.Enum):


class DynamoDBType(enum.Enum):
NUMBER = enum.auto()
STRING = enum.auto()
NUMBER = 'N'
NUMBER_SET = 'NS'
STRING = 'S'
STRING_SET = 'SS'
BINARY_SET = 'BS'


class PythonNumber(enum.Enum):
Expand Down
7 changes: 7 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
Changelog
=========
2.1.0
-----
* Empty Python set is now serialized to a configurable Set type.
* Faster serialization for Number Sets and Binary Sets.
* Behind the scenes, DynamoDBType enumerations are now ``str``\ s of
their type symbol.

2.0.1
-----
* Fix exceptions from typing on pre-3.9 Python
Expand Down
41 changes: 21 additions & 20 deletions docs/performance.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ happen once in the lifecycle of an application.

Squeezing More Performance
--------------------------
By default, time is spent on validating that data being supplied to a
serializer will be allowed by DynamoDB once serialized. Significantly faster
serialization is possible when not validating input before sending it to
DynamoDB.
By default, time is spent validating that data being supplied to a serializer
will be allowed by DynamoDB once serialized. Significantly faster serialization
is possible when not validating input before sending it to DynamoDB.

.. code-block:: python
Expand All @@ -29,13 +28,15 @@ lookup every time ``serialize`` is called.

Known Limitations
-----------------
ddbcereal is faster than boto3 at serializing everything *except* for Number
Sets (e.g. `set[Decimal]`, `frozenset[int]`)

* Constructing a serializer or deserializer is slow. It should be done once and
the serializer or deserializer should be reused.
* Map serialization and deserialization uses recursion in its current
implementation, so deep Maps will use more memory and could take longer than
expected to process. boto3's Map processing has this same issue.

Benchmarks
----------
.. list-table:: Serializer Benchmarks (ddbcereal 1.0.0 cpython 3.9.4, 3.1 GHz
.. list-table:: Serializer Benchmarks (ddbcereal 2.1.0 cpython 3.9.4, 3.1 GHz
Intel Core i7)
:widths: 25 25 50
:header-rows: 1
Expand All @@ -47,26 +48,26 @@ Benchmarks
- 60x Slower
- 60x Slower
* - Decimal to Number
- 1.4x faster
- 2.9x faster
- 1.9x faster
- 2.8x faster
* - int to Number
- 1.4x faster
- 2.4x faster
- 2x faster
- 3x faster
* - str to String
- 3.6x faster
- 3.6x faster
* - Mixed number types Set to Number Set
- 1.2x slower
- 1.1x faster
- 1.4x faster
* - Set[str] to String Set
- 3.9x faster
- 3.9x faster
- 4.2x faster
- 4.2x faster
* - List of mixed types to List
- 3.1x faster
- 4.1x faster
- 3.4x faster
- 4x faster
* - dict of mixed types to Map
- 3.6x faster
- 4x faster
- 4.6x faster
* - dict of 2 levels to Map
- 3.4x faster
- 4.6x faster
- 4x faster
- 4.8x faster
67 changes: 40 additions & 27 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,40 +98,53 @@ your needs.
validate_numbers=True, \
raw_transport=False, \
datetime_format=ddbcereal.ISO_8601, \
fraction_type=ddbcereal.NUMBER)
fraction_type=ddbcereal.NUMBER, \
empty_set_type=ddbcereal.NUMBER_SET)

:param bool allow_inexact: Whether to allow numbers whose exact value can't
be represented in DynamoDB or Python. DynamoDB's Number type stores exact
numbers (fixed decimals). floats are considered inexact by their nature
numbers (fixed decimals). ``float``\ s are considered inexact by their nature
and are only accepted with this option enabled.

:param bool validate_numbers: Whether to check inputted numbers to determine
if they're valid for storage in DynamoDB and whether or not they conform
to the ``allow_inexact`` parameter.

When enabled, attempts to serialize invalid numbers will result in a
``ValueError`` being raised. When disabled, serialization is faster, but
mistakes might only be caught after the serialized value has been sent
to DynamoDB.
:py:exc:`ValueError` being raised. When disabled, serialization is
faster, but mistakes might only be caught after the serialized value has
been sent to DynamoDB.

:param bool raw_transport: Indicates that values have not been
pre-processed. For example, Base 64 strings have not been converted to
bytes. Use this when using the AWS HTTP API without an AWS SDK.

:param DateFormat datetime_format: Determines how Python datetimes should be
serialized. Possible enumerations are available on the ddbcereal top
level module and the DateFormat enum:

.. autoclass:: ddbcereal.DateFormat
:members:

:param DynamoDBType fraction_type: Determines how Python ``Fraction`` s should
be serialized. Possible enumerations are available on the ddbcereal top
level module and the DynamoDBType enum:

.. autoclass:: ddbcereal.DynamoDBType
:members:
:undoc-members:
:param DateFormat datetime_format: Determines how Python
:py:class:`~datetime.datetime`\ s should be serialized. Possible
enumerations are available on the ddbcereal top level module and the
:py:class:`~ddbcereal.DateFormat` enum.

:param DynamoDBType fraction_type: Determines how Python
:py:class:`~fractions.Fraction`\ s should be serialized. Must be
:py:attr:`~ddbcereal.DynamoDBType.NUMBER` or
:py:attr:`~ddbcereal.DynamoDBType.STRING`. Enumerations are available on
the ddbcereal top level module and the
:py:class:`~ddbcereal.DynamoDBType` enum.

:param DynamoDBType empty_set_type: When an empty set is serialized, make
the set this DynamoDB type. Must be
:py:attr:`~ddbcereal.DynamoDBType.NUMBER_SET`,
:py:attr:`~ddbcereal.DynamoDBType.STRING_SET`, or
:py:attr:`~ddbcereal.DynamoDBType.BINARY_SET`. Enumerations are available
on the ddbcereal top level module and the
:py:class:`~ddbcereal.DynamoDBType` enum.

.. autoclass:: ddbcereal.DateFormat
:members:

.. autoclass:: ddbcereal.DynamoDBType
:members:
:undoc-members:

Deserialize DynamoDB Data into Python
-------------------------------------
Expand Down Expand Up @@ -173,23 +186,23 @@ Deserializer Options

:param bool allow_inexact: Whether to allow conversion to a Python number
that won't exactly convey the value stored in DynamoDB (e.g. rounding of
significant digits is required). Deserializing numbers to floats is only
possible when this is enabled.
significant digits is required). Deserializing numbers to ``float``\ s is
only possible when this is enabled.

:param bool raw_transport: Indicates to deserialize values to be transported
without additional processing. Bytes will be transported as Base 64
strings. Use this when using the AWS HTTP API without an AWS SDK.

:param PythonNumber python_number: Determines how DynamoDB Numbers should be
deserialized. Possible enumerations are available on the ddbcereal top
level module and the PythonNumber enum:
level module and the :py:class:`PythonNumber` enum:

.. autoclass:: ddbcereal.PythonNumber
:members:

:param python_null_value: The Python value to convert DynamoDB Nulls to.
Defaults to ``None``. An immutable value is recommended. Ignored if
``python_null_factory`` is supplied.
Defaults to :py:class:`None`. An immutable value is recommended. Ignored
if ``python_null_factory`` is supplied.

:param Callable[[], Any] python_null_factory: A function invoked for every
DynamoDB Null value. The Null is converted to the return value of the
Expand All @@ -202,9 +215,9 @@ conform to. They find appropriate Python types for the few types of data that
DynamoDB can store. If you want to deserialize values into more advanced types,
consider using a marshalling library like marshmallow or Pydantic.

They can take the dict produced by deserialize_item and create an object
based on a schema, an object with fields of built-in types like dates, deques
and of custom types.
They can take the dict produced by deserialize_item and create an object based
on a schema, an object with fields of built-in types like dates, deques and of
custom types.

See
:py:meth:`marshmallow.Schema.load` and
Expand Down

0 comments on commit f3324ce

Please sign in to comment.