From 29b66549b433b3ae372a26883b7988579927bbf7 Mon Sep 17 00:00:00 2001 From: Sandy Spicer Date: Mon, 17 Jun 2024 09:35:50 -0700 Subject: [PATCH] fix: update compression settings and change library (#22992) --- .../test/test_tolerant_zlib_compressor.py | 15 ++++++++---- posthog/caching/tolerant_zlib_compressor.py | 23 +++++++++++++++---- posthog/settings/data_stores.py | 2 +- requirements.in | 3 ++- requirements.txt | 2 ++ 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/posthog/caching/test/test_tolerant_zlib_compressor.py b/posthog/caching/test/test_tolerant_zlib_compressor.py index acefa330fe228..40b234b0a8058 100644 --- a/posthog/caching/test/test_tolerant_zlib_compressor.py +++ b/posthog/caching/test/test_tolerant_zlib_compressor.py @@ -11,7 +11,8 @@ class TestTolerantZlibCompressor(TestCase): short_uncompressed_bytes = b"hello world" # needs to be long enough to trigger compression uncompressed_bytes = ("hello world hello world hello world hello world hello world" * 100).encode("utf-8") - compressed_bytes = b"x\x9c\xed\xcb\xb1\t\x00 \x0c\x00\xc1U2\x9cB\x8a@\xc0\xc6\xf5\x9d!\x95\xcdu_\xfc\xe5\xae\xea\xb8}jE\xcez\xb8\xa3(\x8a\xa2(\x8a\xa2(\x8a\xa2(\x8a\xa2(\x8a\xa2\xe8\x1f\xfa\x00\xaf\xed\xb6)" + zlib_compressed_bytes = b"x\x9c\xed\xcb\xb1\t\x00 \x0c\x00\xc1U2\x9cB\x8a@\xc0\xc6\xf5\x9d!\x95\xcdu_\xfc\xe5\xae\xea\xb8}jE\xcez\xb8\xa3(\x8a\xa2(\x8a\xa2(\x8a\xa2(\x8a\xa2(\x8a\xa2\xe8\x1f\xfa\x00\xaf\xed\xb6)" + zstd_compressed_bytes = b'(\xb5/\xfd`\x0c\x16\xbd\x02\x00`hello world \x80\xc7\xa8\xe0\xf77\xf0\x951\x12x\x81\xc1\xff\xbf\x7fDD\x94\x88\x880""JD\x84\x18\x11\x11%"D\x8c\x88\x88\x12!"FDD\t\x11\x11#""\x89\x88\x88\x11\x11\xa1DD\xc4\x88\x08Q""bD\x88(\x11\x11ed\xaa\xf9]\xa6' @parameterized.expand( [ @@ -25,7 +26,7 @@ class TestTolerantZlibCompressor(TestCase): "test_when_enabled_can_compress", True, uncompressed_bytes, - compressed_bytes, + zstd_compressed_bytes, ), ( "test_when_enabled_does_not_compress_small_values", @@ -51,13 +52,19 @@ def test_the_zlib_compressor_compression(self, _, setting: bool, input: bytes, o ( "test_when_enabled_can_decompress", True, - compressed_bytes, + zlib_compressed_bytes, + uncompressed_bytes, + ), + ( + "test_when_enabled_can_decompress_zstd", + True, + zstd_compressed_bytes, uncompressed_bytes, ), ( "test_when_disabled_can_still_decompress", False, - compressed_bytes, + zlib_compressed_bytes, uncompressed_bytes, ), ] diff --git a/posthog/caching/tolerant_zlib_compressor.py b/posthog/caching/tolerant_zlib_compressor.py index 5c9341bb2a5c1..45e359aa11360 100644 --- a/posthog/caching/tolerant_zlib_compressor.py +++ b/posthog/caching/tolerant_zlib_compressor.py @@ -1,4 +1,5 @@ import zlib +import zstd from django_redis.compressors.base import BaseCompressor @@ -19,6 +20,13 @@ """, ) +USING_ZLIB_VALUE_COUNTER = Counter( + "posthog_redis_using_zlib_value_counter", + """ + A counter to track cache keys that are still being decompressed with (deprecated) zlib + """, +) + class TolerantZlibCompressor(BaseCompressor): """ @@ -30,17 +38,24 @@ class TolerantZlibCompressor(BaseCompressor): """ # we don't want to compress all values, e.g. feature flag cache in decide is already small - min_length = 1024 - preset = 6 + min_length = 512 + zstd_preset = 0 + zstd_threads = 1 + zlib_preset = 6 def compress(self, value: bytes) -> bytes: if settings.USE_REDIS_COMPRESSION and len(value) > self.min_length: - return zlib.compress(value, self.preset) + return zstd.compress(value, self.zstd_preset, self.zstd_threads) return value def decompress(self, value: bytes) -> bytes: try: - return zlib.decompress(value) + try: + return zstd.decompress(value) + except zstd.Error: + r = zlib.decompress(value) # Phasing out zlib, it is 10x slower and compresses worse + USING_ZLIB_VALUE_COUNTER.inc() + return r except zlib.error: if settings.USE_REDIS_COMPRESSION: COULD_NOT_DECOMPRESS_VALUE_COUNTER.inc() diff --git a/posthog/settings/data_stores.py b/posthog/settings/data_stores.py index b2985242c5277..5606ad4f423fc 100644 --- a/posthog/settings/data_stores.py +++ b/posthog/settings/data_stores.py @@ -289,7 +289,7 @@ def _parse_kafka_hosts(hosts_string: str) -> list[str]: # Controls whether the TolerantZlibCompressor is used for Redis compression when writing to Redis. # The TolerantZlibCompressor is a drop-in replacement for the standard Django ZlibCompressor that # can cope with compressed and uncompressed reading at the same time -USE_REDIS_COMPRESSION = get_from_env("USE_REDIS_COMPRESSION", False, type_cast=str_to_bool) +USE_REDIS_COMPRESSION = get_from_env("USE_REDIS_COMPRESSION", True, type_cast=str_to_bool) # AWS ElastiCache supports "reader" endpoints. # See "Finding a Redis (Cluster Mode Disabled) Cluster's Endpoints (Console)" diff --git a/requirements.in b/requirements.in index f5d224b7892bf..14d23d0a3463e 100644 --- a/requirements.in +++ b/requirements.in @@ -94,4 +94,5 @@ openai==1.10.0 tiktoken==0.6.0 nh3==0.2.14 hogql-parser==1.0.14 -zxcvbn==4.4.28 \ No newline at end of file +zxcvbn==4.4.28 +zstd==1.5.5.1 diff --git a/requirements.txt b/requirements.txt index 23234cd68e276..c31cd102213b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -689,5 +689,7 @@ xmlsec==1.3.13 # via python3-saml yarl==1.7.2 # via aiohttp +zstd==1.5.5.1 + # via -r requirements.in zxcvbn==4.4.28 # via -r requirements.in