diff --git a/.gitignore b/.gitignore index cf3686a..0a233ef 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ tests/dumps/dump_dealers_vins.rdb tests/dumps/dump_random_lists.rdb tests/dumps/dump_sorted_sets.rdb +.idea/* \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index da8b5d3..dc0e4d1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,8 @@ language: python python: - "2.6" - "2.7" + - "3.4" + - "3.5" script: python run_tests diff --git a/CHANGES b/CHANGES index 9270be2..97d192c 100644 --- a/CHANGES +++ b/CHANGES @@ -1,11 +1,15 @@ -* 0.1.1 - * Fixed lzf decompression - * Standard python project layout - * Python script to automatically create test RDB files - * Adds test cases - * Adds setup.py to easily install this library - * Adds MIT license +* 0.1.9 + * python 3 support + * rdb v8 (redis 4.0) support + * binary to string conversion fixes + * use ujson/cStringIO/python-lzf if they're available + * filter keys by size + * bugfixes parsing sorted sets + * fix setup.py dependancies and remove requirements.txt file + +* 0.1.8 + * fix a crash in the memory profiler recently introduced. + +* 0.1.7 + * rdb v7 (redis 3.2) support -* 0.1.0 - * Initial version - * Specification for RDB file format diff --git a/README.md b/README.md index fcf2841..de23f7b 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ Rdbtools is written in Python, though there are similar projects in other langua Pre-Requisites : -1. python 2.x and pip. -2. redis-py is optional and only needed to run test cases. +1. redis-py is optional and only needed to run test cases. To install from PyPI (recommended) : @@ -27,29 +26,83 @@ To install from source : cd redis-rdb-tools sudo python setup.py install -## Converting dump files to JSON ## +# Command line usage examples + +Every run of RDB Tool requires to specify a command to indicate what should be done with the parsed RDB data. +Valid commands are: json, diff, justkeys, justkeyvals and protocol. + +JSON from a two database dump: + + > rdb --command json /var/redis/6379/dump.rdb -Parse the dump file and print the JSON on standard output + [{ + "user003":{"fname":"Ron","sname":"Bumquist"}, + "lizards":["Bush anole","Jackson's chameleon","Komodo dragon","Ground agama","Bearded dragon"], + "user001":{"fname":"Raoul","sname":"Duke"}, + "user002":{"fname":"Gonzo","sname":"Dr"}, + "user_list":["user003","user002","user001"]},{ + "baloon":{"helium":"birthdays","medical":"angioplasty","weather":"meteorology"}, + "armadillo":["chacoan naked-tailed","giant","Andean hairy","nine-banded","pink fairy"], + "aroma":{"pungent":"vinegar","putrid":"rotten eggs","floral":"roses"}}] - rdb --command json /var/redis/6379/dump.rdb +## Filter parsed output + +Only process keys that match the regex, and only print key and values: + + > rdb --command justkeyvals --key "user.*" /var/redis/6379/dump.rdb + + user003 fname Ron,sname Bumquist, + user001 fname Raoul,sname Duke, + user002 fname Gonzo,sname Dr, + user_list user003,user002,user001 -Only process keys that match the regex +Only process hashes starting with "a", in database 2: - rdb --command json --key "user.*" /var/redis/6379/dump.rdb + > rdb -c json --db 2 --type hash --key "a.*" /var/redis/6379/dump.rdb -Only process hashes starting with "a", in database 2 + [{},{ + "aroma":{"pungent":"vinegar","putrid":"rotten eggs","floral":"roses"}}] + +## Converting dump files to JSON ## + +The `json` command output is UTF-8 encoded JSON. +By default, the callback try to parse RDB data using UTF-8 and escape non 'ASCII printable' characters with the `\U` notation, or non UTF-8 parsable bytes with `\x`. +Attempting to decode RDB data can lead to binary data curroption, this can be avoided by using the `--escape raw` option. +Another option, is to use `-e base64` for Base64 encoding of binary data. + + +Parse the dump file and print the JSON on standard output: + + > rdb -c json /var/redis/6379/dump.rdb + + [{ + "Citat":["B\u00e4ttre sent \u00e4n aldrig","Bra karl reder sig sj\u00e4lv","Man ska inte k\u00f6pa grisen i s\u00e4cken"], + "bin_data":"\\xFE\u0000\u00e2\\xF2"}] + +Parse the dump file to raw bytes and print the JSON on standard output: - rdb --command json --db 2 --type hash --key "a.*" /var/redis/6379/dump.rdb + > rdb -c json /var/redis/6379/dump.rdb --escape raw + [{ + "Citat":["B\u00c3\u00a4ttre sent \u00c3\u00a4n aldrig","Bra karl reder sig sj\u00c3\u00a4lv","Man ska inte k\u00c3\u00b6pa grisen i s\u00c3\u00a4cken"], + "bin_data":"\u00fe\u0000\u00c3\u00a2\u00f2"}] ## Generate Memory Report ## -Running with the `-c memory` generates a CSV report with the approximate memory used by that key. +Running with the `-c memory` generates a CSV report with the approximate memory used by that key. `--bytes C` and `'--largest N` can be used to limit output to keys larger than C bytes, or the N largest keys. - rdb -c memory /var/redis/6379/dump.rdb > memory.csv + > rdb -c memory /var/redis/6379/dump.rdb --bytes 128 -f memory.csv + > cat memory.csv + database,type,key,size_in_bytes,encoding,num_elements,len_largest_element + 0,list,lizards,241,quicklist,5,19 + 0,list,user_list,190,quicklist,3,7 + 2,hash,baloon,138,ziplist,3,11 + 2,list,armadillo,231,quicklist,5,20 + 2,hash,aroma,129,ziplist,3,11 -The generated CSV has the following columns - Database Number, Data Type, Key, Memory Used in bytes and Encoding. + +The generated CSV has the following columns - Database Number, Data Type, Key, Memory Used in bytes and RDB Encoding type. Memory usage includes the key, the value and any other overheads. Note that the memory usage is approximate. In general, the actual memory used will be slightly higher than what is reported. @@ -62,17 +115,13 @@ The memory report should help you detect memory leaks caused by your application Sometimes you just want to find the memory used by a particular key, and running the entire memory report on the dump file is time consuming. -In such cases, you can use the `redis-memory-for-key` command - -Example : +In such cases, you can use the `redis-memory-for-key` command: - redis-memory-for-key person:1 + > redis-memory-for-key person:1 - redis-memory-for-key -s localhost -p 6379 -a mypassword person:1 - -Output : + > redis-memory-for-key -s localhost -p 6379 -a mypassword person:1 - Key "person:1" + Key person:1 Bytes 111 Type hash Encoding ziplist @@ -88,20 +137,20 @@ NOTE : First, use the --command diff option, and pipe the output to standard sort utility - rdb --command diff /var/redis/6379/dump1.rdb | sort > dump1.txt - rdb --command diff /var/redis/6379/dump2.rdb | sort > dump2.txt + > rdb --command diff /var/redis/6379/dump1.rdb | sort > dump1.txt + > rdb --command diff /var/redis/6379/dump2.rdb | sort > dump2.txt Then, run your favourite diff program - kdiff3 dump1.txt dump2.txt + > kdiff3 dump1.txt dump2.txt -To limit the size of the files, you can filter on keys using the --key=regex option +To limit the size of the files, you can filter on keys using the `--key` option ## Emitting Redis Protocol ## -You can convert RDB file into a stream of [redis protocol](http://redis.io/topics/protocol) using the "protocol" command. +You can convert RDB file into a stream of [redis protocol](http://redis.io/topics/protocol) using the `protocol` command. - rdb --command protocol /var/redis/6379/dump.rdb + > rdb --c protocol /var/redis/6379/dump.rdb *4 $4 @@ -113,37 +162,49 @@ You can convert RDB file into a stream of [redis protocol](http://redis.io/topic $8 Sripathi -You can pipe the output to netcat and re-import a subset of the data. -For example, if you want to shard your data into two redis instances, you can use the --key flag to select a subset of data, +You can pipe the output to netcat and re-import a subset of the data. +For example, if you want to shard your data into two redis instances, you can use the --key flag to select a subset of data, and then pipe the output to a running redis instance to load that data. - Read [Redis Mass Insert](http://redis.io/topics/mass-insert) for more information on this. -## Using the Parser ## +When printing protocol output, the `--escape` option can be used with `printable` or `utf8` to avoid non printable/control characters. + +# Using the Parser ## - import sys from rdbtools import RdbParser, RdbCallback + from rdbtools.encodehelpers import bytes_to_unicode - class MyCallback(RdbCallback) : - ''' Simple example to show how callback works. + class MyCallback(RdbCallback): + ''' Simple example to show how callback works. See RdbCallback for all available callback methods. See JsonCallback for a concrete example - ''' - def set(self, key, value, expiry): - print('%s = %s' % (str(key), str(value))) - + ''' + + def __init__(self): + super(MyCallback, self).__init__(string_escape=None) + + def encode_key(self, key): + return bytes_to_unicode(key, self._escape, skip_printable=True) + + def encode_value(self, val): + return bytes_to_unicode(val, self._escape) + + def set(self, key, value, expiry, info): + print('%s = %s' % (self.encode_key(key), self.encode_value(value))) + def hset(self, key, field, value): - print('%s.%s = %s' % (str(key), str(field), str(value))) - + print('%s.%s = %s' % (self.encode_key(key), self.encode_key(field), self.encode_value(value))) + def sadd(self, key, member): - print('%s has {%s}' % (str(key), str(member))) - - def rpush(self, key, value) : - print('%s has [%s]' % (str(key), str(value))) - + print('%s has {%s}' % (self.encode_key(key), self.encode_value(member))) + + def rpush(self, key, value): + print('%s has [%s]' % (self.encode_key(key), self.encode_value(value))) + def zadd(self, key, score, member): print('%s has {%s : %s}' % (str(key), str(member), str(score))) + callback = MyCallback() parser = RdbParser(callback) parser.parse('/var/redis/6379/dump.rdb') diff --git a/rdbtools/__init__.py b/rdbtools/__init__.py index bf49c26..8a8d0d8 100644 --- a/rdbtools/__init__.py +++ b/rdbtools/__init__.py @@ -1,10 +1,10 @@ from rdbtools.parser import RdbCallback, RdbParser, DebugCallback -from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback -from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator +from rdbtools.callbacks import JSONCallback, DiffCallback, ProtocolCallback, KeyValsOnlyCallback, KeysOnlyCallback +from rdbtools.memprofiler import MemoryCallback, PrintAllKeys, StatsAggregator, PrintJustKeys -__version__ = '0.1.8' +__version__ = '0.1.9' VERSION = tuple(map(int, __version__.split('.'))) __all__ = [ - 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'PrintAllKeys'] + 'RdbParser', 'RdbCallback', 'JSONCallback', 'DiffCallback', 'MemoryCallback', 'ProtocolCallback', 'KeyValsOnlyCallback', 'KeysOnlyCallback', 'PrintJustKeys'] diff --git a/rdbtools/callbacks.py b/rdbtools/callbacks.py index 4e76eb6..860c3c9 100644 --- a/rdbtools/callbacks.py +++ b/rdbtools/callbacks.py @@ -1,115 +1,39 @@ import calendar -import re -from decimal import Decimal -import sys -import struct -from rdbtools.parser import RdbCallback, RdbParser - -ESCAPE = re.compile(ur'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]') -ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') -HAS_UTF8 = re.compile(r'[\x80-\xff]') -ESCAPE_DCT = { - '\\': '\\\\', - '"': '\\"', - '\b': '\\b', - '\f': '\\f', - '\n': '\\n', - '\r': '\\r', - '\t': '\\t', - u'\u2028': '\\u2028', - u'\u2029': '\\u2029', -} -for i in range(0x20): - ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) - -def _floatconstants(): - _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') - # The struct module in Python 2.4 would get frexp() out of range here - # when an endian is specified in the format string. Fixed in Python 2.5+ - if sys.byteorder != 'big': - _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] - nan, inf = struct.unpack('dd', _BYTES) - return nan, inf, -inf - -NaN, PosInf, NegInf = _floatconstants() - -def _encode_basestring(s): - """Return a JSON representation of a Python string""" - if isinstance(s, str) and HAS_UTF8.search(s) is not None: - s = s.decode('utf-8') - def replace(match): - return ESCAPE_DCT[match.group(0)] - return u'"' + ESCAPE.sub(replace, s) + u'"' - -def _encode_basestring_ascii(s): - """Return an ASCII-only JSON representation of a Python string - - """ - try : - if isinstance(s, str) and HAS_UTF8.search(s) is not None: - s = s.decode('utf-8') - except: - pass +import codecs +import json - def replace(match): - s = match.group(0) - try: - return ESCAPE_DCT[s] - except KeyError: - n = ord(s) - if n < 0x10000: - #return '\\u{0:04x}'.format(n) - return '\\u%04x' % (n,) - else: - # surrogate pair - n -= 0x10000 - s1 = 0xd800 | ((n >> 10) & 0x3ff) - s2 = 0xdc00 | (n & 0x3ff) - return '\\u%04x\\u%04x' % (s1, s2) - return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' - -def _encode(s, quote_numbers = True): - if quote_numbers: - qn = '"' - else: - qn = '' - if isinstance(s, int) or isinstance(s, long): - return qn + str(s) + qn - elif isinstance(s, float): - if s != s: - return "NaN" - elif s == PosInf: - return "Infinity" - elif s == NegInf: - return "-Infinity" - else: - return qn + str(s) + qn - else: - return _encode_basestring_ascii(s) - -def encode_key(s): - return _encode(s, quote_numbers=True) - -def encode_value(s): - return _encode(s, quote_numbers=False) +from rdbtools.compat import isinteger +from rdbtools.parser import RdbCallback +from rdbtools import encodehelpers class JSONCallback(RdbCallback): - def __init__(self, out): + def __init__(self, out, string_escape=None): + if string_escape is None: + string_escape = encodehelpers.STRING_ESCAPE_UTF8 + super(JSONCallback, self).__init__(string_escape) self._out = out self._is_first_db = True self._has_databases = False self._is_first_key_in_db = True self._elements_in_key = 0 self._element_index = 0 - + + def encode_key(self, key): + key = encodehelpers.bytes_to_unicode(key, self._escape, skip_printable=True) + return codecs.encode(json.dumps(key), 'utf-8') + + def encode_value(self, val): + val = encodehelpers.bytes_to_unicode(val, self._escape) + return codecs.encode(json.dumps(val), 'utf-8') + def start_rdb(self): - self._out.write('[') + self._out.write(b'[') def start_database(self, db_number): if not self._is_first_db: - self._out.write('},') - self._out.write('{') + self._out.write(b'},') + self._out.write(b'{') self._is_first_db = False self._has_databases = True self._is_first_key_in_db = True @@ -119,13 +43,13 @@ def end_database(self, db_number): def end_rdb(self): if self._has_databases: - self._out.write('}') - self._out.write(']') + self._out.write(b'}') + self._out.write(b']') def _start_key(self, key, length): if not self._is_first_key_in_db: - self._out.write(',') - self._out.write('\r\n') + self._out.write(b',') + self._out.write(b'\r\n') self._is_first_key_in_db = False self._elements_in_key = length self._element_index = 0 @@ -135,71 +59,188 @@ def _end_key(self, key): def _write_comma(self): if self._element_index > 0 and self._element_index < self._elements_in_key : - self._out.write(',') + self._out.write(b',') self._element_index = self._element_index + 1 def set(self, key, value, expiry, info): self._start_key(key, 0) - self._out.write('%s:%s' % (encode_key(key), encode_value(value))) + self._out.write(self.encode_key(key) + b':' + self.encode_value(value)) def start_hash(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s:{' % encode_key(key)) + self._out.write(self.encode_key(key) + b':{') def hset(self, key, field, value): self._write_comma() - self._out.write('%s:%s' % (encode_key(field), encode_value(value))) + self._out.write(self.encode_key(field) + b':' + self.encode_value(value)) def end_hash(self, key): self._end_key(key) - self._out.write('}') + self._out.write(b'}') def start_set(self, key, cardinality, expiry, info): self._start_key(key, cardinality) - self._out.write('%s:[' % encode_key(key)) + self._out.write(self.encode_key(key) + b':[') def sadd(self, key, member): self._write_comma() - self._out.write('%s' % encode_value(member)) + self._out.write(self.encode_value(member)) def end_set(self, key): self._end_key(key) - self._out.write(']') + self._out.write(b']') def start_list(self, key, expiry, info): self._start_key(key, 0) - self._out.write('%s:[' % encode_key(key)) + self._out.write(self.encode_key(key) + b':[') def rpush(self, key, value) : self._elements_in_key += 1 self._write_comma() - self._out.write('%s' % encode_value(value)) + self._out.write(self.encode_value(value)) def end_list(self, key, info): self._end_key(key) - self._out.write(']') + self._out.write(b']') def start_sorted_set(self, key, length, expiry, info): self._start_key(key, length) - self._out.write('%s:{' % encode_key(key)) + self._out.write(self.encode_key(key) + b':{') def zadd(self, key, score, member): self._write_comma() - self._out.write('%s:%s' % (encode_key(member), encode_value(score))) + self._out.write(self.encode_key(member) + b':' + self.encode_value(score)) + + def end_sorted_set(self, key): + self._end_key(key) + self._out.write(b'}') + + +class KeysOnlyCallback(RdbCallback): + def __init__(self, out, string_escape=None): + super(KeysOnlyCallback, self).__init__(string_escape) + self._out = out + + def _keyout(self, key): + self._out.write(self.encode_key(key) + b'\n') + + def set(self, key, value, expiry, info): + self._keyout(key) + + def start_hash(self, key, length, expiry, info): + self._keyout(key) + + def hset(self, key, field, value): + self._keyout(key) + + def start_set(self, key, cardinality, expiry, info): + self._keyout(key) + + def sadd(self, key, member): + self._keyout(key) + + def start_list(self, key, expiry, info): + self._keyout(key) + + def rpush(self, key, value) : + self._keyout(key) + + def start_sorted_set(self, key, length, expiry, info): + self._keyout(key) + + def zadd(self, key, score, member): + self._keyout(key) + + +class KeyValsOnlyCallback(RdbCallback): + def __init__(self, out, string_escape=None): + super(KeyValsOnlyCallback, self).__init__(string_escape) + self._out = out + self._is_first_db = True + self._has_databases = False + self._is_first_key_in_db = True + self._elements_in_key = 0 + self._element_index = 0 + + def _start_key(self, key, length): + if not self._is_first_key_in_db: + self._out.write(b',') + self._out.write(b'\r\n') + self._is_first_key_in_db = False + self._elements_in_key = length + self._element_index = 0 + + def _end_key(self, key): + pass + + def _write_comma(self): + if self._element_index > 0 and self._element_index < self._elements_in_key : + self._out.write(b',') + self._element_index = self._element_index + 1 + + def set(self, key, value, expiry, info): + self._start_key(key, 0) + self._out.write(self.encode_key(key) + b' ' + self.encode_value(value)) + + def start_hash(self, key, length, expiry, info): + self._start_key(key, length) + self._out.write(self.encode_key(key) + b' ') + + def hset(self, key, field, value): + self._write_comma() + self._out.write(self.encode_key(field) + b' ' + self.encode_value(value)) + + def end_hash(self, key): + self._end_key(key) + + def start_set(self, key, cardinality, expiry, info): + self._start_key(key, cardinality) + self._out.write(self.encode_key(key) + b' ') + + def sadd(self, key, member): + self._write_comma() + self._out.write(self.encode_value(member)) + + def end_set(self, key): + self._end_key(key) + + def start_list(self, key, expiry, info): + self._start_key(key, 0) + self._out.write(self.encode_key(key) + b' ') + + def rpush(self, key, value) : + self._elements_in_key += 1 + self._write_comma() + self._out.write(self.encode_value(value)) + + def end_list(self, key, info): + self._end_key(key) + + def start_sorted_set(self, key, length, expiry, info): + self._start_key(key, length) + self._out.write(self.encode_key(key) + b' ') + + def zadd(self, key, score, member): + self._write_comma() + self._out.write(self.encode_key(member) + b' ' + self.encode_value(score)) def end_sorted_set(self, key): self._end_key(key) - self._out.write('}') class DiffCallback(RdbCallback): '''Prints the contents of RDB in a format that is unix sort friendly, so that two rdb files can be diffed easily''' - def __init__(self, out): + def __init__(self, out, string_escape=None): + if string_escape is None: + string_escape = encodehelpers.STRING_ESCAPE_PRINT + super(DiffCallback, self).__init__(string_escape) self._out = out self._index = 0 self._dbnum = 0 - + + def dbstr(self): + return b'db=' + encodehelpers.int2bytes(self._dbnum) + b' ' def start_rdb(self): pass @@ -213,14 +254,15 @@ def end_rdb(self): pass def set(self, key, value, expiry, info): - self._out.write('db=%d %s -> %s' % (self._dbnum, encode_key(key), encode_value(value))) + self._out.write(self.dbstr() + self.encode_key(key) + b' -> ' + self.encode_value(value)) self.newline() def start_hash(self, key, length, expiry, info): pass def hset(self, key, field, value): - self._out.write('db=%d %s . %s -> %s' % (self._dbnum, encode_key(key), encode_key(field), encode_value(value))) + self._out.write( + self.dbstr() + self.encode_key(key) + b' . ' + self.encode_key(field) + b' -> ' + self.encode_value(value)) self.newline() def end_hash(self, key): @@ -230,7 +272,7 @@ def start_set(self, key, cardinality, expiry, info): pass def sadd(self, key, member): - self._out.write('db=%d %s { %s }' % (self._dbnum, encode_key(key), encode_value(member))) + self._out.write(self.dbstr() + self.encode_key(key) + b' { ' + self.encode_value(member) + b' }') self.newline() def end_set(self, key): @@ -240,7 +282,8 @@ def start_list(self, key, expiry, info): self._index = 0 def rpush(self, key, value) : - self._out.write('db=%d %s[%d] -> %s' % (self._dbnum, encode_key(key), self._index, encode_value(value))) + istr = encodehelpers.int2bytes(self._index) + self._out.write(self.dbstr() + self.encode_key(key) + b'[' + istr + b'] -> ' + self.encode_value(value)) self.newline() self._index = self._index + 1 @@ -248,18 +291,18 @@ def end_list(self, key, info): pass def start_sorted_set(self, key, length, expiry, info): - self._index = 0 - + pass + def zadd(self, key, score, member): - self._out.write('db=%d %s[%d] -> {%s, score=%s}' % (self._dbnum, encode_key(key), self._index, encode_key(member), encode_value(score))) + self._out.write(self.dbstr() + self.encode_key(key) + + b' -> {' + self.encode_key(member) + b', score=' + self.encode_value(score) + b'}') self.newline() - self._index = self._index + 1 def end_sorted_set(self, key): pass def newline(self): - self._out.write('\r\n') + self._out.write(b'\r\n') def _unix_timestamp(dt): @@ -267,7 +310,8 @@ def _unix_timestamp(dt): class ProtocolCallback(RdbCallback): - def __init__(self, out): + def __init__(self, out, string_escape=None): + super(ProtocolCallback, self).__init__(string_escape) self._out = out self.reset() @@ -294,10 +338,11 @@ def post_expiry(self, key): self.expireat(key, self.get_expiry_seconds(key)) def emit(self, *args): - self._out.write(u"*" + unicode(len(args)) + u"\r\n") + self._out.write(codecs.encode("*{}\r\n".format(len(args)), 'ascii')) for arg in args: - self._out.write(u"$" + unicode(len(unicode(arg))) + u"\r\n") - self._out.write(unicode(arg) + u"\r\n") + val = encodehelpers.apply_escape_bytes(arg, self._escape) + self._out.write(codecs.encode("${}\r\n".format(len(val)), 'ascii')) + self._out.write(val + b"\r\n") def start_database(self, db_number): self.reset() @@ -307,7 +352,7 @@ def start_database(self, db_number): def set(self, key, value, expiry, info): self.pre_expiry(key, expiry) - self.emit('SET', key, value) + self.emit(b'SET', key, value) self.post_expiry(key) # Hash handling @@ -316,7 +361,7 @@ def start_hash(self, key, length, expiry, info): self.pre_expiry(key, expiry) def hset(self, key, field, value): - self.emit('HSET', key, field, value) + self.emit(b'HSET', key, field, value) def end_hash(self, key): self.post_expiry(key) @@ -327,7 +372,7 @@ def start_set(self, key, cardinality, expiry, info): self.pre_expiry(key, expiry) def sadd(self, key, member): - self.emit('SADD', key, member) + self.emit(b'SADD', key, member) def end_set(self, key): self.post_expiry(key) @@ -338,7 +383,7 @@ def start_list(self, key, expiry, info): self.pre_expiry(key, expiry) def rpush(self, key, value): - self.emit('RPUSH', key, value) + self.emit(b'RPUSH', key, value) def end_list(self, key, info): self.post_expiry(key) @@ -349,7 +394,7 @@ def start_sorted_set(self, key, length, expiry, info): self.pre_expiry(key, expiry) def zadd(self, key, score, member): - self.emit('ZADD', key, score, member) + self.emit(b'ZADD', key, score, member) def end_sorted_set(self, key): self.post_expiry(key) @@ -357,7 +402,7 @@ def end_sorted_set(self, key): # Other misc commands def select(self, db_number): - self.emit('SELECT', db_number) + self.emit(b'SELECT', db_number) def expireat(self, key, timestamp): - self.emit('EXPIREAT', key, timestamp) + self.emit(b'EXPIREAT', key, timestamp) diff --git a/rdbtools/cli/rdb.py b/rdbtools/cli/rdb.py index 348102f..fddefa9 100755 --- a/rdbtools/cli/rdb.py +++ b/rdbtools/cli/rdb.py @@ -2,7 +2,8 @@ import os import sys from optparse import OptionParser -from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys +from rdbtools import RdbParser, JSONCallback, DiffCallback, MemoryCallback, ProtocolCallback, PrintAllKeys, KeysOnlyCallback, KeyValsOnlyCallback +from rdbtools.encodehelpers import ESCAPE_CHOICES VALID_TYPES = ("hash", "set", "string", "list", "sortedset") def main(): @@ -12,17 +13,25 @@ def main(): parser = OptionParser(usage=usage) parser.add_option("-c", "--command", dest="command", - help="Command to execute. Valid commands are json, diff, and protocol", metavar="FILE") + help="Command to execute. Valid commands are json, diff, justkeys, justkeyvals and protocol", metavar="FILE") parser.add_option("-f", "--file", dest="output", help="Output file", metavar="FILE") parser.add_option("-n", "--db", dest="dbs", action="append", help="Database Number. Multiple databases can be provided. If not specified, all databases will be included.") parser.add_option("-k", "--key", dest="keys", default=None, help="Keys to export. This can be a regular expression") + parser.add_option("-o", "--not-key", dest="not_keys", default=None, + help="Keys Not to export. This can be a regular expression") parser.add_option("-t", "--type", dest="types", action="append", help="""Data types to include. Possible values are string, hash, set, sortedset, list. Multiple typees can be provided. If not specified, all data types will be returned""") - + parser.add_option("-b", "--bytes", dest="bytes", default=None, + help="Limit memory output to keys greater to or equal to this value (in bytes)") + parser.add_option("-l", "--largest", dest="largest", default=None, + help="Limit memory output to only the top N keys (by size)") + parser.add_option("-e", "--escape", dest="escape", choices=ESCAPE_CHOICES, + help="Escape strings to encoding: {} (default), {}, {}, or {}.".format(*ESCAPE_CHOICES)) + (options, args) = parser.parse_args() if len(args) == 0: @@ -40,6 +49,9 @@ def main(): if options.keys: filters['keys'] = options.keys + + if options.not_keys: + filters['not_keys'] = options.not_keys if options.types: filters['types'] = [] @@ -48,39 +60,33 @@ def main(): raise Exception('Invalid type provided - %s. Expected one of %s' % (x, (", ".join(VALID_TYPES)))) else: filters['types'].append(x) - - # TODO : Fix this ugly if-else code - if options.output: - with open(options.output, "wb") as f: - if 'diff' == options.command: - callback = DiffCallback(f) - elif 'json' == options.command: - callback = JSONCallback(f) - elif 'memory' == options.command: - reporter = PrintAllKeys(f) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(f) - else: - raise Exception('Invalid Command %s' % options.command) - parser = RdbParser(callback) - parser.parse(dump_file) - else: - if 'diff' == options.command: - callback = DiffCallback(sys.stdout) - elif 'json' == options.command: - callback = JSONCallback(sys.stdout) - elif 'memory' == options.command: - reporter = PrintAllKeys(sys.stdout) - callback = MemoryCallback(reporter, 64) - elif 'protocol' == options.command: - callback = ProtocolCallback(sys.stdout) + + out_file_obj = None + try: + if options.output: + out_file_obj = open(options.output, "wb") else: + # Prefer not to depend on Python stdout implementation for writing binary. + out_file_obj = os.fdopen(sys.stdout.fileno(), 'wb') + + try: + callback = { + 'diff': lambda f: DiffCallback(f, string_escape=options.escape), + 'json': lambda f: JSONCallback(f, string_escape=options.escape), + 'justkeys': lambda f: KeysOnlyCallback(f, string_escape=options.escape), + 'justkeyvals': lambda f: KeyValsOnlyCallback(f, string_escape=options.escape), + 'memory': lambda f: MemoryCallback(PrintAllKeys(f, options.bytes, options.largest), + 64, string_escape=options.escape), + 'protocol': lambda f: ProtocolCallback(f, string_escape=options.escape) + }[options.command](out_file_obj) + except: raise Exception('Invalid Command %s' % options.command) parser = RdbParser(callback, filters=filters) parser.parse(dump_file) - + finally: + if options.output and out_file_obj is not None: + out_file_obj.close() + if __name__ == '__main__': main() - diff --git a/rdbtools/cli/redis_memory_for_key.py b/rdbtools/cli/redis_memory_for_key.py index 2b63eb7..24bc6b9 100755 --- a/rdbtools/cli/redis_memory_for_key.py +++ b/rdbtools/cli/redis_memory_for_key.py @@ -3,14 +3,16 @@ import os import sys -try : - from StringIO import StringIO +try: + try: + from cStringIO import StringIO as BytesIO + except ImportError: + from StringIO import StringIO as BytesIO except ImportError: - from io import StringIO + from io import BytesIO from optparse import OptionParser from rdbtools import RdbParser, JSONCallback, MemoryCallback -from rdbtools.callbacks import encode_key from redis import StrictRedis from redis.exceptions import ConnectionError, ResponseError @@ -45,14 +47,15 @@ def print_memory_for_key(key, host='localhost', port=6379, db=0, password=None): reporter = PrintMemoryUsage() callback = MemoryCallback(reporter, 64) parser = RdbParser(callback, filters={}) - parser._key = key + # DUMP command only return the key data, so we hack RdbParser to inject key name as parsed bytes. + parser._key = key.encode('utf-8') raw_dump = redis.execute_command('dump', key) if not raw_dump: sys.stderr.write('Key %s does not exist\n' % key) sys.exit(-1) - stream = StringIO(raw_dump) + stream = BytesIO(raw_dump) data_type = read_unsigned_char(stream) parser.read_object(stream, data_type) @@ -83,9 +86,9 @@ def check_redis_version(redis): def read_unsigned_char(f) : return struct.unpack('B', f.read(1))[0] -class PrintMemoryUsage(): +class PrintMemoryUsage(object): def next_record(self, record) : - print("%s\t\t\t\t%s" % ("Key", encode_key(record.key))) + print("%s\t\t\t\t%s" % ("Key", record.key)) print("%s\t\t\t\t%s" % ("Bytes", record.bytes)) print("%s\t\t\t\t%s" % ("Type", record.type)) if record.type in ('set', 'list', 'sortedset', 'hash'): diff --git a/rdbtools/compat.py b/rdbtools/compat.py new file mode 100644 index 0000000..dc72cef --- /dev/null +++ b/rdbtools/compat.py @@ -0,0 +1,25 @@ +# python2->3 compat + +import sys, re + +try: + xrange + range = xrange +except NameError: + range = range + +try: + long + def isinteger(n): + return isinstance(n, int) or isinstance(n, long) +except NameError: + def isinteger(n): + return isinstance(n, int) + +if sys.version_info < (3,): + def str2regexp(pattern): + return re.compile(pattern) +else: + def str2regexp(pattern): + return re.compile(pattern.encode('utf-8')) + diff --git a/rdbtools/encodehelpers.py b/rdbtools/encodehelpers.py new file mode 100644 index 0000000..ab83a38 --- /dev/null +++ b/rdbtools/encodehelpers.py @@ -0,0 +1,154 @@ +from __future__ import print_function +import base64 +import codecs +import sys + +from .compat import isinteger + +STRING_ESCAPE_RAW = 'raw' +STRING_ESCAPE_PRINT = 'print' +STRING_ESCAPE_UTF8 = 'utf8' +STRING_ESCAPE_BASE64 = 'base64' +ESCAPE_CHOICES = [STRING_ESCAPE_RAW, STRING_ESCAPE_PRINT, STRING_ESCAPE_UTF8, STRING_ESCAPE_BASE64] + +if sys.version_info < (3,): + bval = ord + + def int2unistr(i): return codecs.decode(str(i), 'ascii') + int2bytes = str +else: + def bval(x): return x + + int2unistr = str + + def int2bytes(i): return codecs.encode(str(i), 'ascii') + +ASCII_ESCAPE_LOOKUP = [u'\\x00', u'\\x01', u'\\x02', u'\\x03', u'\\x04', u'\\x05', u'\\x06', u'\\x07', u'\\x08', + u'\\x09', u'\\x0A', u'\\x0B', u'\\x0C', u'\\x0D', u'\\x0E', u'\\x0F', u'\\x10', u'\\x11', + u'\\x12', u'\\x13', u'\\x14', u'\\x15', u'\\x16', u'\\x17', u'\\x18', u'\\x19', u'\\x1A', + u'\\x1B', u'\\x1C', u'\\x1D', u'\\x1E', u'\\x1F', u' ', u'!', u'"', u'#', u'$', u'%', u'&', u"'", + u'(', u')', u'*', u'+', u',', u'-', u'.', u'/', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', + u'8', u'9', u':', u';', u'<', u'=', u'>', u'?', u'@', u'A', u'B', u'C', u'D', u'E', u'F', u'G', + u'H', u'I', u'J', u'K', u'L', u'M', u'N', u'O', u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W', + u'X', u'Y', u'Z', u'[', u'\\', u']', u'^', u'_', u'`', u'a', u'b', u'c', u'd', u'e', u'f', u'g', + u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', + u'x', u'y', u'z', u'{', u'|', u'}', u'~', u'\\x7F', u'\\x80', u'\\x81', u'\\x82', u'\\x83', + u'\\x84', u'\\x85', u'\\x86', u'\\x87', u'\\x88', u'\\x89', u'\\x8A', u'\\x8B', u'\\x8C', + u'\\x8D', u'\\x8E', u'\\x8F', u'\\x90', u'\\x91', u'\\x92', u'\\x93', u'\\x94', u'\\x95', + u'\\x96', u'\\x97', u'\\x98', u'\\x99', u'\\x9A', u'\\x9B', u'\\x9C', u'\\x9D', u'\\x9E', + u'\\x9F', u'\\xA0', u'\\xA1', u'\\xA2', u'\\xA3', u'\\xA4', u'\\xA5', u'\\xA6', u'\\xA7', + u'\\xA8', u'\\xA9', u'\\xAA', u'\\xAB', u'\\xAC', u'\\xAD', u'\\xAE', u'\\xAF', u'\\xB0', + u'\\xB1', u'\\xB2', u'\\xB3', u'\\xB4', u'\\xB5', u'\\xB6', u'\\xB7', u'\\xB8', u'\\xB9', + u'\\xBA', u'\\xBB', u'\\xBC', u'\\xBD', u'\\xBE', u'\\xBF', u'\\xC0', u'\\xC1', u'\\xC2', + u'\\xC3', u'\\xC4', u'\\xC5', u'\\xC6', u'\\xC7', u'\\xC8', u'\\xC9', u'\\xCA', u'\\xCB', + u'\\xCC', u'\\xCD', u'\\xCE', u'\\xCF', u'\\xD0', u'\\xD1', u'\\xD2', u'\\xD3', u'\\xD4', + u'\\xD5', u'\\xD6', u'\\xD7', u'\\xD8', u'\\xD9', u'\\xDA', u'\\xDB', u'\\xDC', u'\\xDD', + u'\\xDE', u'\\xDF', u'\\xE0', u'\\xE1', u'\\xE2', u'\\xE3', u'\\xE4', u'\\xE5', u'\\xE6', + u'\\xE7', u'\\xE8', u'\\xE9', u'\\xEA', u'\\xEB', u'\\xEC', u'\\xED', u'\\xEE', u'\\xEF', + u'\\xF0', u'\\xF1', u'\\xF2', u'\\xF3', u'\\xF4', u'\\xF5', u'\\xF6', u'\\xF7', u'\\xF8', + u'\\xF9', u'\\xFA', u'\\xFB', u'\\xFC', u'\\xFD', u'\\xFE', u'\\xFF'] + +ASCII_ESCAPE_LOOKUP_BYTES = [b'\\x00', b'\\x01', b'\\x02', b'\\x03', b'\\x04', b'\\x05', b'\\x06', b'\\x07', b'\\x08', + b'\\x09', b'\\x0A', b'\\x0B', b'\\x0C', b'\\x0D', b'\\x0E', b'\\x0F', b'\\x10', b'\\x11', + b'\\x12', b'\\x13', b'\\x14', b'\\x15', b'\\x16', b'\\x17', b'\\x18', b'\\x19', b'\\x1A', + b'\\x1B', b'\\x1C', b'\\x1D', b'\\x1E', b'\\x1F', b' ', b'!', b'"', b'#', b'$', b'%', b'&', + b"'", b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b'0', b'1', b'2', b'3', b'4', b'5', + b'6', b'7', b'8', b'9', b':', b';', b'<', b'=', b'>', b'?', b'@', b'A', b'B', b'C', b'D', + b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', + b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_', b'`', b'a', b'b', + b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', + b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'{', b'|', b'}', b'~', b'\\x7F', + b'\\x80', b'\\x81', b'\\x82', b'\\x83', b'\\x84', b'\\x85', b'\\x86', b'\\x87', b'\\x88', + b'\\x89', b'\\x8A', b'\\x8B', b'\\x8C', b'\\x8D', b'\\x8E', b'\\x8F', b'\\x90', b'\\x91', + b'\\x92', b'\\x93', b'\\x94', b'\\x95', b'\\x96', b'\\x97', b'\\x98', b'\\x99', b'\\x9A', + b'\\x9B', b'\\x9C', b'\\x9D', b'\\x9E', b'\\x9F', b'\\xA0', b'\\xA1', b'\\xA2', b'\\xA3', + b'\\xA4', b'\\xA5', b'\\xA6', b'\\xA7', b'\\xA8', b'\\xA9', b'\\xAA', b'\\xAB', b'\\xAC', + b'\\xAD', b'\\xAE', b'\\xAF', b'\\xB0', b'\\xB1', b'\\xB2', b'\\xB3', b'\\xB4', b'\\xB5', + b'\\xB6', b'\\xB7', b'\\xB8', b'\\xB9', b'\\xBA', b'\\xBB', b'\\xBC', b'\\xBD', b'\\xBE', + b'\\xBF', b'\\xC0', b'\\xC1', b'\\xC2', b'\\xC3', b'\\xC4', b'\\xC5', b'\\xC6', b'\\xC7', + b'\\xC8', b'\\xC9', b'\\xCA', b'\\xCB', b'\\xCC', b'\\xCD', b'\\xCE', b'\\xCF', b'\\xD0', + b'\\xD1', b'\\xD2', b'\\xD3', b'\\xD4', b'\\xD5', b'\\xD6', b'\\xD7', b'\\xD8', b'\\xD9', + b'\\xDA', b'\\xDB', b'\\xDC', b'\\xDD', b'\\xDE', b'\\xDF', b'\\xE0', b'\\xE1', b'\\xE2', + b'\\xE3', b'\\xE4', b'\\xE5', b'\\xE6', b'\\xE7', b'\\xE8', b'\\xE9', b'\\xEA', b'\\xEB', + b'\\xEC', b'\\xED', b'\\xEE', b'\\xEF', b'\\xF0', b'\\xF1', b'\\xF2', b'\\xF3', b'\\xF4', + b'\\xF5', b'\\xF6', b'\\xF7', b'\\xF8', b'\\xF9', b'\\xFA', b'\\xFB', b'\\xFC', b'\\xFD', + b'\\xFE', b'\\xFF'] + + +def escape_ascii(bytes_data): + return u''.join(ASCII_ESCAPE_LOOKUP[bval(ch)] for ch in bytes_data) + + +def escape_ascii_bytes(bytes_data): + return b''.join(ASCII_ESCAPE_LOOKUP_BYTES[bval(ch)] for ch in bytes_data) + + +def escape_utf8_error(err): + return escape_ascii(err.object[err.start:err.end]), err.end + +codecs.register_error('rdbslashescape', escape_utf8_error) + + +def escape_utf8(byte_data): + return byte_data.decode('utf-8', 'rdbslashescape') + + +def bytes_to_unicode(byte_data, escape, skip_printable=False): + """ + Decode given bytes using specified escaping method. + :param byte_data: The byte-like object with bytes to decode. + :param escape: The escape method to use. + :param skip_printable: If True, don't escape byte_data with all 'printable ASCII' bytes. Defaults to False. + :return: New unicode string, escaped with the specified method if needed. + """ + if isinteger(byte_data): + if skip_printable: + return int2unistr(byte_data) + else: + byte_data = int2bytes(byte_data) + else: + assert (isinstance(byte_data, type(b''))) + if skip_printable and all(0x20 <= bval(ch) <= 0x7E for ch in byte_data): + escape = STRING_ESCAPE_RAW + + if escape == STRING_ESCAPE_RAW: + return byte_data.decode('latin-1') + elif escape == STRING_ESCAPE_PRINT: + return escape_ascii(byte_data) + elif escape == STRING_ESCAPE_UTF8: + return escape_utf8(byte_data) + elif escape == STRING_ESCAPE_BASE64: + return codecs.decode(base64.b64encode(byte_data), 'latin-1') + else: + raise UnicodeEncodeError("Unknown escape option") + + +def apply_escape_bytes(byte_data, escape, skip_printable=False): + """ + Apply the specified escape method on the given bytes. + :param byte_data: The byte-like object with bytes to escape. + :param escape: The escape method to use. + :param skip_printable: If True, don't escape byte_data with all 'printable ASCII' bytes. Defaults to False. + :return: new bytes object with the escaped bytes or byte_data itself on some no-op cases. + """ + + if isinteger(byte_data): + if skip_printable: + return int2bytes(byte_data) + else: + byte_data = int2bytes(byte_data) + else: + assert (isinstance(byte_data, type(b''))) + if skip_printable and all(0x20 <= bval(ch) <= 0x7E for ch in byte_data): + escape = STRING_ESCAPE_RAW + + if escape == STRING_ESCAPE_RAW: + return byte_data + elif escape == STRING_ESCAPE_PRINT: + return escape_ascii_bytes(byte_data) + elif escape == STRING_ESCAPE_UTF8: + return codecs.encode(escape_utf8(byte_data), 'utf-8') + elif escape == STRING_ESCAPE_BASE64: + return base64.b64encode(byte_data) + else: + raise UnicodeEncodeError("Unknown escape option") diff --git a/rdbtools/memprofiler.py b/rdbtools/memprofiler.py index f8891dd..0059587 100644 --- a/rdbtools/memprofiler.py +++ b/rdbtools/memprofiler.py @@ -1,11 +1,17 @@ +import codecs from collections import namedtuple import random -import json import bisect from distutils.version import StrictVersion +try: + import ujson as json +except: + import json from rdbtools.parser import RdbCallback -from rdbtools.callbacks import encode_key +from rdbtools.encodehelpers import bytes_to_unicode + +from heapq import heappush, nlargest, heappop ZSKIPLIST_MAXLEVEL=32 ZSKIPLIST_P=0.25 @@ -13,7 +19,7 @@ MemoryRecord = namedtuple('MemoryRecord', ['database', 'type', 'key', 'bytes', 'encoding','size', 'len_largest_element']) -class StatsAggregator(): +class StatsAggregator(object): def __init__(self, key_groupings = None): self.aggregates = {} self.scatters = {} @@ -71,23 +77,53 @@ def add_scatter(self, heading, x, y): def get_json(self): return json.dumps({"aggregates":self.aggregates, "scatters":self.scatters, "histograms":self.histograms}) -class PrintAllKeys(): - def __init__(self, out): +class PrintAllKeys(object): + def __init__(self, out, bytes, largest): + self._bytes = bytes + self._largest = largest self._out = out - self._out.write("%s,%s,%s,%s,%s,%s,%s\n" % ("database", "type", "key", - "size_in_bytes", "encoding", "num_elements", "len_largest_element")) + headers = "%s,%s,%s,%s,%s,%s,%s\n" % ( + "database", "type", "key", "size_in_bytes", "encoding", "num_elements", "len_largest_element") + self._out.write(codecs.encode(headers, 'latin-1')) + + if self._largest is not None: + self._heap = [] def next_record(self, record) : if record.key is None: return # some records are not keys (e.g. dict) - self._out.write("%d,%s,%s,%d,%s,%d,%d\n" % (record.database, record.type, encode_key(record.key), - record.bytes, record.encoding, record.size, record.len_largest_element)) + if self._largest is None: + if self._bytes is None or record.bytes >= int(self._bytes): + rec_str = "%d,%s,%s,%d,%s,%d,%d\n" % ( + record.database, record.type, record.key, record.bytes, record.encoding, record.size, + record.len_largest_element) + self._out.write(codecs.encode(rec_str, 'latin-1')) + else: + heappush(self._heap, (record.bytes, record)) + + def end_rdb(self): + if self._largest is not None: + self._heap = nlargest(int(self._largest), self._heap) + self._largest = None + + while self._heap: + bytes, record = heappop(self._heap) + self.next_record(record) + +class PrintJustKeys(object): + def __init__(self, out): + self._out = out + def next_record(self, record): + self._out.write(codecs.encode("%s\n" % record.key, 'latin-1')) + + class MemoryCallback(RdbCallback): '''Calculates the memory used if this rdb file were loaded into RAM The memory usage is approximate, and based on heuristics. ''' - def __init__(self, stream, architecture, redis_version='3.2'): + def __init__(self, stream, architecture, redis_version='3.2', string_escape=None): + super(MemoryCallback, self).__init__(string_escape) self._stream = stream self._dbnum = 0 self._current_size = 0 @@ -110,6 +146,12 @@ def __init__(self, stream, architecture, redis_version='3.2'): self._long_size = 4 self._architecture = 32 + def emit_record(self, record_type, key, byte_count, encoding, size, largest_el): + if key is not None: + key = bytes_to_unicode(key, self._escape, skip_printable=True) + record = MemoryRecord(self._dbnum, record_type, key, byte_count, encoding, size, largest_el) + self._stream.next_record(record) + def start_rdb(self): pass @@ -128,33 +170,28 @@ def start_database(self, db_number): self._db_expires = 0 def end_database(self, db_number): - record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_keys), None, None, None) - self._stream.next_record(record) - record = MemoryRecord(self._dbnum, "dict", None, self.hashtable_overhead(self._db_expires), None, None, None) - self._stream.next_record(record) + self.emit_record("dict", None, self.hashtable_overhead(self._db_keys), None, None, None) + self.emit_record("dict", None, self.hashtable_overhead(self._db_expires), None, None, None) + if hasattr(self._stream, 'end_database'): + self._stream.end_database(db_number) def end_rdb(self): #print('internal fragmentation: %s' % self._total_internal_frag) - pass + if hasattr(self._stream, 'end_rdb'): + self._stream.end_rdb() def set(self, key, value, expiry, info): self._current_encoding = info['encoding'] - size = self.sizeof_string(key) + self.sizeof_string(value) + self.top_level_object_overhead() - size += 2*self.robj_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) + self.sizeof_string(value) length = element_length(value) - record = MemoryRecord(self._dbnum, "string", key, size, self._current_encoding, length, length) - self._stream.next_record(record) + self.emit_record("string", key, size, self._current_encoding, length, length) self.end_key() def start_hash(self, key, length, expiry, info): self._current_encoding = info['encoding'] self._current_length = length - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) if 'sizeof_value' in info: size += info['sizeof_value'] @@ -174,11 +211,12 @@ def hset(self, key, field, value): self._current_size += self.sizeof_string(field) self._current_size += self.sizeof_string(value) self._current_size += self.hashtable_entry_overhead() - self._current_size += 2*self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += 2*self.robj_overhead() def end_hash(self, key): - record = MemoryRecord(self._dbnum, "hash", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("hash", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_set(self, key, cardinality, expiry, info): @@ -192,11 +230,12 @@ def sadd(self, key, member): if self._current_encoding == 'hashtable': self._current_size += self.sizeof_string(member) self._current_size += self.hashtable_entry_overhead() - self._current_size += self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() def end_set(self, key): - record = MemoryRecord(self._dbnum, "set", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("set", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_list(self, key, expiry, info): @@ -204,10 +243,7 @@ def start_list(self, key, expiry, info): self._list_items_size = 0 self._list_items_zipped_size = 0 self._current_encoding = info['encoding'] - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) # ignore the encoding in the rdb, and predict the encoding that will be used at the target redis version if self._redis_version >= StrictVersion('3.2'): @@ -256,19 +292,17 @@ def end_list(self, key, info): else: # linkedlist self._current_size += self.linkedlist_entry_overhead() * self._current_length self._current_size += self.linkedlist_overhead() - self._current_size += self.robj_overhead() * self._current_length + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() * self._current_length self._current_size += self._list_items_size - record = MemoryRecord(self._dbnum, "list", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("list", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def start_sorted_set(self, key, length, expiry, info): self._current_length = length self._current_encoding = info['encoding'] - size = self.sizeof_string(key) - size += 2*self.robj_overhead() - size += self.top_level_object_overhead() - size += self.key_expiry_overhead(expiry) + size = self.top_level_object_overhead(key, expiry) if 'sizeof_value' in info: size += info['sizeof_value'] @@ -283,14 +317,15 @@ def zadd(self, key, score, member): self._len_largest_element = element_length(member) if self._current_encoding == 'skiplist': - self._current_size += 8 # self.sizeof_string(score) + self._current_size += 8 # score (double) self._current_size += self.sizeof_string(member) - self._current_size += 2*self.robj_overhead() + if self._redis_version < StrictVersion('4.0'): + self._current_size += self.robj_overhead() self._current_size += self.skiplist_entry_overhead() def end_sorted_set(self, key): - record = MemoryRecord(self._dbnum, "sortedset", key, self._current_size, self._current_encoding, self._current_length, self._len_largest_element) - self._stream.next_record(record) + self.emit_record("sortedset", key, self._current_size, self._current_encoding, self._current_length, + self._len_largest_element) self.end_key() def end_key(self): @@ -322,10 +357,10 @@ def sizeof_string(self, string): return self.malloc_overhead(l + 1 + 8 + 1) return self.malloc_overhead(l + 1 + 16 + 1) - def top_level_object_overhead(self): + def top_level_object_overhead(self, key, expiry): # Each top level object is an entry in a dictionary, and so we have to include # the overhead of a dictionary entry - return self.hashtable_entry_overhead() + return self.hashtable_entry_overhead() + self.sizeof_string(key) + self.robj_overhead() + self.key_expiry_overhead(expiry) def key_expiry_overhead(self, expiry): # If there is no expiry, there isn't any overhead @@ -441,12 +476,15 @@ def zset_random_level(self): return level else: return ZSKIPLIST_MAXLEVEL - + +MAXINT = 2**63 - 1 + def element_length(element): if isinstance(element, int): - return 8 - if isinstance(element, long): - return 16 + if element < - MAXINT - 1 or element > MAXINT: + return 16 + else: + return 8 else: return len(element) diff --git a/rdbtools/parser.py b/rdbtools/parser.py index a7be7a2..14c6972 100644 --- a/rdbtools/parser.py +++ b/rdbtools/parser.py @@ -4,14 +4,27 @@ import datetime import re -try : - from StringIO import StringIO +from rdbtools.encodehelpers import STRING_ESCAPE_RAW, apply_escape_bytes, bval +from .compat import range, str2regexp + +try: + try: + from cStringIO import StringIO as BytesIO + except ImportError: + from StringIO import StringIO as BytesIO except ImportError: - from io import StringIO + from io import BytesIO + +try: + import lzf + HAS_PYTHON_LZF = True +except ImportError: + HAS_PYTHON_LZF = False REDIS_RDB_6BITLEN = 0 REDIS_RDB_14BITLEN = 1 -REDIS_RDB_32BITLEN = 2 +REDIS_RDB_32BITLEN = 0x80 +REDIS_RDB_64BITLEN = 0x81 REDIS_RDB_ENCVAL = 3 REDIS_RDB_OPCODE_AUX = 250 @@ -26,6 +39,8 @@ REDIS_RDB_TYPE_SET = 2 REDIS_RDB_TYPE_ZSET = 3 REDIS_RDB_TYPE_HASH = 4 +REDIS_RDB_TYPE_ZSET_2 = 5 # ZSET version 2 with doubles stored in binary. +REDIS_RDB_TYPE_MODULE = 6 REDIS_RDB_TYPE_HASH_ZIPMAP = 9 REDIS_RDB_TYPE_LIST_ZIPLIST = 10 REDIS_RDB_TYPE_SET_INTSET = 11 @@ -39,7 +54,7 @@ REDIS_RDB_ENC_LZF = 3 DATA_TYPE_MAPPING = { - 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", + 0 : "string", 1 : "list", 2 : "set", 3 : "sortedset", 4 : "hash", 5 : "sortedset", 6 : "module", 9 : "hash", 10 : "list", 11 : "set", 12 : "sortedset", 13 : "hash", 14 : "list"} class RdbCallback(object): @@ -48,6 +63,24 @@ class RdbCallback(object): This callback provides a serial and fast access to the dump file. """ + def __init__(self, string_escape): + if string_escape is None: + self._escape = STRING_ESCAPE_RAW + else: + self._escape = string_escape + + def encode_key(self, key): + """ + Escape a given key bytes with the instance chosen escape method. + + Key is not escaped if it contains only 'ASCII printable' bytes. + """ + return apply_escape_bytes(key, self._escape, skip_printable=True) + + def encode_value(self, val): + """Escape a given value bytes with the instance chosen escape method.""" + return apply_escape_bytes(val, self._escape) + def start_rdb(self): """ Called once we know we are dealing with a valid redis dump file @@ -256,7 +289,7 @@ def end_rdb(self): """Called to indicate we have completed parsing of the dump file""" pass -class RdbParser : +class RdbParser(object): """ A Parser for Redis RDB Files @@ -363,8 +396,12 @@ def read_length_with_encoding(self, f) : elif enc_type == REDIS_RDB_14BITLEN : bytes.append(read_unsigned_char(f)) length = ((bytes[0]&0x3F)<<8)|bytes[1] - else : + elif bytes[0] == REDIS_RDB_32BITLEN: length = ntohl(f) + elif bytes[0] == REDIS_RDB_64BITLEN: + length = ntohu64(f) + else: + raise Exception('read_length_with_encoding', "Invalid string encoding %s (encoding byte 0x%X)" % (enc_type, bytes[0])) return (length, is_encoded) def read_length(self, f) : @@ -392,6 +429,19 @@ def read_string(self, f) : val = f.read(length) return val + def read_float(self, f): + dbl_length = read_unsigned_char(f) + if dbl_length == 253: + return float('nan') + elif dbl_length == 254: + return float('inf') + elif dbl_length == 255: + return float('-inf') + data = f.read(dbl_length) + if isinstance(data, str): + return float(data) + return data # bug? + # Read an object for the stream # f is the redis file # enc_type is the type of object @@ -406,7 +456,7 @@ def read_object(self, f, enc_type) : # and the last string is the tail of the list length = self.read_length(f) self._callback.start_list(self._key, self._expiry, info={'encoding':'linkedlist' }) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) self._callback.rpush(self._key, val) self._callback.end_list(self._key, info={'encoding':'linkedlist' }) @@ -416,25 +466,22 @@ def read_object(self, f, enc_type) : # Note that the order of strings is non-deterministic length = self.read_length(f) self._callback.start_set(self._key, length, self._expiry, info={'encoding':'hashtable'}) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) self._callback.sadd(self._key, val) self._callback.end_set(self._key) - elif enc_type == REDIS_RDB_TYPE_ZSET : + elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2 : length = self.read_length(f) self._callback.start_sorted_set(self._key, length, self._expiry, info={'encoding':'skiplist'}) - for count in xrange(0, length) : + for count in range(0, length) : val = self.read_string(f) - dbl_length = read_unsigned_char(f) - score = f.read(dbl_length) - if isinstance(score, str): - score = float(score) + score = read_binary_double(f) if enc_type == REDIS_RDB_TYPE_ZSET_2 else self.read_float(f) self._callback.zadd(self._key, score, val) self._callback.end_sorted_set(self._key) elif enc_type == REDIS_RDB_TYPE_HASH : length = self.read_length(f) self._callback.start_hash(self._key, length, self._expiry, info={'encoding':'hashtable'}) - for count in xrange(0, length) : + for count in range(0, length) : field = self.read_string(f) value = self.read_string(f) self._callback.hset(self._key, field, value) @@ -451,6 +498,8 @@ def read_object(self, f, enc_type) : self.read_hash_from_ziplist(f) elif enc_type == REDIS_RDB_TYPE_LIST_QUICKLIST: self.read_list_from_quicklist(f) + elif enc_type == REDIS_RDB_TYPE_MODULE : + raise Exception('read_object', 'Unable to read Redis Modules RDB objects (key %s)' % (enc_type, self._key)) else : raise Exception('read_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) @@ -478,6 +527,14 @@ def skip_string(self, f): bytes_to_skip = length skip(f, bytes_to_skip) + + def skip_float(self, f): + dbl_length = read_unsigned_char(f) + if dbl_length < 253: + skip(f, dbl_length) + + def skip_binary_double(self, f): + skip(f, 8) def skip_object(self, f, enc_type): skip_strings = 0 @@ -487,8 +544,11 @@ def skip_object(self, f, enc_type): skip_strings = self.read_length(f) elif enc_type == REDIS_RDB_TYPE_SET : skip_strings = self.read_length(f) - elif enc_type == REDIS_RDB_TYPE_ZSET : - skip_strings = self.read_length(f) * 2 + elif enc_type == REDIS_RDB_TYPE_ZSET or enc_type == REDIS_RDB_TYPE_ZSET_2 : + length = self.read_length(f) + for x in range(length): + skip_string(f) + skip_binary_double(f) if enc_type == REDIS_RDB_TYPE_ZSET_2 else skip_float(f) elif enc_type == REDIS_RDB_TYPE_HASH : skip_strings = self.read_length(f) * 2 elif enc_type == REDIS_RDB_TYPE_HASH_ZIPMAP : @@ -503,19 +563,21 @@ def skip_object(self, f, enc_type): skip_strings = 1 elif enc_type == REDIS_RDB_TYPE_LIST_QUICKLIST: skip_strings = self.read_length(f) + elif enc_type == REDIS_RDB_TYPE_MODULE: + raise Exception('skip_object', 'Unable to skip Redis Modules RDB objects (key %s)' % (enc_type, self._key)) else : raise Exception('skip_object', 'Invalid object type %d for key %s' % (enc_type, self._key)) - for x in xrange(0, skip_strings): + for x in range(0, skip_strings): self.skip_string(f) def read_intset(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) encoding = read_unsigned_int(buff) num_entries = read_unsigned_int(buff) self._callback.start_set(self._key, num_entries, self._expiry, info={'encoding':'intset', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : if encoding == 8 : entry = read_signed_long(buff) elif encoding == 4 : @@ -529,12 +591,12 @@ def read_intset(self, f) : def read_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) self._callback.start_list(self._key, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : val = self.read_ziplist_entry(buff) self._callback.rpush(self._key, val) zlist_end = read_unsigned_char(buff) @@ -546,14 +608,14 @@ def read_list_from_quicklist(self, f): count = self.read_length(f) total_size = 0 self._callback.start_list(self._key, self._expiry, info={'encoding': 'quicklist', 'zips': count}) - for i in xrange(0, count): + for i in range(0, count): raw_string = self.read_string(f) total_size += len(raw_string) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) - for x in xrange(0, num_entries): + for x in range(0, num_entries): self._callback.rpush(self._key, self.read_ziplist_entry(buff)) zlist_end = read_unsigned_char(buff) if zlist_end != 255: @@ -562,18 +624,18 @@ def read_list_from_quicklist(self, f): def read_zset_from_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) if (num_entries % 2) : raise Exception('read_zset_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) - num_entries = num_entries /2 + num_entries = num_entries // 2 self._callback.start_sorted_set(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : member = self.read_ziplist_entry(buff) score = self.read_ziplist_entry(buff) - if isinstance(score, str) : + if isinstance(score, bytes) : score = float(score) self._callback.zadd(self._key, score, member) zlist_end = read_unsigned_char(buff) @@ -583,15 +645,15 @@ def read_zset_from_ziplist(self, f) : def read_hash_from_ziplist(self, f) : raw_string = self.read_string(f) - buff = StringIO(raw_string) + buff = BytesIO(raw_string) zlbytes = read_unsigned_int(buff) tail_offset = read_unsigned_int(buff) num_entries = read_unsigned_short(buff) if (num_entries % 2) : raise Exception('read_hash_from_ziplist', "Expected even number of elements, but found %d for key %s" % (num_entries, self._key)) - num_entries = num_entries /2 + num_entries = num_entries // 2 self._callback.start_hash(self._key, num_entries, self._expiry, info={'encoding':'ziplist', 'sizeof_value':len(raw_string)}) - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : field = self.read_ziplist_entry(buff) value = self.read_ziplist_entry(buff) self._callback.hset(self._key, field, value) @@ -667,12 +729,12 @@ def read_zipmap_next_length(self, f) : return None def verify_magic_string(self, magic_string) : - if magic_string != 'REDIS' : + if magic_string != b'REDIS' : raise Exception('verify_magic_string', 'Invalid File Format') def verify_version(self, version_str) : version = int(version_str) - if version < 1 or version > 7: + if version < 1 or version > 8: raise Exception('verify_version', 'Invalid RDB version number %d' % version) self._rdb_version = version @@ -691,13 +753,18 @@ def init_filter(self, filters): raise Exception('init_filter', 'invalid value for dbs in filter %s' %filters['dbs']) if not ('keys' in filters and filters['keys']): - self._filters['keys'] = re.compile(".*") + self._filters['keys'] = re.compile(b".*") else: - self._filters['keys'] = re.compile(filters['keys']) + self._filters['keys'] = str2regexp(filters['keys']) + + if not ('not_keys' in filters and filters['not_keys']): + self._filters['not_keys'] = None + else: + self._filters['not_keys'] = str2regexp(filters['not_keys']) if not 'types' in filters: - self._filters['types'] = ('set', 'hash', 'sortedset', 'string', 'list') - elif isinstance(filters['types'], str): + self._filters['types'] = ('set', 'hash', 'sortedset', 'module', 'string', 'list') + elif isinstance(filters['types'], bytes): self._filters['types'] = (filters['types'], ) elif isinstance(filters['types'], list): self._filters['types'] = [str(x) for x in filters['types']] @@ -705,9 +772,19 @@ def init_filter(self, filters): raise Exception('init_filter', 'invalid value for types in filter %s' %filters['types']) def matches_filter(self, db_number, key=None, data_type=None): + + if isinstance(key, bytes): + key_to_match = key + elif isinstance(key, str): # bytes key in python2 + key_to_match = key + else: + key_to_match = str(key).encode('utf-8') + if self._filters['dbs'] and (not db_number in self._filters['dbs']): return False - if key and (not self._filters['keys'].match(str(key))): + if key and self._filters['not_keys'] and (self._filters['not_keys'].match(key_to_match)): + return False + if key and (not self._filters['keys'].match(key_to_match)): return False if data_type is not None and (not self.get_logical_type(data_type) in self._filters['types']): @@ -718,54 +795,65 @@ def get_logical_type(self, data_type): return DATA_TYPE_MAPPING[data_type] def lzf_decompress(self, compressed, expected_length): - in_stream = bytearray(compressed) - in_len = len(in_stream) - in_index = 0 - out_stream = bytearray() - out_index = 0 - - while in_index < in_len : - ctrl = in_stream[in_index] - if not isinstance(ctrl, int) : - raise Exception('lzf_decompress', 'ctrl should be a number %s for key %s' % (str(ctrl), self._key)) - in_index = in_index + 1 - if ctrl < 32 : - for x in xrange(0, ctrl + 1) : - out_stream.append(in_stream[in_index]) - #sys.stdout.write(chr(in_stream[in_index])) - in_index = in_index + 1 - out_index = out_index + 1 - else : - length = ctrl >> 5 - if length == 7 : - length = length + in_stream[in_index] - in_index = in_index + 1 - - ref = out_index - ((ctrl & 0x1f) << 8) - in_stream[in_index] - 1 + if HAS_PYTHON_LZF: + return lzf.decompress(compressed, expected_length) + else: + in_stream = bytearray(compressed) + in_len = len(in_stream) + in_index = 0 + out_stream = bytearray() + out_index = 0 + + while in_index < in_len : + ctrl = in_stream[in_index] + if not isinstance(ctrl, int) : + raise Exception('lzf_decompress', 'ctrl should be a number %s for key %s' % (str(ctrl), self._key)) in_index = in_index + 1 - for x in xrange(0, length + 2) : - out_stream.append(out_stream[ref]) - ref = ref + 1 - out_index = out_index + 1 - if len(out_stream) != expected_length : - raise Exception('lzf_decompress', 'Expected lengths do not match %d != %d for key %s' % (len(out_stream), expected_length, self._key)) - return str(out_stream) + if ctrl < 32 : + for x in range(0, ctrl + 1) : + out_stream.append(in_stream[in_index]) + #sys.stdout.write(chr(in_stream[in_index])) + in_index = in_index + 1 + out_index = out_index + 1 + else : + length = ctrl >> 5 + if length == 7 : + length = length + in_stream[in_index] + in_index = in_index + 1 + + ref = out_index - ((ctrl & 0x1f) << 8) - in_stream[in_index] - 1 + in_index = in_index + 1 + for x in range(0, length + 2) : + out_stream.append(out_stream[ref]) + ref = ref + 1 + out_index = out_index + 1 + if len(out_stream) != expected_length : + raise Exception('lzf_decompress', 'Expected lengths do not match %d != %d for key %s' % (len(out_stream), expected_length, self._key)) + return bytes(out_stream) def skip(f, free): if free : f.read(free) +def memrev(arr): + l = len(arr) + new_arr = bytearray(l) + for i in range(l): + new_arr[-i-1] = arr[i] + return str(new_arr) + def ntohl(f) : - val = read_unsigned_int(f) - new_val = 0 - new_val = new_val | ((val & 0x000000ff) << 24) - new_val = new_val | ((val & 0xff000000) >> 24) - new_val = new_val | ((val & 0x0000ff00) << 8) - new_val = new_val | ((val & 0x00ff0000) >> 8) - return new_val + val = memrev(f.read(4)) + return struct.unpack('I', val)[0] + +def ntohu64(f) : + val = memrev(f.read(8)) + return struct.unpack('Q', val)[0] def to_datetime(usecs_since_epoch): - seconds_since_epoch = usecs_since_epoch / 1000000 + seconds_since_epoch = usecs_since_epoch // 1000000 + if seconds_since_epoch > 221925052800 : + seconds_since_epoch = 221925052800 useconds = usecs_since_epoch % 1000000 dt = datetime.datetime.utcfromtimestamp(seconds_since_epoch) delta = datetime.timedelta(microseconds = useconds) @@ -793,7 +881,7 @@ def read_big_endian_unsigned_int(f): return struct.unpack('>I', f.read(4))[0] def read_24bit_signed_number(f): - s = '0' + f.read(3) + s = b'0' + f.read(3) num = struct.unpack('i', s)[0] return num >> 8 @@ -802,6 +890,9 @@ def read_signed_long(f) : def read_unsigned_long(f) : return struct.unpack('Q', f.read(8))[0] + +def read_binary_double(f) : + return struct.unpack('d', f.read(8))[0] def string_as_hexcode(string) : for s in string : diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a4a97ff..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -redis==2.4.12 -wsgiref==0.1.2 diff --git a/setup.py b/setup.py index 74b376d..e2d7fb8 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ 'packages' : ['rdbtools', 'rdbtools.cli'], 'package_data' : {'rdbtools.cli': ['*.template']}, 'test_suite' : 'tests.all_tests', + 'install_requires': ['redis'], 'entry_points' : { 'console_scripts' : [ 'rdb = rdbtools.cli.rdb:main', diff --git a/tests/__init__.py b/tests/__init__.py index f55c428..fab64de 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,9 +1,18 @@ import unittest from tests.parser_tests import RedisParserTestCase from tests.memprofiler_tests import MemoryCallbackTestCase +from tests.callbacks_tests import ProtocolTestCase, JsonTestCase, DiffTestCase, KeysTestCase, KeyValsTestCase + def all_tests(): suite = unittest.TestSuite() - suite.addTest(unittest.makeSuite(RedisParserTestCase)) - suite.addTest(unittest.makeSuite(MemoryCallbackTestCase)) + test_case_list = [RedisParserTestCase, + MemoryCallbackTestCase, + ProtocolTestCase, + JsonTestCase, + DiffTestCase, + KeysTestCase, + KeyValsTestCase] + for case in test_case_list: + suite.addTest(unittest.makeSuite(case)) return suite diff --git a/tests/callbacks_tests.py b/tests/callbacks_tests.py new file mode 100644 index 0000000..bb26ba3 --- /dev/null +++ b/tests/callbacks_tests.py @@ -0,0 +1,121 @@ +import os +import unittest +import random +import sys +from io import BytesIO + +from rdbtools import RdbParser +from rdbtools import encodehelpers +from rdbtools.callbacks import ProtocolCallback, JSONCallback, DiffCallback, KeysOnlyCallback, KeyValsOnlyCallback + +if sys.version_info < (3,): + def rand_bytes(count): + return ''.join(chr(random.randrange(256)) for _ in range(count)) +else: + def rand_bytes(count): + return bytes(random.randrange(256) for _ in range(count)) + +TEST_DUMPS_DIR = 'dumps' + + +class CallbackTester(unittest.TestCase): + """ + General callback tester to use with specific callback tests. + Child class should implement callback_setup() to fill _callback_class, and _fixture attributes. + """ + def setUp(self): + self._out = BytesIO() + self.callback_setup() + + def callback_setup(self): + self._callback_class = None + self._fixture = {'escape_db_file': 'non_ascii_values.rdb'} + + def escape_test_helper(self, escape_name): + if self._callback_class is None: + return # Handle unittest discovery attempt to test with this "abstract" class. + + escape = getattr(encodehelpers, escape_name) + callback = self._callback_class(out=self._out, string_escape=escape) + parser = RdbParser(callback) + parser.parse(os.path.join(os.path.dirname(__file__), TEST_DUMPS_DIR, self._fixture['escape_db_file'])) + result = self._out.getvalue() + # print('\n%s escape method %s' % (self._callback_class.__name__, escape_name)) + # print("\t\tself._fixture['escape_out_%s'] = %s" % (escape, repr(result))) + # try: + # print(result.decode('utf8')) + # except UnicodeDecodeError: + # print(result.decode('latin-1')) + self.assertEqual(result, + self._fixture['escape_out_' + escape], + msg='\n%s escape method %s' % (self._callback_class.__name__, escape_name) + ) + + def test_raw_escape(self): + """Test using STRING_ESCAPE_RAW with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_RAW') + + def test_print_escape(self): + """Test using STRING_ESCAPE_PRINT with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_PRINT') + + def test_utf8_escape(self): + """Test using STRING_ESCAPE_UTF8 with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_UTF8') + + def test_base64_escape(self): + """Test using STRING_ESCAPE_BASE64 with varied key encodings against expected output.""" + self.escape_test_helper('STRING_ESCAPE_BASE64') + + +class ProtocolTestCase(CallbackTester): + def callback_setup(self): + super(ProtocolTestCase, self).callback_setup() + self._callback_class = ProtocolCallback + self._fixture['escape_out_raw'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$10\r\n\x00! ~0\n\t\rAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$14\r\n\x00$ ~0\x7f\xff\n\xaa\t\x80\rAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$27\r\n\xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_print'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$22\r\n\\x00! ~0\\x0A\\x09\\x0DAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$38\r\n\\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$99\r\n\\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA\r\n' + self._fixture['escape_out_utf8'] = b'*2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n*3\r\n$3\r\nSET\r\n$9\r\nint_value\r\n$3\r\n123\r\n*3\r\n$3\r\nSET\r\n$5\r\nascii\r\n$10\r\n\x00! ~0\n\t\rAb\r\n*3\r\n$3\r\nSET\r\n$3\r\nbin\r\n$23\r\n\x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb\r\n*3\r\n$3\r\nSET\r\n$9\r\nprintable\r\n$7\r\n!+ Ab^~\r\n*3\r\n$3\r\nSET\r\n$3\r\n378\r\n$12\r\nint_key_name\r\n*3\r\n$3\r\nSET\r\n$4\r\nutf8\r\n$27\r\n\xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_base64'] = b'*2\r\n$8\r\nU0VMRUNU\r\n$4\r\nMA==\r\n*3\r\n$4\r\nU0VU\r\n$12\r\naW50X3ZhbHVl\r\n$4\r\nMTIz\r\n*3\r\n$4\r\nU0VU\r\n$8\r\nYXNjaWk=\r\n$16\r\nACEgfjAKCQ1BYg==\r\n*3\r\n$4\r\nU0VU\r\n$4\r\nYmlu\r\n$20\r\nACQgfjB//wqqCYANQWI=\r\n*3\r\n$4\r\nU0VU\r\n$12\r\ncHJpbnRhYmxl\r\n$12\r\nISsgQWJefg==\r\n*3\r\n$4\r\nU0VU\r\n$4\r\nMzc4\r\n$16\r\naW50X2tleV9uYW1l\r\n*3\r\n$4\r\nU0VU\r\n$8\r\ndXRmOA==\r\n$36\r\n15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq\r\n' + + +class JsonTestCase(CallbackTester): + def callback_setup(self): + super(JsonTestCase, self).callback_setup() + self._callback_class = JSONCallback + self._fixture['escape_out_raw'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\u0000! ~0\\n\\t\\rAb",\r\n"bin":"\\u0000$ ~0\\u007f\\u00ff\\n\\u00aa\\t\\u0080\\rAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\u00d7\\u0091\\u00d7\\u0093\\u00d7\\u0099\\u00d7\\u00a7\\u00d7\\u0094\\u00f0\\u0090\\u0080\\u008f123\\u00d7\\u00a2\\u00d7\\u0091\\u00d7\\u00a8\\u00d7\\u0099\\u00d7\\u00aa"}]' + self._fixture['escape_out_print'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\\\x00! ~0\\\\x0A\\\\x09\\\\x0DAb",\r\n"bin":"\\\\x00$ ~0\\\\x7F\\\\xFF\\\\x0A\\\\xAA\\\\x09\\\\x80\\\\x0DAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\\\xD7\\\\x91\\\\xD7\\\\x93\\\\xD7\\\\x99\\\\xD7\\\\xA7\\\\xD7\\\\x94\\\\xF0\\\\x90\\\\x80\\\\x8F123\\\\xD7\\\\xA2\\\\xD7\\\\x91\\\\xD7\\\\xA8\\\\xD7\\\\x99\\\\xD7\\\\xAA"}]' + self._fixture['escape_out_utf8'] = b'[{\r\n"int_value":"123",\r\n"ascii":"\\u0000! ~0\\n\\t\\rAb",\r\n"bin":"\\u0000$ ~0\\u007f\\\\xFF\\n\\\\xAA\\t\\\\x80\\rAb",\r\n"printable":"!+ Ab^~",\r\n"378":"int_key_name",\r\n"utf8":"\\u05d1\\u05d3\\u05d9\\u05e7\\u05d4\\ud800\\udc0f123\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea"}]' + self._fixture['escape_out_base64'] = b'[{\r\n"int_value":"MTIz",\r\n"ascii":"ACEgfjAKCQ1BYg==",\r\n"bin":"ACQgfjB//wqqCYANQWI=",\r\n"printable":"ISsgQWJefg==",\r\n"378":"aW50X2tleV9uYW1l",\r\n"utf8":"15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq"}]' + + +class DiffTestCase(CallbackTester): + def callback_setup(self): + super(DiffTestCase, self).callback_setup() + self._callback_class = DiffCallback + self._fixture['escape_out_raw'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \x00! ~0\n\t\rAb\r\ndb=0 bin -> \x00$ ~0\x7f\xff\n\xaa\t\x80\rAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_print'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \\x00! ~0\\x0A\\x09\\x0DAb\r\ndb=0 bin -> \\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA\r\n' + self._fixture['escape_out_utf8'] = b'db=0 int_value -> 123\r\ndb=0 ascii -> \x00! ~0\n\t\rAb\r\ndb=0 bin -> \x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb\r\ndb=0 printable -> !+ Ab^~\r\ndb=0 378 -> int_key_name\r\ndb=0 utf8 -> \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa\r\n' + self._fixture['escape_out_base64'] = b'db=0 int_value -> MTIz\r\ndb=0 ascii -> ACEgfjAKCQ1BYg==\r\ndb=0 bin -> ACQgfjB//wqqCYANQWI=\r\ndb=0 printable -> ISsgQWJefg==\r\ndb=0 378 -> aW50X2tleV9uYW1l\r\ndb=0 utf8 -> 15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq\r\n' + + +class KeysTestCase(CallbackTester): + def callback_setup(self): + super(KeysTestCase, self).callback_setup() + self._callback_class = KeysOnlyCallback + self._fixture['escape_out_raw'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_print'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_utf8'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + self._fixture['escape_out_base64'] = b'int_value\nascii\nbin\nprintable\n378\nutf8\n' + + +class KeyValsTestCase(CallbackTester): + def callback_setup(self): + super(KeyValsTestCase, self).callback_setup() + self._callback_class = KeyValsOnlyCallback + self._fixture['escape_out_raw'] = b'\r\nint_value 123,\r\nascii \x00! ~0\n\t\rAb,\r\nbin \x00$ ~0\x7f\xff\n\xaa\t\x80\rAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa' + self._fixture['escape_out_print'] = b'\r\nint_value 123,\r\nascii \\x00! ~0\\x0A\\x09\\x0DAb,\r\nbin \\x00$ ~0\\x7F\\xFF\\x0A\\xAA\\x09\\x80\\x0DAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \\xD7\\x91\\xD7\\x93\\xD7\\x99\\xD7\\xA7\\xD7\\x94\\xF0\\x90\\x80\\x8F123\\xD7\\xA2\\xD7\\x91\\xD7\\xA8\\xD7\\x99\\xD7\\xAA' + self._fixture['escape_out_utf8'] = b'\r\nint_value 123,\r\nascii \x00! ~0\n\t\rAb,\r\nbin \x00$ ~0\x7f\\xFF\n\\xAA\t\\x80\rAb,\r\nprintable !+ Ab^~,\r\n378 int_key_name,\r\nutf8 \xd7\x91\xd7\x93\xd7\x99\xd7\xa7\xd7\x94\xf0\x90\x80\x8f123\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa' + self._fixture['escape_out_base64'] = b'\r\nint_value MTIz,\r\nascii ACEgfjAKCQ1BYg==,\r\nbin ACQgfjB//wqqCYANQWI=,\r\nprintable ISsgQWJefg==,\r\n378 aW50X2tleV9uYW1l,\r\nutf8 15HXk9eZ16fXlPCQgI8xMjPXoteR16jXmdeq' + +if __name__ == '__main__': + unittest.main() diff --git a/tests/create_test_rdb.py b/tests/create_test_rdb.py index 1367573..88d967f 100644 --- a/tests/create_test_rdb.py +++ b/tests/create_test_rdb.py @@ -3,6 +3,7 @@ import string import shutil import os +from rdbtools.compat import range r = redis.StrictRedis() r2 = redis.StrictRedis(db=2) @@ -92,13 +93,13 @@ def zipmap_with_big_values(): def dictionary() : num_entries = 1000 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.hset("force_dictionary", random_string(50, x), random_string(50, x + num_entries)) def ziplist_that_compresses_easily() : for length in (6, 12, 18, 24, 30, 36) : - r.rpush("ziplist_compresses_easily", ("".join("a" for x in xrange(length)))) - + r.rpush("ziplist_compresses_easily", ("".join("a" for x in range(length)))) + def ziplist_that_doesnt_compress() : r.rpush("ziplist_doesnt_compress", "aj2410") r.rpush("ziplist_doesnt_compress", "cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344") @@ -131,7 +132,7 @@ def ziplist_with_integers() : def linkedlist() : num_entries = 1000 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.rpush("force_linkedlist", random_string(50, x)) def intset_16() : @@ -164,7 +165,7 @@ def sorted_set_as_ziplist() : def regular_sorted_set() : num_entries = 500 - for x in xrange(0, num_entries) : + for x in range(0, num_entries) : r.zadd("force_sorted_set", float(x) / 100, random_string(50, x)) def random_string(length, seed) : diff --git a/tests/dumps/non_ascii_values.rdb b/tests/dumps/non_ascii_values.rdb new file mode 100644 index 0000000..337564f Binary files /dev/null and b/tests/dumps/non_ascii_values.rdb differ diff --git a/tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb b/tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb new file mode 100644 index 0000000..ca7cbe1 Binary files /dev/null and b/tests/dumps/rdb_version_8_with_64b_length_and_scores.rdb differ diff --git a/tests/memprofiler_tests.py b/tests/memprofiler_tests.py index 5a083f1..0886d81 100644 --- a/tests/memprofiler_tests.py +++ b/tests/memprofiler_tests.py @@ -4,7 +4,7 @@ from rdbtools import MemoryCallback import os -class Stats(): +class Stats(object): def __init__(self): self.records = {} @@ -24,6 +24,5 @@ def setUp(self): def test_len_largest_element(self): stats = get_stats('ziplist_that_compresses_easily.rdb') - self.assertEqual(stats['ziplist_compresses_easily'].len_largest_element, 36, "Length of largest element does not match") - - \ No newline at end of file + + self.assertEqual(stats[b'ziplist_compresses_easily'].len_largest_element, 36, "Length of largest element does not match") diff --git a/tests/parser_tests.py b/tests/parser_tests.py index 768b63c..a85c364 100644 --- a/tests/parser_tests.py +++ b/tests/parser_tests.py @@ -2,6 +2,7 @@ import os import math from rdbtools import RdbCallback, RdbParser +from rdbtools.compat import range class RedisParserTestCase(unittest.TestCase): def setUp(self): @@ -20,12 +21,12 @@ def test_multiple_databases(self): r = load_rdb('multiple_databases.rdb') self.assert_(len(r.databases), 2) self.assert_(1 not in r.databases) - self.assertEquals(r.databases[0]["key_in_zeroth_database"], "zero") - self.assertEquals(r.databases[2]["key_in_second_database"], "second") - + self.assertEquals(r.databases[0][b"key_in_zeroth_database"], b"zero") + self.assertEquals(r.databases[2][b"key_in_second_database"], b"second") + def test_keys_with_expiry(self): r = load_rdb('keys_with_expiry.rdb') - expiry = r.expiry[0]['expires_ms_precision'] + expiry = r.expiry[0][b'expires_ms_precision'] self.assertEquals(expiry.year, 2022) self.assertEquals(expiry.month, 12) self.assertEquals(expiry.day, 25) @@ -36,33 +37,33 @@ def test_keys_with_expiry(self): def test_integer_keys(self): r = load_rdb('integer_keys.rdb') - self.assertEquals(r.databases[0][125], "Positive 8 bit integer") - self.assertEquals(r.databases[0][0xABAB], "Positive 16 bit integer") - self.assertEquals(r.databases[0][0x0AEDD325], "Positive 32 bit integer") - + self.assertEquals(r.databases[0][125], b"Positive 8 bit integer") + self.assertEquals(r.databases[0][0xABAB], b"Positive 16 bit integer") + self.assertEquals(r.databases[0][0x0AEDD325], b"Positive 32 bit integer") + def test_negative_integer_keys(self): r = load_rdb('integer_keys.rdb') - self.assertEquals(r.databases[0][-123], "Negative 8 bit integer") - self.assertEquals(r.databases[0][-0x7325], "Negative 16 bit integer") - self.assertEquals(r.databases[0][-0x0AEDD325], "Negative 32 bit integer") - + self.assertEquals(r.databases[0][-123], b"Negative 8 bit integer") + self.assertEquals(r.databases[0][-0x7325], b"Negative 16 bit integer") + self.assertEquals(r.databases[0][-0x0AEDD325], b"Negative 32 bit integer") + def test_string_key_with_compression(self): r = load_rdb('easily_compressible_string_key.rdb') - key = "".join('a' for x in range(0, 200)) - value = "Key that redis should compress easily" + key = b"".join(b'a' for x in range(0, 200)) + value = b"Key that redis should compress easily" self.assertEquals(r.databases[0][key], value) def test_zipmap_thats_compresses_easily(self): r = load_rdb('zipmap_that_compresses_easily.rdb') - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["a"], "aa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aa"], "aaaa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aaaaa"], "aaaaaaaaaaaaaa") - + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"a"], b"aa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aa"], b"aaaa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aaaaa"], b"aaaaaaaaaaaaaa") + def test_zipmap_that_doesnt_compress(self): r = load_rdb('zipmap_that_doesnt_compress.rdb') - self.assertEquals(r.databases[0]["zimap_doesnt_compress"]["MKD1G6"], 2) - self.assertEquals(r.databases[0]["zimap_doesnt_compress"]["YNNXK"], "F7TI") - + self.assertEquals(r.databases[0][b"zimap_doesnt_compress"][b"MKD1G6"], 2) + self.assertEquals(r.databases[0][b"zimap_doesnt_compress"][b"YNNXK"], b"F7TI") + def test_zipmap_with_big_values(self): ''' See issue https://github.com/sripathikrishnan/redis-rdb-tools/issues/2 Values with length around 253/254/255 bytes are treated specially in the parser @@ -74,40 +75,40 @@ def test_zipmap_with_big_values(self): ziplist with a length encoding of 5 bytes. ''' r = load_rdb('zipmap_with_big_values.rdb') - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["253bytes"]), 253) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["254bytes"]), 254) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["255bytes"]), 255) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["300bytes"]), 300) - self.assertEquals(len(r.databases[0]["zipmap_with_big_values"]["20kbytes"]), 20000) - + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"253bytes"]), 253) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"254bytes"]), 254) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"255bytes"]), 255) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"300bytes"]), 300) + self.assertEquals(len(r.databases[0][b"zipmap_with_big_values"][b"20kbytes"]), 20000) + def test_hash_as_ziplist(self): '''In redis dump version = 4, hashmaps are stored as ziplists''' r = load_rdb('hash_as_ziplist.rdb') - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["a"], "aa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aa"], "aaaa") - self.assertEquals(r.databases[0]["zipmap_compresses_easily"]["aaaaa"], "aaaaaaaaaaaaaa") - + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"a"], b"aa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aa"], b"aaaa") + self.assertEquals(r.databases[0][b"zipmap_compresses_easily"][b"aaaaa"], b"aaaaaaaaaaaaaa") + def test_dictionary(self): r = load_rdb('dictionary.rdb') - self.assertEquals(r.lengths[0]["force_dictionary"], 1000) - self.assertEquals(r.databases[0]["force_dictionary"]["ZMU5WEJDG7KU89AOG5LJT6K7HMNB3DEI43M6EYTJ83VRJ6XNXQ"], - "T63SOS8DQJF0Q0VJEZ0D1IQFCYTIPSBOUIAI9SB0OV57MQR1FI") - self.assertEquals(r.databases[0]["force_dictionary"]["UHS5ESW4HLK8XOGTM39IK1SJEUGVV9WOPK6JYA5QBZSJU84491"], - "6VULTCV52FXJ8MGVSFTZVAGK2JXZMGQ5F8OVJI0X6GEDDR27RZ") - + self.assertEquals(r.lengths[0][b"force_dictionary"], 1000) + self.assertEquals(r.databases[0][b"force_dictionary"][b"ZMU5WEJDG7KU89AOG5LJT6K7HMNB3DEI43M6EYTJ83VRJ6XNXQ"], + b"T63SOS8DQJF0Q0VJEZ0D1IQFCYTIPSBOUIAI9SB0OV57MQR1FI") + self.assertEquals(r.databases[0][b"force_dictionary"][b"UHS5ESW4HLK8XOGTM39IK1SJEUGVV9WOPK6JYA5QBZSJU84491"], + b"6VULTCV52FXJ8MGVSFTZVAGK2JXZMGQ5F8OVJI0X6GEDDR27RZ") + def test_ziplist_that_compresses_easily(self): r = load_rdb('ziplist_that_compresses_easily.rdb') - self.assertEquals(r.lengths[0]["ziplist_compresses_easily"], 6) + self.assertEquals(r.lengths[0][b"ziplist_compresses_easily"], 6) for idx, length in enumerate([6, 12, 18, 24, 30, 36]) : - self.assertEquals(("".join("a" for x in xrange(length))), r.databases[0]["ziplist_compresses_easily"][idx]) - + self.assertEquals((b"".join(b"a" for x in range(length))), r.databases[0][b"ziplist_compresses_easily"][idx]) + def test_ziplist_that_doesnt_compress(self): r = load_rdb('ziplist_that_doesnt_compress.rdb') - self.assertEquals(r.lengths[0]["ziplist_doesnt_compress"], 2) - self.assert_("aj2410" in r.databases[0]["ziplist_doesnt_compress"]) - self.assert_("cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344" - in r.databases[0]["ziplist_doesnt_compress"]) - + self.assertEquals(r.lengths[0][b"ziplist_doesnt_compress"], 2) + self.assert_(b"aj2410" in r.databases[0][b"ziplist_doesnt_compress"]) + self.assert_(b"cc953a17a8e096e76a44169ad3f9ac87c5f8248a403274416179aa9fbd852344" + in r.databases[0][b"ziplist_doesnt_compress"]) + def test_ziplist_with_integers(self): r = load_rdb('ziplist_with_integers.rdb') @@ -117,77 +118,91 @@ def test_ziplist_with_integers(self): expected_numbers += [-2, 13, 25, -61, 63, 16380, -16000, 65535, -65523, 4194304, 0x7fffffffffffffff] - self.assertEquals(r.lengths[0]["ziplist_with_integers"], len(expected_numbers)) + self.assertEquals(r.lengths[0][b"ziplist_with_integers"], len(expected_numbers)) for num in expected_numbers : - self.assert_(num in r.databases[0]["ziplist_with_integers"], "Cannot find %d" % num) + self.assert_(num in r.databases[0][b"ziplist_with_integers"], "Cannot find %d" % num) def test_linkedlist(self): r = load_rdb('linkedlist.rdb') - self.assertEquals(r.lengths[0]["force_linkedlist"], 1000) - self.assert_("JYY4GIFI0ETHKP4VAJF5333082J4R1UPNPLE329YT0EYPGHSJQ" in r.databases[0]["force_linkedlist"]) - self.assert_("TKBXHJOX9Q99ICF4V78XTCA2Y1UYW6ERL35JCIL1O0KSGXS58S" in r.databases[0]["force_linkedlist"]) + self.assertEquals(r.lengths[0][b"force_linkedlist"], 1000) + self.assert_(b"JYY4GIFI0ETHKP4VAJF5333082J4R1UPNPLE329YT0EYPGHSJQ" in r.databases[0][b"force_linkedlist"]) + self.assert_(b"TKBXHJOX9Q99ICF4V78XTCA2Y1UYW6ERL35JCIL1O0KSGXS58S" in r.databases[0][b"force_linkedlist"]) def test_intset_16(self): r = load_rdb('intset_16.rdb') - self.assertEquals(r.lengths[0]["intset_16"], 3) + self.assertEquals(r.lengths[0][b"intset_16"], 3) for num in (0x7ffe, 0x7ffd, 0x7ffc) : - self.assert_(num in r.databases[0]["intset_16"]) + self.assert_(num in r.databases[0][b"intset_16"]) def test_intset_32(self): r = load_rdb('intset_32.rdb') - self.assertEquals(r.lengths[0]["intset_32"], 3) + self.assertEquals(r.lengths[0][b"intset_32"], 3) for num in (0x7ffefffe, 0x7ffefffd, 0x7ffefffc) : - self.assert_(num in r.databases[0]["intset_32"]) + self.assert_(num in r.databases[0][b"intset_32"]) def test_intset_64(self): r = load_rdb('intset_64.rdb') - self.assertEquals(r.lengths[0]["intset_64"], 3) + self.assertEquals(r.lengths[0][b"intset_64"], 3) for num in (0x7ffefffefffefffe, 0x7ffefffefffefffd, 0x7ffefffefffefffc) : - self.assert_(num in r.databases[0]["intset_64"]) + self.assert_(num in r.databases[0][b"intset_64"]) def test_regular_set(self): r = load_rdb('regular_set.rdb') - self.assertEquals(r.lengths[0]["regular_set"], 6) - for member in ("alpha", "beta", "gamma", "delta", "phi", "kappa") : - self.assert_(member in r.databases[0]["regular_set"], msg=('%s missing' % member)) + self.assertEquals(r.lengths[0][b"regular_set"], 6) + for member in (b"alpha", b"beta", b"gamma", b"delta", b"phi", b"kappa") : + self.assert_(member in r.databases[0][b"regular_set"], msg=('%s missing' % member)) def test_sorted_set_as_ziplist(self): r = load_rdb('sorted_set_as_ziplist.rdb') - self.assertEquals(r.lengths[0]["sorted_set_as_ziplist"], 3) - zset = r.databases[0]["sorted_set_as_ziplist"] - self.assert_(floateq(zset['8b6ba6718a786daefa69438148361901'], 1)) - self.assert_(floateq(zset['cb7a24bb7528f934b841b34c3a73e0c7'], 2.37)) - self.assert_(floateq(zset['523af537946b79c4f8369ed39ba78605'], 3.423)) + self.assertEquals(r.lengths[0][b"sorted_set_as_ziplist"], 3) + zset = r.databases[0][b"sorted_set_as_ziplist"] + self.assert_(floateq(zset[b'8b6ba6718a786daefa69438148361901'], 1)) + self.assert_(floateq(zset[b'cb7a24bb7528f934b841b34c3a73e0c7'], 2.37)) + self.assert_(floateq(zset[b'523af537946b79c4f8369ed39ba78605'], 3.423)) def test_filtering_by_keys(self): r = load_rdb('parser_filters.rdb', filters={"keys":"k[0-9]"}) - self.assertEquals(r.databases[0]['k1'], "ssssssss") - self.assertEquals(r.databases[0]['k3'], "wwwwwwww") + self.assertEquals(r.databases[0][b'k1'], b"ssssssss") + self.assertEquals(r.databases[0][b'k3'], b"wwwwwwww") self.assertEquals(len(r.databases[0]), 2) def test_filtering_by_type(self): r = load_rdb('parser_filters.rdb', filters={"types":["sortedset"]}) - self.assert_('z1' in r.databases[0]) - self.assert_('z2' in r.databases[0]) - self.assert_('z3' in r.databases[0]) - self.assert_('z4' in r.databases[0]) + self.assert_(b'z1' in r.databases[0]) + self.assert_(b'z2' in r.databases[0]) + self.assert_(b'z3' in r.databases[0]) + self.assert_(b'z4' in r.databases[0]) self.assertEquals(len(r.databases[0]), 4) def test_filtering_by_database(self): r = load_rdb('multiple_databases.rdb', filters={"dbs":[2]}) - self.assert_('key_in_zeroth_database' not in r.databases[0]) - self.assert_('key_in_second_database' in r.databases[2]) + self.assert_(b'key_in_zeroth_database' not in r.databases[0]) + self.assert_(b'key_in_second_database' in r.databases[2]) self.assertEquals(len(r.databases[0]), 0) self.assertEquals(len(r.databases[2]), 1) def test_rdb_version_5_with_checksum(self): r = load_rdb('rdb_version_5_with_checksum.rdb') - self.assertEquals(r.databases[0]['abcd'], 'efgh') - self.assertEquals(r.databases[0]['foo'], 'bar') - self.assertEquals(r.databases[0]['bar'], 'baz') - self.assertEquals(r.databases[0]['abcdef'], 'abcdef') - self.assertEquals(r.databases[0]['longerstring'], 'thisisalongerstring.idontknowwhatitmeans') + self.assertEquals(r.databases[0][b'abcd'], b'efgh') + self.assertEquals(r.databases[0][b'foo'], b'bar') + self.assertEquals(r.databases[0][b'bar'], b'baz') + self.assertEquals(r.databases[0][b'abcdef'], b'abcdef') + self.assertEquals(r.databases[0][b'longerstring'], b'thisisalongerstring.idontknowwhatitmeans') + + def test_rdb_version_8_with_64b_length_and_scores(self): + r = load_rdb('rdb_version_8_with_64b_length_and_scores.rdb') + self.assertEquals(r.databases[0][b'foo'], b'bar') + zset = r.databases[0][b"bigset"] + self.assertEquals(len(zset), 1000) + self.assert_(floateq(zset[b'finalfield'], 2.718)) + + def test_multiple_databases_stream(self): + r = load_rdb_stream('multiple_databases.rdb') + self.assert_(len(r.databases), 2) + self.assert_(1 not in r.databases) + self.assertEquals(r.databases[0][b"key_in_zeroth_database"], b"zero") + self.assertEquals(r.databases[2][b"key_in_second_database"], b"second") def floateq(f1, f2) : return math.fabs(f1 - f2) < 0.00001 @@ -197,9 +212,16 @@ def load_rdb(file_name, filters=None) : parser = RdbParser(r, filters) parser.parse(os.path.join(os.path.dirname(__file__), 'dumps', file_name)) return r + +def load_rdb_stream(file_name, filters=None) : + r = MockRedis() + parser = RdbParser(r, filters) + parser.parse_fd(open(os.path.join(os.path.dirname(__file__), 'dumps', file_name), 'rb')) + return r class MockRedis(RdbCallback): - def __init__(self) : + def __init__(self): + super(MockRedis, self).__init__(string_escape=None) self.databases = {} self.lengths = {} self.expiry = {}