diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py
index d5a85e4..5f50482 100644
--- a/dictdatabase/byte_codes.py
+++ b/dictdatabase/byte_codes.py
@@ -1,3 +1,4 @@
+# See: https://www.charset.org/utf-8
 BACKSLASH = 92
 QUOTE = 34
 OPEN_SQUARE = 91
@@ -7,3 +8,4 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
+COMMA = 44
diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py
index 55de637..052c3cf 100644
--- a/dictdatabase/utils.py
+++ b/dictdatabase/utils.py
@@ -52,32 +52,47 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
 
     # See https://www.json.org/json-en.html for the JSON syntax
 
-    skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
+    in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)
 
-    for i in range(index, len(json_bytes)):
-        if skip_next:
-            skip_next = False
-            continue
+    while i < len_json_bytes:
         current = json_bytes[i]
+        # If backslash, skip the next character
         if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+            i += 1
+        # If quote, toggle in_str
+        elif current == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        if current == byte_codes.OPEN_SQUARE:
+            # Possible exit point where string ends and nesting is zero
+            if not in_str and list_depth == 0 and dict_depth == 0:
+                return i + 1
+        # If in string, skip
+        elif in_str:
+            pass
+
+        # Invariant: Not in_str, not escaped
+
+        # Handle opening brackets
+        elif current == byte_codes.OPEN_SQUARE:
             list_depth += 1
-        elif current == byte_codes.CLOSE_SQUARE:
-            list_depth -= 1
         elif current == byte_codes.OPEN_CURLY:
             dict_depth += 1
-        elif current == byte_codes.CLOSE_CURLY:
-            dict_depth -= 1
-        if list_depth == 0 and dict_depth == 0:
-            return i + 1
-
-    raise TypeError("Invalid JSON syntax")
+        # Handle closing brackets
+        elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
+            if current == byte_codes.CLOSE_SQUARE:
+                list_depth -= 1
+            if current == byte_codes.CLOSE_CURLY:
+                dict_depth -= 1
+            if list_depth == 0:
+                if dict_depth == 0:
+                    return i + 1
+                if dict_depth == -1:
+                    return i  # Case: {"a": {}}
+        elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
+            # Handle commas and newline as exit points
+            return i
+        i += 1
+
+    raise TypeError("Invalid JSON")
 
 
 def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
@@ -90,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
     - `json_bytes`: A bytes object containing valid JSON when decoded
     """
 
-    skip_next, in_str, nesting = False, False, 0
-    for i in range(start, end):
-        if skip_next:
-            skip_next = False
-            continue
-        current = json_bytes[i]
-        if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+    in_str, nesting, i = False, 0, start
+    while i < end:
+        byte_i = json_bytes[i]
+        if byte_i == byte_codes.BACKSLASH:
+            i += 1
+        elif byte_i == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        elif current == byte_codes.OPEN_CURLY:
+        elif in_str:
+            pass
+        elif byte_i == byte_codes.OPEN_CURLY:
             nesting += 1
-        elif current == byte_codes.CLOSE_CURLY:
+        elif byte_i == byte_codes.CLOSE_CURLY:
             nesting -= 1
+        i += 1
     return nesting
 
 
diff --git a/tests/test_read.py b/tests/test_read.py
index 9c6cda3..4219af1 100644
--- a/tests/test_read.py
+++ b/tests/test_read.py
@@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):
 
 def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
     cases = [
-        r'{"a": "\\", "b": 2}',
-        r'{"a": "\\\\", "b": 2}',
-        r'{"a": "\\\\\"", "b": 2}',
-        r'{"a": "\\\"\\", "b": 2}',
-        r'{"a": "\"\\\\", "b": 2}',
-        r'{"a": "\"", "b": 2}',
-        r'{"a": "\"\"", "b": 2}',
-        r'{"a": "\"\"\\", "b": 2}',
-        r'{"a": "\"\\\"", "b": 2}',
-        r'{"a": "\\\"\"", "b": 2}',
+        r'{"a": "\\", "b": 0}',
+        r'{"a": "\\\\", "b": 1234}',
+        r'{"a": "\\\\\"", "b": 1234}',
+        r'{"a": "\\\"\\", "b": 1234}',
+        r'{"a": "\"\\\\", "b": 1234}',
+        r'{"a": "\"", "b": 1234}',
+        r'{"a": "\"\"", "b": 1234}',
+        r'{"a": "\"\"\\", "b": 1234}',
+        r'{"a": "\"\\\"", "b": 1234}',
+        r'{"a": "\\\"\"", "b": 1234}',
     ]
 
     for case in cases:
         with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
             f.write(case)
-        dd = DDB.at("test_read_integrity", key="a").read()
-        assert dd == json.loads(case)["a"]
+        key_a = DDB.at("test_read_integrity", key="a").read()
+        key_b = DDB.at("test_read_integrity", key="b").read()
+        assert key_a == json.loads(case)["a"]
+        assert key_b == json.loads(case)["b"]
+
 
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..b8cfddf
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,105 @@
+import itertools
+import orjson
+from dictdatabase import utils, io_unsafe, byte_codes
+
+
+def test_seek_index_through_value_bytes(use_test_dir):
+    """Pin the end index returned for number and dict values, in spaced and compact JSON."""
+    v = b'{"a": 1, "b": {}}'
+    vc = b'{"a":1,"b":{}}'
+
+    assert utils.seek_index_through_value_bytes(v, 5) == 7
+    assert utils.seek_index_through_value_bytes(v, 6) == 7
+    assert utils.seek_index_through_value_bytes(vc, 5) == 6
+
+    assert utils.seek_index_through_value_bytes(v, 13) == 16
+    assert utils.seek_index_through_value_bytes(vc, 11) == 13
+
+    n = b'{"a": 1234, "b": {"c": 2}}'
+    assert utils.seek_index_through_value_bytes(n, 5) == 10
+    assert utils.seek_index_through_value_bytes(n, 6) == 10
+
+
+def load_with_orjson(json_bytes, key):
+    """Reference path: decode the whole document with orjson and return the value at `key`."""
+    return orjson.loads(json_bytes)[key]
+
+
+def load_with_seeker(json_bytes, key):
+    """Decode only the value after the first `"key":`, bounded by seek_index_through_value_bytes."""
+    key_bytes = f"\"{key}\":".encode()
+    a_val_start = json_bytes.find(key_bytes) + len(key_bytes)
+    # Skip a single space separating the colon from the value, if present
+    if json_bytes[a_val_start] == byte_codes.SPACE:
+        a_val_start += 1
+    a_val_end = utils.seek_index_through_value_bytes(json_bytes, a_val_start)
+    return orjson.loads(json_bytes[a_val_start:a_val_end])
+
+
+def test_seek_index_through_value_bytes_2(use_test_dir):
+    """Cross-check the seeker against orjson for every ordered pair of sample values and both dump modes."""
+
+    def orjson_dump_with_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
+
+    def orjson_dump_without_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)
+
+    orjson_dump_settings = [orjson_dump_with_indent, orjson_dump_without_indent]
+
+    values = [
+        # Lists
+        [],
+        [1, 2, 3],
+        ["xs", "value", "c"],
+        [1, "xs", 2, "value", 3, "c"],
+        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        [{}, {}, {}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        # Dicts
+        {},
+        {"xs": 1},
+        {"xs": 1, "value": 2},
+        {"xs": 1, "value": 2, "c": 3},
+        {"xs": []},
+        {"xs": [], "value": []},
+        {"xs": -3.3, "value": ""},
+        # Numbers
+        1,
+        1234,
+        1.3,
+        -1.3,
+        32.3,
+        0,
+        -0,
+        # Strings
+        "",
+        "a",
+        "hello",
+        "a\\b",
+        "\\",
+        "\\\\",
+        "\\\\\"",
+        "\\\"\\",
+        "\"\\\\",
+        "\"",
+        "\"\"",
+        "\"\"\\",
+        "\"\\\"",
+        "\\\"\"",
+    ]
+
+    for dumper, v1, v2 in itertools.product(orjson_dump_settings, values, values):
+        obj = {"a": v1, "b": v2}
+        json_bytes = dumper(obj)
+
+        a_from_orjson = load_with_orjson(json_bytes, "a")
+        a_from_seeker = load_with_seeker(json_bytes, "a")
+
+        b_from_orjson = load_with_orjson(json_bytes, "b")
+        b_from_seeker = load_with_seeker(json_bytes, "b")
+
+        assert a_from_orjson == a_from_seeker
+        assert b_from_orjson == b_from_seeker