Fix partial number read (#41)
mkrd authored Nov 20, 2022
1 parent bfcb551 commit 6074dd4
Showing 4 changed files with 176 additions and 45 deletions.
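
A note on what "partial number read" means here, sketched from the new tests added in this commit: the old seek_index_through_value_bytes only returned once the bracket nesting dropped back to zero, which for a bare number value (which never changes the nesting) happened right after its first byte, truncating the value. With commas and newlines added as exit points, the returned end index now covers the whole number:

from dictdatabase import utils

n = b'{"a": 1234, "b": {"c": 2}}'
# The value of "a" starts at byte offset 6; the seeker stops at offset 10,
# the comma right after 1234, so n[6:10] is the complete number b'1234'
assert utils.seek_index_through_value_bytes(n, 6) == 10
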
2 changes: 2 additions & 0 deletions dictdatabase/byte_codes.py
@@ -1,3 +1,4 @@
+# See: https://www.charset.org/utf-8
 BACKSLASH = 92
 QUOTE = 34
 OPEN_SQUARE = 91
@@ -7,3 +8,4 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
+COMMA = 44
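
Side note, not part of the diff: the constants in byte_codes.py are plain ASCII/UTF-8 code points, so the new COMMA value can be sanity-checked directly in Python:

from dictdatabase import byte_codes

# "," encodes to byte 44 and "\n" to byte 10 in ASCII/UTF-8
assert byte_codes.COMMA == ord(",") == b","[0]
assert byte_codes.NEWLINE == ord("\n") == b"\n"[0]
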
78 changes: 45 additions & 33 deletions dictdatabase/utils.py
@@ -52,32 +52,47 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:

     # See https://www.json.org/json-en.html for the JSON syntax

-    skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
+    in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)

-    for i in range(index, len(json_bytes)):
-        if skip_next:
-            skip_next = False
-            continue
+    while i < len_json_bytes:
         current = json_bytes[i]
+        # If backslash, skip the next character
         if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+            i += 1
+        # If quote, toggle in_str
+        elif current == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        if current == byte_codes.OPEN_SQUARE:
+            # Possible exit point where string ends and nesting is zero
+            if not in_str and list_depth == 0 and dict_depth == 0:
+                return i + 1
+        # If in string, skip
+        elif in_str:
+            pass
+
+        # Invariant: Not in_str, not escaped
+
+        # Handle opening brackets
+        elif current == byte_codes.OPEN_SQUARE:
             list_depth += 1
-        elif current == byte_codes.CLOSE_SQUARE:
-            list_depth -= 1
         elif current == byte_codes.OPEN_CURLY:
             dict_depth += 1
-        elif current == byte_codes.CLOSE_CURLY:
-            dict_depth -= 1
-        if list_depth == 0 and dict_depth == 0:
-            return i + 1
-
-    raise TypeError("Invalid JSON syntax")
+        # Handle closing brackets
+        elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
+            if current == byte_codes.CLOSE_SQUARE:
+                list_depth -= 1
+            if current == byte_codes.CLOSE_CURLY:
+                dict_depth -= 1
+            if list_depth == 0:
+                if dict_depth == 0:
+                    return i + 1
+                if dict_depth == -1:
+                    return i # Case: {"a": {}}
+        elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
+            # Handle commas and newline as exit points
+            return i
+        i += 1
+
+    raise TypeError("Invalid JSON")

@@ -90,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
     - `json_bytes`: A bytes object containing valid JSON when decoded
     """

-    skip_next, in_str, nesting = False, False, 0
-    for i in range(start, end):
-        if skip_next:
-            skip_next = False
-            continue
-        current = json_bytes[i]
-        if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+    in_str, nesting, i = False, 0, start
+    while i < end:
+        byte_i = json_bytes[i]
+        if byte_i == byte_codes.BACKSLASH:
+            i += 1
+        elif byte_i == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        elif current == byte_codes.OPEN_CURLY:
+        elif in_str:
+            pass
+        elif byte_i == byte_codes.OPEN_CURLY:
             nesting += 1
-        elif current == byte_codes.CLOSE_CURLY:
+        elif byte_i == byte_codes.CLOSE_CURLY:
             nesting -= 1
+        i += 1
     return nesting
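
For reference, a small usage sketch of the rewritten count_nesting_in_bytes (the byte strings below are illustrative values, not taken from the commit's tests): it returns the number of curly braces opened minus closed in the given range, ignoring braces inside strings and bytes that follow a backslash.

from dictdatabase import utils

# Balanced object: three "{" and three "}" outside of strings, net nesting 0
assert utils.count_nesting_in_bytes(b'{"a": {"b": {}}}', 0, 16) == 0
# Only the first 7 bytes b'{"a": {' are scanned: two braces opened, none closed
assert utils.count_nesting_in_bytes(b'{"a": {"b": {}}}', 0, 7) == 2
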


27 changes: 15 additions & 12 deletions tests/test_read.py
@@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):

 def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
     cases = [
-        r'{"a": "\\", "b": 2}',
-        r'{"a": "\\\\", "b": 2}',
-        r'{"a": "\\\\\"", "b": 2}',
-        r'{"a": "\\\"\\", "b": 2}',
-        r'{"a": "\"\\\\", "b": 2}',
-        r'{"a": "\"", "b": 2}',
-        r'{"a": "\"\"", "b": 2}',
-        r'{"a": "\"\"\\", "b": 2}',
-        r'{"a": "\"\\\"", "b": 2}',
-        r'{"a": "\\\"\"", "b": 2}',
+        r'{"a": "\\", "b": 0}',
+        r'{"a": "\\\\", "b": 1234}',
+        r'{"a": "\\\\\"", "b": 1234}',
+        r'{"a": "\\\"\\", "b": 1234}',
+        r'{"a": "\"\\\\", "b": 1234}',
+        r'{"a": "\"", "b": 1234}',
+        r'{"a": "\"\"", "b": 1234}',
+        r'{"a": "\"\"\\", "b": 1234}',
+        r'{"a": "\"\\\"", "b": 1234}',
+        r'{"a": "\\\"\"", "b": 1234}',
     ]

     for case in cases:
         with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
             f.write(case)
-        dd = DDB.at("test_read_integrity", key="a").read()
-        assert dd == json.loads(case)["a"]
+        key_a = DDB.at("test_read_integrity", key="a").read()
+        key_b = DDB.at("test_read_integrity", key="b").read()
+        assert key_a == json.loads(case)["a"]
+        assert key_b == json.loads(case)["b"]




114 changes: 114 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,114 @@
+import itertools
+import orjson
+from dictdatabase import utils, io_unsafe, byte_codes
+
+
+def test_seek_index_through_value_bytes(use_test_dir):
+    v = b'{"a": 1, "b": {}}'
+    vc = b'{"a":1,"b":{}}'
+
+    assert utils.seek_index_through_value_bytes(v, 5) == 7
+    assert utils.seek_index_through_value_bytes(v, 6) == 7
+    assert utils.seek_index_through_value_bytes(vc, 5) == 6
+
+    assert utils.seek_index_through_value_bytes(v, 13) == 16
+    assert utils.seek_index_through_value_bytes(vc, 11) == 13
+
+    n = b'{"a": 1234, "b": {"c": 2}}'
+    assert utils.seek_index_through_value_bytes(n, 5) == 10
+    assert utils.seek_index_through_value_bytes(n, 6) == 10
+
+
+def load_with_orjson(bytes, key):
+    # print("load with orjson", bytes)
+    return orjson.loads(bytes)[key]
+
+
+def load_with_seeker(bytes, key):
+    key_bytes = f"\"{key}\":".encode()
+    a_val_start = bytes.find(key_bytes) + len(key_bytes)
+    if bytes[a_val_start] == byte_codes.SPACE:
+        a_val_start += 1
+    a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
+    return orjson.loads(bytes[a_val_start:a_val_end])
+
+
+def test_seek_index_through_value_bytes_2(use_test_dir):
+
+    def orjson_dump_with_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
+
+    def orjson_dump_without_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)
+
+    orjson_dump_settings = [orjson_dump_with_indent, orjson_dump_without_indent]
+
+    values = [
+        # Lists
+        [],
+        [1, 2, 3],
+        ["xs", "value", "c"],
+        [1, "xs", 2, "value", 3, "c"],
+        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        [{}, {}, {}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        # Dicts
+        {},
+        {"xs": 1},
+        {"xs": 1, "value": 2},
+        {"xs": 1, "value": 2, "c": 3},
+        {"xs": []},
+        {"xs": [], "value": []},
+        {"xs": -3.3, "value": ""},
+        # Numbers
+        1,
+        1234,
+        1.3,
+        -1.3,
+        32.3,
+        0,
+        -0,
+        # Strings
+        "",
+        "a",
+        "hello",
+        "a\\b",
+        "\\",
+        "\\\\",
+        "\\\\\"",
+        "\\\"\\",
+        "\"\\\\",
+        "\"",
+        "\"\"",
+        "\"\"\\",
+        "\"\\\"",
+        "\\\"\"",
+    ]
+
+    for dumper, v1, v2 in itertools.product(orjson_dump_settings, values, values):
+
+        obj = {"a": v1, "b": v2}
+
+        json_bytes = dumper(obj)
+
+        a_from_orjson = load_with_orjson(json_bytes, "a")
+        a_from_seeker = load_with_seeker(json_bytes, "a")
+
+        b_from_orjson = load_with_orjson(json_bytes, "b")
+        b_from_seeker = load_with_seeker(json_bytes, "b")
+
+        # print("obj", obj)
+        # print("a_from_orjson", a_from_orjson)
+        # print("a_from_seeker", a_from_seeker)
+        assert a_from_orjson == a_from_seeker
+        # print("b_from_orjson", b_from_orjson)
+        # print("b_from_seeker", b_from_seeker)
+        assert b_from_orjson == b_from_seeker
