Fix partial number read (#41)
mkrd authored Nov 20, 2022
1 parent bfcb551 commit 6074dd4
Showing 4 changed files with 176 additions and 45 deletions.
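
A note on what "partial number read" means here, sketched from the new tests added in this commit: the old seek_index_through_value_bytes only returned once the bracket nesting dropped back to zero, which for a bare number value (which never changes the nesting) happened right after its first byte, truncating the value. With commas and newlines added as exit points, the returned end index now covers the whole number:

from dictdatabase import utils

n = b'{"a": 1234, "b": {"c": 2}}'
# The value of "a" starts at byte offset 6; the seeker stops at offset 10,
# the comma right after 1234, so n[6:10] is the complete number b'1234'
assert utils.seek_index_through_value_bytes(n, 6) == 10
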
2 changes: 2 additions & 0 deletions dictdatabase/byte_codes.py
@@ -1,3 +1,4 @@
+# See: https://www.charset.org/utf-8
 BACKSLASH = 92
 QUOTE = 34
 OPEN_SQUARE = 91
@@ -7,3 +8,4 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
+COMMA = 44
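
Side note, not part of the diff: the constants in byte_codes.py are plain ASCII/UTF-8 code points, so the new COMMA value can be sanity-checked directly in Python:

from dictdatabase import byte_codes

# "," encodes to byte 44 and "\n" to byte 10 in ASCII/UTF-8
assert byte_codes.COMMA == ord(",") == b","[0]
assert byte_codes.NEWLINE == ord("\n") == b"\n"[0]
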
78 changes: 45 additions & 33 deletions dictdatabase/utils.py
@@ -52,32 +52,47 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:

     # See https://www.json.org/json-en.html for the JSON syntax

-    skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
+    in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)

-    for i in range(index, len(json_bytes)):
-        if skip_next:
-            skip_next = False
-            continue
+    while i < len_json_bytes:
         current = json_bytes[i]
+        # If backslash, skip the next character
         if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+            i += 1
+        # If quote, toggle in_str
+        elif current == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        if current == byte_codes.OPEN_SQUARE:
+            # Possible exit point where string ends and nesting is zero
+            if not in_str and list_depth == 0 and dict_depth == 0:
+                return i + 1
+        # If in string, skip
+        elif in_str:
+            pass
+
+        # Invariant: Not in_str, not escaped
+
+        # Handle opening brackets
+        elif current == byte_codes.OPEN_SQUARE:
             list_depth += 1
-        elif current == byte_codes.CLOSE_SQUARE:
-            list_depth -= 1
         elif current == byte_codes.OPEN_CURLY:
             dict_depth += 1
-        elif current == byte_codes.CLOSE_CURLY:
-            dict_depth -= 1
-        if list_depth == 0 and dict_depth == 0:
-            return i + 1
-
-    raise TypeError("Invalid JSON syntax")
+        # Handle closing brackets
+        elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
+            if current == byte_codes.CLOSE_SQUARE:
+                list_depth -= 1
+            if current == byte_codes.CLOSE_CURLY:
+                dict_depth -= 1
+            if list_depth == 0:
+                if dict_depth == 0:
+                    return i + 1
+                if dict_depth == -1:
+                    return i # Case: {"a": {}}
+        elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
+            # Handle commas and newline as exit points
+            return i
+        i += 1
+
+    raise TypeError("Invalid JSON")

@@ -90,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
     - `json_bytes`: A bytes object containing valid JSON when decoded
     """

-    skip_next, in_str, nesting = False, False, 0
-    for i in range(start, end):
-        if skip_next:
-            skip_next = False
-            continue
-        current = json_bytes[i]
-        if current == byte_codes.BACKSLASH:
-            skip_next = True
-            continue
-        if current == byte_codes.QUOTE:
+    in_str, nesting, i = False, 0, start
+    while i < end:
+        byte_i = json_bytes[i]
+        if byte_i == byte_codes.BACKSLASH:
+            i += 1
+        elif byte_i == byte_codes.QUOTE:
             in_str = not in_str
-        if in_str or current == byte_codes.SPACE:
-            continue
-        elif current == byte_codes.OPEN_CURLY:
+        elif in_str:
+            pass
+        elif byte_i == byte_codes.OPEN_CURLY:
             nesting += 1
-        elif current == byte_codes.CLOSE_CURLY:
+        elif byte_i == byte_codes.CLOSE_CURLY:
             nesting -= 1
+        i += 1
     return nesting
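
For reference, a small usage sketch of the rewritten count_nesting_in_bytes (the byte strings below are illustrative values, not taken from the commit's tests): it returns the number of curly braces opened minus closed in the given range, ignoring braces inside strings and bytes that follow a backslash.

from dictdatabase import utils

# Balanced object: three "{" and three "}" outside of strings, net nesting 0
assert utils.count_nesting_in_bytes(b'{"a": {"b": {}}}', 0, 16) == 0
# Only the first 7 bytes b'{"a": {' are scanned: two braces opened, none closed
assert utils.count_nesting_in_bytes(b'{"a": {"b": {}}}', 0, 7) == 2
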


27 changes: 15 additions & 12 deletions tests/test_read.py
@@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):

 def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
     cases = [
-        r'{"a": "\\", "b": 2}',
-        r'{"a": "\\\\", "b": 2}',
-        r'{"a": "\\\\\"", "b": 2}',
-        r'{"a": "\\\"\\", "b": 2}',
-        r'{"a": "\"\\\\", "b": 2}',
-        r'{"a": "\"", "b": 2}',
-        r'{"a": "\"\"", "b": 2}',
-        r'{"a": "\"\"\\", "b": 2}',
-        r'{"a": "\"\\\"", "b": 2}',
-        r'{"a": "\\\"\"", "b": 2}',
+        r'{"a": "\\", "b": 0}',
+        r'{"a": "\\\\", "b": 1234}',
+        r'{"a": "\\\\\"", "b": 1234}',
+        r'{"a": "\\\"\\", "b": 1234}',
+        r'{"a": "\"\\\\", "b": 1234}',
+        r'{"a": "\"", "b": 1234}',
+        r'{"a": "\"\"", "b": 1234}',
+        r'{"a": "\"\"\\", "b": 1234}',
+        r'{"a": "\"\\\"", "b": 1234}',
+        r'{"a": "\\\"\"", "b": 1234}',
     ]

     for case in cases:
         with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
             f.write(case)
-        dd = DDB.at("test_read_integrity", key="a").read()
-        assert dd == json.loads(case)["a"]
+        key_a = DDB.at("test_read_integrity", key="a").read()
+        key_b = DDB.at("test_read_integrity", key="b").read()
+        assert key_a == json.loads(case)["a"]
+        assert key_b == json.loads(case)["b"]




114 changes: 114 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,114 @@
+import itertools
+import orjson
+from dictdatabase import utils, io_unsafe, byte_codes
+
+
+def test_seek_index_through_value_bytes(use_test_dir):
+    v = b'{"a": 1, "b": {}}'
+    vc = b'{"a":1,"b":{}}'
+
+    assert utils.seek_index_through_value_bytes(v, 5) == 7
+    assert utils.seek_index_through_value_bytes(v, 6) == 7
+    assert utils.seek_index_through_value_bytes(vc, 5) == 6
+
+    assert utils.seek_index_through_value_bytes(v, 13) == 16
+    assert utils.seek_index_through_value_bytes(vc, 11) == 13
+
+    n = b'{"a": 1234, "b": {"c": 2}}'
+    assert utils.seek_index_through_value_bytes(n, 5) == 10
+    assert utils.seek_index_through_value_bytes(n, 6) == 10
+
+
+def load_with_orjson(bytes, key):
+    # print("load with orjson", bytes)
+    return orjson.loads(bytes)[key]
+
+
+def load_with_seeker(bytes, key):
+    key_bytes = f"\"{key}\":".encode()
+    a_val_start = bytes.find(key_bytes) + len(key_bytes)
+    if bytes[a_val_start] == byte_codes.SPACE:
+        a_val_start += 1
+    a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
+    return orjson.loads(bytes[a_val_start:a_val_end])
+
+
+def test_seek_index_through_value_bytes_2(use_test_dir):
+
+    def orjson_dump_with_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
+
+    def orjson_dump_without_indent(data):
+        return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)
+
+    orjson_dump_settings = [orjson_dump_with_indent, orjson_dump_without_indent]
+
+    values = [
+        # Lists
+        [],
+        [1, 2, 3],
+        ["xs", "value", "c"],
+        [1, "xs", 2, "value", 3, "c"],
+        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        [{}, {}, {}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
+        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        # Dicts
+        {},
+        {"xs": 1},
+        {"xs": 1, "value": 2},
+        {"xs": 1, "value": 2, "c": 3},
+        {"xs": []},
+        {"xs": [], "value": []},
+        {"xs": -3.3, "value": ""},
+        # Numbers
+        1,
+        1234,
+        1.3,
+        -1.3,
+        32.3,
+        0,
+        -0,
+        # Strings
+        "",
+        "a",
+        "hello",
+        "a\\b",
+        "\\",
+        "\\\\",
+        "\\\\\"",
+        "\\\"\\",
+        "\"\\\\",
+        "\"",
+        "\"\"",
+        "\"\"\\",
+        "\"\\\"",
+        "\\\"\"",
+    ]
+
+    for dumper, v1, v2 in itertools.product(orjson_dump_settings, values, values):
+
+        obj = {"a": v1, "b": v2}
+
+        json_bytes = dumper(obj)
+
+        a_from_orjson = load_with_orjson(json_bytes, "a")
+        a_from_seeker = load_with_seeker(json_bytes, "a")
+
+        b_from_orjson = load_with_orjson(json_bytes, "b")
+        b_from_seeker = load_with_seeker(json_bytes, "b")
+
+        # print("obj", obj)
+        # print("a_from_orjson", a_from_orjson)
+        # print("a_from_seeker", a_from_seeker)
+        assert a_from_orjson == a_from_seeker
+        # print("b_from_orjson", b_from_orjson)
+        # print("b_from_seeker", b_from_seeker)
+        assert b_from_orjson == b_from_seeker
