Skip to content

Commit

Permalink
Merge pull request #42 from UmbrellaMalware/main
Browse files Browse the repository at this point in the history
Add glom-like searching for keys
  • Loading branch information
UmbrellaMalware authored Nov 23, 2022
2 parents 93d0df0 + fc1d8b7 commit a25afcf
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 24 deletions.
8 changes: 8 additions & 0 deletions dictdatabase/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import dataclasses


@dataclasses.dataclass(frozen=True)
class SearchResult:
    """Immutable byte span located by a key/value search in a JSON byte string."""
    start_byte: int  # start offset of the match, -1 when not found
    end_byte: int  # end offset of the match (exclusive, as used in slicing), -1 when not found
    found: bool  # whether the searched key was present
28 changes: 28 additions & 0 deletions dictdatabase/index_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import hashlib

from dictdatabase import utils


class IndexManager:
    @staticmethod
    def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> tuple:
        """
        Build the index entry for *key* whose value spans ``all_file_bytes[start:end]``.

        Args:
            all_file_bytes (bytes): The entire JSON file as a byte string.
            key (str): The key whose value is being indexed.
            start (int): Byte offset where the value starts in the file.
            end (int): Byte offset where the value ends in the file.

        Returns:
            Tuple of ``(key, start, end, indent_level, indent_with, value_hash,
            old_value_end)``. ``end`` appears twice on purpose: the last element
            records the previous end of the value, which here equals ``end``.
        """
        # Only the key's start position is needed (to detect its indentation);
        # the end position of the key token itself is not used here.
        key_start, _ = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
        indent_level, indent_with = utils.detect_indentation_in_json_bytes(
            all_file_bytes, key_start
        )
        # Hash the value bytes so later reads can verify the index entry is current.
        value_hash = hashlib.sha256(all_file_bytes[start:end]).hexdigest()
        return key, start, end, indent_level, indent_with, value_hash, end
5 changes: 4 additions & 1 deletion dictdatabase/indexing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import orjson
import os

import orjson

from . import config


# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in an empty read error; that's why the try-except exists

Expand Down
40 changes: 20 additions & 20 deletions dictdatabase/io_unsafe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from __future__ import annotations
from typing import Tuple

import hashlib
import json
from dataclasses import dataclass
from typing import Tuple

import orjson
import json
import hashlib
from . import config, utils, byte_codes, indexing, io_bytes

from . import byte_codes
from . import config
from . import indexing
from . import io_bytes
from . import searching
from . import utils
from .index_manager import IndexManager


@dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9
Expand Down Expand Up @@ -79,21 +88,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:

# Not found in index file, search for key in the entire file
all_file_bytes = io_bytes.read(db_name)
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)

if key_end == -1:
start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
if not found:
return None

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
value_bytes = all_file_bytes[start:end]
value_hash = hashlib.sha256(value_bytes).hexdigest()

# Write key info to index file
indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end))
return orjson.loads(value_bytes)


Expand Down Expand Up @@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
return partial_handle

# Not found in index file, search for key in the entire file
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
position = searching.search_key_position_in_db(all_file_bytes, key)

if key_end == -1:
if not position.found:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte)

partial_value = orjson.loads(all_file_bytes[start:end])
prefix_bytes = all_file_bytes[:start] if config.use_compression else None
Expand Down
74 changes: 74 additions & 0 deletions dictdatabase/searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Tuple

import orjson

from dictdatabase import byte_codes
from dictdatabase import utils
from dictdatabase.dataclasses import SearchResult


def find_key_position_in_bytes(file: bytes, key: str) -> SearchResult:
    """
    Locate the byte span of the VALUE belonging to *key* in a JSON byte string.

    Args:
        file (bytes): The JSON document as bytes.
        key (str): The key whose value should be located.

    Returns:
        A SearchResult holding the value's start and end byte offsets;
        ``found`` is False (and both offsets are -1) when the key is absent.
    """
    # Only the end of the key token matters below; its start is not needed.
    _, key_end = utils.find_outermost_key_in_json_bytes(file, key)
    if key_end == -1:
        return SearchResult(start_byte=-1, end_byte=-1, found=False)
    # Skip the single space that may follow the colon after the key.
    start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0)
    end = utils.seek_index_through_value_bytes(file, start)
    return SearchResult(start_byte=start, end_byte=end, found=True)


def search_key_position_in_db(
    file: bytes, key: str, glom_searching=True
) -> SearchResult:
    """
    Locate the byte span of the KEY token itself (not its value) in a JSON
    byte string, descending through a dotted ("glom") path like
    ``"users.Ben.job"`` when *glom_searching* is enabled.

    Args:
        file (bytes): The JSON document as bytes.
        key (str): The key, or dot-separated path of keys, to find.
        glom_searching: When True, dots in *key* separate path segments.

    Returns:
        A SearchResult with the absolute start/end offsets of the innermost
        key token; ``found`` is False (offsets -1) if any segment is missing.

    Note:
        The search window is narrowed to each segment's VALUE before the next
        segment is looked up, and offsets into the narrowed window are kept
        relative to it. (The previous implementation re-sliced the already
        narrowed buffer with absolute offsets, which broke paths of depth >= 3.)
    """
    offset = 0  # absolute offset of the current search window within the file
    key_start_abs, key_end_abs = -1, -1
    for segment in key.split(".") if glom_searching else [key]:
        key_start, key_end = utils.find_outermost_key_in_json_bytes(file, segment)
        if key_end == -1:
            return SearchResult(start_byte=-1, end_byte=-1, found=False)
        # Record this segment's key position in absolute file coordinates.
        key_start_abs = offset + key_start
        key_end_abs = offset + key_end
        # Narrow the window to this segment's value before descending further.
        position = find_key_position_in_bytes(file, segment)
        offset += position.start_byte
        file = file[position.start_byte:position.end_byte]
    return SearchResult(start_byte=key_start_abs, end_byte=key_end_abs, found=True)


def search_value_position_in_db(
    all_file_bytes: bytes, key: str, glom_searching=True
) -> Tuple[int, int, bool]:
    """
    Locate the byte span of the VALUE for *key*, following a dotted
    ("glom") path such as ``"users.Ben.job"`` when *glom_searching* is on.

    Args:
        all_file_bytes (bytes): The JSON file content.
        key (str): The key, or dot-separated path of keys, to search for.
        glom_searching: When True, dots in *key* separate path segments.

    Returns:
        ``(start, end, found)`` — absolute byte offsets of the value plus a
        success flag; ``(-1, -1, False)`` when any path segment is missing.
    """
    window_start, window_end = 0, len(all_file_bytes)
    segments = key.split(".") if glom_searching else [key]
    for segment in segments:
        # Search only inside the current window (the parent segment's value).
        window = all_file_bytes[window_start:window_end]
        result = find_key_position_in_bytes(window, segment)
        if not result.found:
            return -1, -1, False
        # Translate the window-relative span back to absolute offsets.
        window_end = window_start + result.end_byte
        window_start += result.start_byte
    return window_start, window_end, True
6 changes: 3 additions & 3 deletions tests/benchmark/run_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ class Scenario:
ops: int = 10

def print(self):
    """Print a one-line summary of this scenario's configuration.

    Markers: one '*' per reader, one '#' per writer, plus flags for
    compression and big-file mode.
    """
    # The earlier emoji-based summary lines were dead code (immediately
    # shadowed by the ASCII version below) and have been removed.
    res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)"
    res += ", [] compression" if self.use_compression else ""
    res += ", {} big file" if self.big_file else ""
    print(res)


Expand Down
31 changes: 31 additions & 0 deletions tests/test_glom_like_searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import dictdatabase as DDB

# Fixture: a nested structure with "Ben" present at two different depths,
# so tests can distinguish glom (dotted-path) lookups from plain key lookups.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}


def test_glom_searching():
    """A dotted key drills into nested dicts segment by segment."""
    DDB.at("users").create(data, force_overwrite=True)
    job = DDB.at("users", key="users.Ben.job").read()
    assert job == "Software Engineer"


def test_without_glom_searching():
    """A plain key matches the outermost occurrence, without path traversal."""
    DDB.at("users").create(data, force_overwrite=True)
    expected = {"job": {"age": 30, "job": "Software Engineer"}}
    assert DDB.at("users", key="Ben").read() == expected


def test_glom_searching_if_key_not_exists():
    """A missing intermediate path segment makes the read return None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Job.Ben").read()
    assert result is None


def test_glom_searching_if_subkey_not_exists():
    """A missing final path segment makes the read return None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read()
    assert result is None
27 changes: 27 additions & 0 deletions tests/test_glom_writing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest

import dictdatabase as DDB

# Fixture: a nested structure with "Ben" present at two different depths,
# so the session tests exercise dotted-path (glom) addressing on writes.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}


def test_glom_writing():
    """A session opened on a dotted key writes back into the nested dict."""
    DDB.at("users").create(data, force_overwrite=True)
    with DDB.at("users", key="users.Ben").session() as (session, ben):
        ben["status"] = "cancelled"
        session.write()
    status = DDB.at("users", key="users.Ben.status").read()
    assert status == "cancelled"


def test_glom_writing_sub_key_not_exists():
    """Opening a session on a missing path segment raises KeyError."""
    DDB.at("users").create(data, force_overwrite=True)
    with pytest.raises(KeyError):
        with DDB.at("users", key="users.SUBKEY").session() as (session, item):
            item["status"] = "cancelled"
            session.write()

0 comments on commit a25afcf

Please sign in to comment.