diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py
new file mode 100644
index 0000000..2c54e19
--- /dev/null
+++ b/dictdatabase/dataclasses.py
@@ -0,0 +1,8 @@
+import dataclasses
+
+
+@dataclasses.dataclass(frozen=True)
+class SearchResult:
+	start_byte: int
+	end_byte: int
+	found: bool
diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py
new file mode 100644
index 0000000..a7dd8d3
--- /dev/null
+++ b/dictdatabase/index_manager.py
@@ -0,0 +1,28 @@
+import hashlib
+
+from dictdatabase import utils
+
+
+class IndexManager:
+	@staticmethod
+	def create_index(all_file_bytes: bytes, key: str, start, end):
+		"""
+		Build an index entry for a key whose value occupies the byte range
+		[start, end) in a JSON file.
+
+		Args:
+			all_file_bytes (bytes): The entire file as a byte string.
+			key (str): The key of the value we're indexing.
+			start: The start of the value in the file.
+			end: The end of the value in the file.
+
+		Returns:
+			A tuple (key, start, end, indent_level, indent_with, value_hash, end), matching the indexer.write call in io_unsafe.py.
+		"""
+		key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
+		indent_level, indent_with = utils.detect_indentation_in_json_bytes(
+			all_file_bytes, key_start
+		)
+		value_bytes = all_file_bytes[start:end]
+		value_hash = hashlib.sha256(value_bytes).hexdigest()
+		return key, start, end, indent_level, indent_with, value_hash, end
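
For context, a minimal sketch of how these two pieces fit together, mirroring the indexer.write(*IndexManager.create_index(...)) call site in io_unsafe.py below. The Indexer name and its write(...) signature are assumptions taken from indexing.py, and the "users" database is assumed to already exist:

from dictdatabase import indexing, io_bytes, searching
from dictdatabase.index_manager import IndexManager

# Locate the value of "Ben", then cache its position in the index file
all_file_bytes = io_bytes.read("users")
start, end, found = searching.search_value_position_in_db(all_file_bytes, "Ben")
if found:
	indexing.Indexer("users").write(*IndexManager.create_index(all_file_bytes, "Ben", start, end))
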
diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py
index c5eaabc..b147f92 100644
--- a/dictdatabase/indexing.py
+++ b/dictdatabase/indexing.py
@@ -1,7 +1,10 @@
-import orjson
 import os
+
+import orjson
+
 from . import config
+
 
 
 # Problem: Multiple read processes will concurrently read and write the same file
 # In some cases this will result in an empty read error, that's why the try-except exists
diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py
index 7204a30..7c8cba2 100644
--- a/dictdatabase/io_unsafe.py
+++ b/dictdatabase/io_unsafe.py
@@ -1,10 +1,19 @@
 from __future__ import annotations
-from typing import Tuple
+
+import hashlib
+import json
 from dataclasses import dataclass
+from typing import Tuple
+
 import orjson
-import json
-import hashlib
-from . import config, utils, byte_codes, indexing, io_bytes
+
+from . import byte_codes
+from . import config
+from . import indexing
+from . import io_bytes
+from . import searching
+from . import utils
+from .index_manager import IndexManager
 
 
 @dataclass(frozen=True)  # slots=True not supported by python 3.8 and 3.9
@@ -79,21 +88,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:
 
 	# Not found in index file, search for key in the entire file
 	all_file_bytes = io_bytes.read(db_name)
-	key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
-
-	if key_end == -1:
+	start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
+	if not found:
 		return None
-
-	# Key found, now determine the bounding byte indices of the value
-	start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
-	end = utils.seek_index_through_value_bytes(all_file_bytes, start)
-
-	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
 	value_bytes = all_file_bytes[start:end]
-	value_hash = hashlib.sha256(value_bytes).hexdigest()
-
 	# Write key info to index file
-	indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
+	indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end))
 	return orjson.loads(value_bytes)
@@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
 		return partial_handle
 
 	# Not found in index file, search for key in the entire file
-	key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
+	position = searching.search_key_position_in_db(all_file_bytes, key)
 
-	if key_end == -1:
+	if not position.found:
 		raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")
 
 	# Key found, now determine the bounding byte indices of the value
-	start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
+	start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0)
 	end = utils.seek_index_through_value_bytes(all_file_bytes, start)
 
-	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
+	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte)
 	partial_value = orjson.loads(all_file_bytes[start:end])
 	prefix_bytes = all_file_bytes[:start] if config.use_compression else None
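
To make the bounds logic above concrete, a standalone sketch of the search primitive on a raw JSON byte string. The expected output assumes find_outermost_key_in_json_bytes places key_end just past the colon, which is what the SPACE check above implies:

from dictdatabase import searching

raw = b'{"Ben": {"age": 30}, "Bob": {"age": 31}}'
pos = searching.find_key_position_in_bytes(raw, "Bob")
if pos.found:
	# start_byte/end_byte delimit the raw bytes of the key's value
	print(raw[pos.start_byte:pos.end_byte])  # expected: b'{"age": 31}'
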
+ """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return SearchResult(start_byte=-1, end_byte=-1, found=False) + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return SearchResult(start_byte=start, end_byte=end, found=True) + + +def search_key_position_in_db( + file: bytes, key: str, glom_searching=True +) -> SearchResult: + original_value_start = 0 + original_value_end = len(file) + original_key_start = 0 + original_key_end = len(file) + for k in key.split(".") if glom_searching else [key]: + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) + if key_end == -1: + return SearchResult(start_byte=-1, end_byte=-1, found=False) + original_key_end = original_value_start + key_end + original_key_start = original_value_start + key_start + position = find_key_position_in_bytes(file, k) + original_value_end = original_value_start + original_value_end + original_value_start += position.start_byte + file = file[original_value_start:original_value_end] + return SearchResult(start_byte=original_key_start, end_byte=original_key_end, found=True) + + +def search_value_position_in_db( + all_file_bytes: bytes, key: str, glom_searching=True +) -> Tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + position = find_key_position_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not position.found: + return -1, -1, False + original_end = original_start + position.end_byte + original_start += position.start_byte + return original_start, original_end, True diff --git a/tests/benchmark/run_parallel.py b/tests/benchmark/run_parallel.py index 6065f51..7f0799d 100644 --- a/tests/benchmark/run_parallel.py +++ b/tests/benchmark/run_parallel.py @@ -89,9 +89,9 @@ class Scenario: ops: int = 10 def print(self): - res = f"✨ Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)" - res += ", 🔸 compression" if self.use_compression else "" - res += ", 💎 big file" if self.big_file else "" + res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)" + res += ", [] compression" if self.use_compression else "" + res += ", {} big file" if self.big_file else "" print(res) diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py new file mode 100644 index 0000000..cc1ec05 --- /dev/null +++ b/tests/test_glom_like_searching.py @@ -0,0 +1,31 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.job").read() == "Software Engineer" + + +def test_without_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="Ben").read() == { + "job": {"age": 30, "job": "Software Engineer"} + } + + +def 
diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py
new file mode 100644
index 0000000..cc1ec05
--- /dev/null
+++ b/tests/test_glom_like_searching.py
@@ -0,0 +1,31 @@
+import dictdatabase as DDB
+
+data = {
+	"users": {
+		"Ben": {"age": 30, "job": "Software Engineer"},
+		"Bob": {"age": 30, "job": "Plumbers"},
+	},
+	"Ben": {"job": {"age": 30, "job": "Software Engineer"}},
+}
+
+
+def test_glom_searching():
+	DDB.at("users").create(data, force_overwrite=True)
+	assert DDB.at("users", key="users.Ben.job").read() == "Software Engineer"
+
+
+def test_without_glom_searching():
+	DDB.at("users").create(data, force_overwrite=True)
+	assert DDB.at("users", key="Ben").read() == {
+		"job": {"age": 30, "job": "Software Engineer"}
+	}
+
+
+def test_glom_searching_if_key_not_exists():
+	DDB.at("users").create(data, force_overwrite=True)
+	assert DDB.at("users", key="users.Job.Ben").read() is None
+
+
+def test_glom_searching_if_subkey_not_exists():
+	DDB.at("users").create(data, force_overwrite=True)
+	assert DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read() is None
diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py
new file mode 100644
index 0000000..4255cd0
--- /dev/null
+++ b/tests/test_glom_writing.py
@@ -0,0 +1,27 @@
+import pytest
+
+import dictdatabase as DDB
+
+data = {
+	"users": {
+		"Ben": {"age": 30, "job": "Software Engineer"},
+		"Bob": {"age": 30, "job": "Plumbers"},
+	},
+	"Ben": {"job": {"age": 30, "job": "Software Engineer"}},
+}
+
+
+def test_glom_writing():
+	DDB.at("users").create(data, force_overwrite=True)
+	with DDB.at("users", key="users.Ben").session() as (session, ben):
+		ben["status"] = "cancelled"
+		session.write()
+	assert DDB.at("users", key="users.Ben.status").read() == "cancelled"
+
+
+def test_glom_writing_sub_key_not_exists():
+	DDB.at("users").create(data, force_overwrite=True)
+	with pytest.raises(KeyError):
+		with DDB.at("users", key="users.SUBKEY").session() as (session, subkey):
+			subkey["status"] = "cancelled"
+			session.write()
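
For reference, the read/write flow these tests exercise, as a standalone sketch; the database name and keys mirror the tests above, while the job values are made up:

import dictdatabase as DDB

DDB.at("users").create({"users": {"Bob": {"age": 30, "job": "Plumber"}}}, force_overwrite=True)
# Open a session scoped to a dotted key, mutate the value, then persist it
with DDB.at("users", key="users.Bob").session() as (session, bob):
	bob["job"] = "Carpenter"
	session.write()
print(DDB.at("users", key="users.Bob.job").read())  # "Carpenter"

A missing key raises KeyError when the session is opened (see test_glom_writing_sub_key_not_exists), whereas read() returns None for missing keys.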