From 8bf46041dae44c9aa39472d135689cc772913b2b Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Mon, 21 Nov 2022 21:45:05 +0300 Subject: [PATCH 01/10] add glom like searching --- dictdatabase/index_manager.py | 28 +++++++++++++++++ dictdatabase/io_unsafe.py | 17 ++++------- dictdatabase/searching.py | 50 +++++++++++++++++++++++++++++++ tests/test_glom_like_searching.py | 21 +++++++++++++ 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 dictdatabase/index_manager.py create mode 100644 dictdatabase/searching.py create mode 100644 tests/test_glom_like_searching.py diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py new file mode 100644 index 0000000..a7dd8d3 --- /dev/null +++ b/dictdatabase/index_manager.py @@ -0,0 +1,28 @@ +import hashlib + +from dictdatabase import utils + + +class IndexManager: + @staticmethod + def create_index(all_file_bytes: bytes, key: str, start, end): + """ + It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its + value + + Args: + all_file_bytes (bytes): The entire file as a byte string. + key (str): The key of the value we're indexing. + start: the start of the value in the file + end: the end of the value in the file + + Returns: + The key, start, end, indent_level, indent_with, value_hash, end + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, key_start + ) + value_bytes = all_file_bytes[start:end] + value_hash = hashlib.sha256(value_bytes).hexdigest() + return key, start, end, indent_level, indent_with, value_hash, end diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 7204a30..5d2f242 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -5,6 +5,8 @@ import json import hashlib from . import config, utils, byte_codes, indexing, io_bytes +from .index_manager import IndexManager +from .searching import Searcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -79,21 +81,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) - - if key_end == -1: + start, end, found = Searcher().search(all_file_bytes, key) + if not found: return None - - # Key found, now determine the bounding byte indices of the value - start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) - end = utils.seek_index_through_value_bytes(all_file_bytes, start) - - indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) value_bytes = all_file_bytes[start:end] - value_hash = hashlib.sha256(value_bytes).hexdigest() - # Write key info to index file - indexer.write(key, start, end, indent_level, indent_with, value_hash, end) + indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end)) return orjson.loads(value_bytes) diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py new file mode 100644 index 0000000..0dc55b4 --- /dev/null +++ b/dictdatabase/searching.py @@ -0,0 +1,50 @@ +from dictdatabase import byte_codes +from dictdatabase import utils + + +class Searcher: + @staticmethod + def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: + """ + It finds the start and end indices of the value of a key in a JSON file + + Args: + file (bytes): bytes + key (str): The key to find in the JSON file. + + Returns: + A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return -1, -1, False + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return start, end, True + + def search( + self, all_file_bytes: bytes, key: str, glom_searching=True + ) -> tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + start, end, found = self.find_start_end_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not found: + return -1, -1, False + original_end = original_start + end + original_start += start + return original_start, original_end, True diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py new file mode 100644 index 0000000..cea6670 --- /dev/null +++ b/tests/test_glom_like_searching.py @@ -0,0 +1,21 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.job").read() == 'Software Engineer' + + +def test_without_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="Ben").read() == { + "job": {"age": 30, "job": "Software Engineer"} + } From 8796aea7e4fe201cf65860f000be747bb308d6a4 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Mon, 21 Nov 2022 21:46:59 +0300 Subject: [PATCH 02/10] reformat imports --- dictdatabase/indexing.py | 5 ++++- dictdatabase/io_unsafe.py | 15 +++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py index c5eaabc..b147f92 100644 --- a/dictdatabase/indexing.py +++ b/dictdatabase/indexing.py @@ -1,7 +1,10 @@ -import orjson import os + +import orjson + from . import config + # Problem: Multiple read processes will concurrently read and write the same file # In some cases this will result in a empty read error, thats why the try-except exists diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 5d2f242..5a2b09a 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -1,10 +1,17 @@ from __future__ import annotations -from typing import Tuple + +import hashlib +import json from dataclasses import dataclass +from typing import Tuple + import orjson -import json -import hashlib -from . import config, utils, byte_codes, indexing, io_bytes + +from . import byte_codes +from . import config +from . import indexing +from . import io_bytes +from . import utils from .index_manager import IndexManager from .searching import Searcher From 4d80f328607000e99fee0db7492c86349607732f Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 10:27:26 +0300 Subject: [PATCH 03/10] fix type hinting --- dictdatabase/searching.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 0dc55b4..81c4e30 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -1,10 +1,12 @@ +from typing import Tuple + from dictdatabase import byte_codes from dictdatabase import utils class Searcher: @staticmethod - def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: + def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ It finds the start and end indices of the value of a key in a JSON file @@ -24,7 +26,7 @@ def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: def search( self, all_file_bytes: bytes, key: str, glom_searching=True - ) -> tuple[int, int, bool]: + ) -> Tuple[int, int, bool]: """ It takes a byte string, a key, and a boolean, and returns a tuple of three integers From 5d5a1034b139db04b9792e182d235339bdd5d901 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 11:20:54 +0300 Subject: [PATCH 04/10] add tests --- tests/test_glom_like_searching.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py index cea6670..cc1ec05 100644 --- a/tests/test_glom_like_searching.py +++ b/tests/test_glom_like_searching.py @@ -11,7 +11,7 @@ def test_glom_searching(): DDB.at("users").create(data, force_overwrite=True) - assert DDB.at("users", key="users.Ben.job").read() == 'Software Engineer' + assert DDB.at("users", key="users.Ben.job").read() == "Software Engineer" def test_without_glom_searching(): @@ -19,3 +19,13 @@ def test_without_glom_searching(): assert DDB.at("users", key="Ben").read() == { "job": {"age": 30, "job": "Software Engineer"} } + + +def test_glom_searching_if_key_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Job.Ben").read() is None + + +def test_glom_searching_if_subkey_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read() is None From 8940e493ef490062bd9a0f6c59854f395a0ec922 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 11:21:07 +0300 Subject: [PATCH 05/10] rename Searcher -> KeySearcher --- dictdatabase/io_unsafe.py | 4 ++-- dictdatabase/searching.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 5a2b09a..d9ea9ab 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -13,7 +13,7 @@ from . import io_bytes from . import utils from .index_manager import IndexManager -from .searching import Searcher +from .searching import KeySearcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = Searcher().search(all_file_bytes, key) + start, end, found = KeySearcher().search(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 81c4e30..cc6177d 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -4,7 +4,7 @@ from dictdatabase import utils -class Searcher: +class KeySearcher: @staticmethod def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ From f96351788c8c6f82257a770735829e01ddbd4ef9 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:08:18 +0300 Subject: [PATCH 06/10] add partial write --- dictdatabase/io_unsafe.py | 8 +-- dictdatabase/searching.py | 103 ++++++++++++++++++++++--------------- tests/test_glom_writing.py | 17 ++++++ 3 files changed, 82 insertions(+), 46 deletions(-) create mode 100644 tests/test_glom_writing.py diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index d9ea9ab..83a85a9 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -11,9 +11,9 @@ from . import config from . import indexing from . import io_bytes +from . import searching from . import utils from .index_manager import IndexManager -from .searching import KeySearcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = KeySearcher().search(all_file_bytes, key) + start, end, found = searching.search_value_by_key(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] @@ -185,9 +185,9 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + key_start, key_end, found = searching.search_key(all_file_bytes, key) - if key_end == -1: + if not found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") # Key found, now determine the bounding byte indices of the value diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index cc6177d..f661bde 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -1,52 +1,71 @@ from typing import Tuple +import orjson + from dictdatabase import byte_codes from dictdatabase import utils -class KeySearcher: - @staticmethod - def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: - """ - It finds the start and end indices of the value of a key in a JSON file +def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: + """ + It finds the start and end indices of the value of a key in a JSON file + + Args: + file (bytes): bytes + key (str): The key to find in the JSON file. - Args: - file (bytes): bytes - key (str): The key to find in the JSON file. + Returns: + A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return -1, -1, False + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return start, end, True - Returns: - A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. - """ - key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + +def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: + original_value_start = 0 + original_value_end = len(file) + original_key_start = 0 + original_key_end = len(file) + for k in key.split(".") if glom_searching else [key]: + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) if key_end == -1: return -1, -1, False - start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) - end = utils.seek_index_through_value_bytes(file, start) - return start, end, True - - def search( - self, all_file_bytes: bytes, key: str, glom_searching=True - ) -> Tuple[int, int, bool]: - """ - It takes a byte string, a key, and a boolean, and returns a tuple of three integers - - Args: - all_file_bytes (bytes): The bytes of the file you're searching in. - key (str): The key to search for. - glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to - True - - Returns: - The start and end of the key in the file. - """ - original_start = 0 - original_end = len(all_file_bytes) - for k in key.split(".") if glom_searching else [key]: - start, end, found = self.find_start_end_in_bytes( - all_file_bytes[original_start:original_end], k - ) - if not found: - return -1, -1, False - original_end = original_start + end - original_start += start - return original_start, original_end, True + original_key_end = original_value_start + key_end + original_key_start = original_value_start + key_start + value_start, value_end, found = find_start_end_in_bytes(file, k) + original_value_end = original_value_start + original_value_end + original_value_start += value_start + file = file[original_value_start:original_value_end] + return original_key_start, original_key_end, True + + +def search_value_by_key( + all_file_bytes: bytes, key: str, glom_searching=True +) -> Tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + start, end, found = find_start_end_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not found: + return -1, -1, False + original_end = original_start + end + original_start += start + return original_start, original_end, True diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py new file mode 100644 index 0000000..2702884 --- /dev/null +++ b/tests/test_glom_writing.py @@ -0,0 +1,17 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_writing(): + DDB.at("users").create(data, force_overwrite=True) + with DDB.at("users", key="users.Ben").session() as (session, purchase): + purchase["status"] = "cancelled" + session.write() + assert DDB.at("users", key="users.Ben.status").read() == "cancelled" From b1a2e8e27f6c0a99a11dd7610d5d2e449ff956da Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:10:51 +0300 Subject: [PATCH 07/10] fix print compatibility --- tests/benchmark/run_parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/run_parallel.py b/tests/benchmark/run_parallel.py index 6065f51..7f0799d 100644 --- a/tests/benchmark/run_parallel.py +++ b/tests/benchmark/run_parallel.py @@ -89,9 +89,9 @@ class Scenario: ops: int = 10 def print(self): - res = f"✨ Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)" - res += ", 🔸 compression" if self.use_compression else "" - res += ", 💎 big file" if self.big_file else "" + res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)" + res += ", [] compression" if self.use_compression else "" + res += ", {} big file" if self.big_file else "" print(res) From 193e49ff90bcaf410317d8e516600131fc3f1094 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:16:08 +0300 Subject: [PATCH 08/10] renaming --- dictdatabase/io_unsafe.py | 4 ++-- dictdatabase/searching.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 83a85a9..fbc405b 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = searching.search_value_by_key(all_file_bytes, key) + start, end, found = searching.search_value_position_in_db(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] @@ -185,7 +185,7 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end, found = searching.search_key(all_file_bytes, key) + key_start, key_end, found = searching.search_key_position_in_db(all_file_bytes, key) if not found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index f661bde..62ed857 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -6,7 +6,7 @@ from dictdatabase import utils -def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: +def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ It finds the start and end indices of the value of a key in a JSON file @@ -25,7 +25,7 @@ def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: return start, end, True -def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: +def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: original_value_start = 0 original_value_end = len(file) original_key_start = 0 @@ -36,14 +36,14 @@ def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bo return -1, -1, False original_key_end = original_value_start + key_end original_key_start = original_value_start + key_start - value_start, value_end, found = find_start_end_in_bytes(file, k) + value_start, value_end, found = find_key_position_in_bytes(file, k) original_value_end = original_value_start + original_value_end original_value_start += value_start file = file[original_value_start:original_value_end] return original_key_start, original_key_end, True -def search_value_by_key( +def search_value_position_in_db( all_file_bytes: bytes, key: str, glom_searching=True ) -> Tuple[int, int, bool]: """ @@ -61,7 +61,7 @@ def search_value_by_key( original_start = 0 original_end = len(all_file_bytes) for k in key.split(".") if glom_searching else [key]: - start, end, found = find_start_end_in_bytes( + start, end, found = find_key_position_in_bytes( all_file_bytes[original_start:original_end], k ) if not found: From d86fb47dc558376ab8140c245d24eaae088c7026 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:25:27 +0300 Subject: [PATCH 09/10] add negative test --- tests/test_glom_writing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py index 2702884..4255cd0 100644 --- a/tests/test_glom_writing.py +++ b/tests/test_glom_writing.py @@ -1,3 +1,5 @@ +import pytest + import dictdatabase as DDB data = { @@ -15,3 +17,11 @@ def test_glom_writing(): purchase["status"] = "cancelled" session.write() assert DDB.at("users", key="users.Ben.status").read() == "cancelled" + + +def test_glom_writing_sub_key_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + with pytest.raises(KeyError): + with DDB.at("users", key="users.SUBKEY").session() as (session, purchase): + purchase["status"] = "cancelled" + session.write() From c8918e502c2d4885fb1ab7bf9143d42c09645133 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:38:11 +0300 Subject: [PATCH 10/10] add dataclass for searching --- dictdatabase/dataclasses.py | 8 ++++++++ dictdatabase/io_unsafe.py | 8 ++++---- dictdatabase/searching.py | 27 +++++++++++++++------------ 3 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 dictdatabase/dataclasses.py diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py new file mode 100644 index 0000000..2c54e19 --- /dev/null +++ b/dictdatabase/dataclasses.py @@ -0,0 +1,8 @@ +import dataclasses + + +@dataclasses.dataclass(frozen=True) +class SearchResult: + start_byte: int + end_byte: int + found: bool diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index fbc405b..7c8cba2 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end, found = searching.search_key_position_in_db(all_file_bytes, key) + position = searching.search_key_position_in_db(all_file_bytes, key) - if not found: + if not position.found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") # Key found, now determine the bounding byte indices of the value - start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) + start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0) end = utils.seek_index_through_value_bytes(all_file_bytes, start) - indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) + indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte) partial_value = orjson.loads(all_file_bytes[start:end]) prefix_bytes = all_file_bytes[:start] if config.use_compression else None diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 62ed857..697819b 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -4,9 +4,10 @@ from dictdatabase import byte_codes from dictdatabase import utils +from dictdatabase.dataclasses import SearchResult -def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: +def find_key_position_in_bytes(file: bytes, key: str) -> SearchResult: """ It finds the start and end indices of the value of a key in a JSON file @@ -19,13 +20,15 @@ def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) if key_end == -1: - return -1, -1, False + return SearchResult(start_byte=-1, end_byte=-1, found=False) start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) end = utils.seek_index_through_value_bytes(file, start) - return start, end, True + return SearchResult(start_byte=start, end_byte=end, found=True) -def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: +def search_key_position_in_db( + file: bytes, key: str, glom_searching=True +) -> SearchResult: original_value_start = 0 original_value_end = len(file) original_key_start = 0 @@ -33,14 +36,14 @@ def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tup for k in key.split(".") if glom_searching else [key]: key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) if key_end == -1: - return -1, -1, False + return SearchResult(start_byte=-1, end_byte=-1, found=False) original_key_end = original_value_start + key_end original_key_start = original_value_start + key_start - value_start, value_end, found = find_key_position_in_bytes(file, k) + position = find_key_position_in_bytes(file, k) original_value_end = original_value_start + original_value_end - original_value_start += value_start + original_value_start += position.start_byte file = file[original_value_start:original_value_end] - return original_key_start, original_key_end, True + return SearchResult(start_byte=original_key_start, end_byte=original_key_end, found=True) def search_value_position_in_db( @@ -61,11 +64,11 @@ def search_value_position_in_db( original_start = 0 original_end = len(all_file_bytes) for k in key.split(".") if glom_searching else [key]: - start, end, found = find_key_position_in_bytes( + position = find_key_position_in_bytes( all_file_bytes[original_start:original_end], k ) - if not found: + if not position.found: return -1, -1, False - original_end = original_start + end - original_start += start + original_end = original_start + position.end_byte + original_start += position.start_byte return original_start, original_end, True