Skip to content

Commit

Permalink
Merge pull request #42 from UmbrellaMalware/main
Browse files Browse the repository at this point in the history
Add glom-like searching for keys
  • Loading branch information
UmbrellaMalware authored Nov 23, 2022
2 parents 93d0df0 + fc1d8b7 commit a25afcf
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 24 deletions.
8 changes: 8 additions & 0 deletions dictdatabase/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import dataclasses


@dataclasses.dataclass(frozen=True)
class SearchResult:
    """Immutable byte span located by a key/value search in a JSON byte string."""
    start_byte: int  # start offset of the match, -1 when not found
    end_byte: int  # end offset of the match (exclusive, as used in slicing), -1 when not found
    found: bool  # whether the searched key was present
28 changes: 28 additions & 0 deletions dictdatabase/index_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import hashlib

from dictdatabase import utils


class IndexManager:
    @staticmethod
    def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> tuple:
        """
        Build the index entry for *key* whose value spans ``all_file_bytes[start:end]``.

        Args:
            all_file_bytes (bytes): The entire JSON file as a byte string.
            key (str): The key whose value is being indexed.
            start (int): Byte offset where the value starts in the file.
            end (int): Byte offset where the value ends in the file.

        Returns:
            Tuple of ``(key, start, end, indent_level, indent_with, value_hash,
            old_value_end)``. ``end`` appears twice on purpose: the last element
            records the previous end of the value, which here equals ``end``.
        """
        # Only the key's start position is needed (to detect its indentation);
        # the end position of the key token itself is not used here.
        key_start, _ = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
        indent_level, indent_with = utils.detect_indentation_in_json_bytes(
            all_file_bytes, key_start
        )
        # Hash the value bytes so later reads can verify the index entry is current.
        value_hash = hashlib.sha256(all_file_bytes[start:end]).hexdigest()
        return key, start, end, indent_level, indent_with, value_hash, end
5 changes: 4 additions & 1 deletion dictdatabase/indexing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import orjson
import os

import orjson

from . import config


# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in an empty read error; that's why the try-except exists

Expand Down
40 changes: 20 additions & 20 deletions dictdatabase/io_unsafe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from __future__ import annotations
from typing import Tuple

import hashlib
import json
from dataclasses import dataclass
from typing import Tuple

import orjson
import json
import hashlib
from . import config, utils, byte_codes, indexing, io_bytes

from . import byte_codes
from . import config
from . import indexing
from . import io_bytes
from . import searching
from . import utils
from .index_manager import IndexManager


@dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9
Expand Down Expand Up @@ -79,21 +88,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:

# Not found in index file, search for key in the entire file
all_file_bytes = io_bytes.read(db_name)
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)

if key_end == -1:
start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
if not found:
return None

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
value_bytes = all_file_bytes[start:end]
value_hash = hashlib.sha256(value_bytes).hexdigest()

# Write key info to index file
indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end))
return orjson.loads(value_bytes)


Expand Down Expand Up @@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
return partial_handle

# Not found in index file, search for key in the entire file
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
position = searching.search_key_position_in_db(all_file_bytes, key)

if key_end == -1:
if not position.found:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte)

partial_value = orjson.loads(all_file_bytes[start:end])
prefix_bytes = all_file_bytes[:start] if config.use_compression else None
Expand Down
74 changes: 74 additions & 0 deletions dictdatabase/searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Tuple

import orjson

from dictdatabase import byte_codes
from dictdatabase import utils
from dictdatabase.dataclasses import SearchResult


def find_key_position_in_bytes(file: bytes, key: str) -> SearchResult:
    """
    Locate the byte span of the VALUE belonging to *key* in a JSON byte string.

    Args:
        file (bytes): The JSON document as bytes.
        key (str): The key whose value should be located.

    Returns:
        A SearchResult holding the value's start and end byte offsets;
        ``found`` is False (and both offsets are -1) when the key is absent.
    """
    # Only the end of the key token matters below; its start is not needed.
    _, key_end = utils.find_outermost_key_in_json_bytes(file, key)
    if key_end == -1:
        return SearchResult(start_byte=-1, end_byte=-1, found=False)
    # Skip the single space that may follow the colon after the key.
    start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0)
    end = utils.seek_index_through_value_bytes(file, start)
    return SearchResult(start_byte=start, end_byte=end, found=True)


def search_key_position_in_db(
    file: bytes, key: str, glom_searching=True
) -> SearchResult:
    """
    Locate the byte span of the KEY token itself (not its value) in a JSON
    byte string, descending through a dotted ("glom") path like
    ``"users.Ben.job"`` when *glom_searching* is enabled.

    Args:
        file (bytes): The JSON document as bytes.
        key (str): The key, or dot-separated path of keys, to find.
        glom_searching: When True, dots in *key* separate path segments.

    Returns:
        A SearchResult with the absolute start/end offsets of the innermost
        key token; ``found`` is False (offsets -1) if any segment is missing.

    Note:
        The search window is narrowed to each segment's VALUE before the next
        segment is looked up, and offsets into the narrowed window are kept
        relative to it. (The previous implementation re-sliced the already
        narrowed buffer with absolute offsets, which broke paths of depth >= 3.)
    """
    offset = 0  # absolute offset of the current search window within the file
    key_start_abs, key_end_abs = -1, -1
    for segment in key.split(".") if glom_searching else [key]:
        key_start, key_end = utils.find_outermost_key_in_json_bytes(file, segment)
        if key_end == -1:
            return SearchResult(start_byte=-1, end_byte=-1, found=False)
        # Record this segment's key position in absolute file coordinates.
        key_start_abs = offset + key_start
        key_end_abs = offset + key_end
        # Narrow the window to this segment's value before descending further.
        position = find_key_position_in_bytes(file, segment)
        offset += position.start_byte
        file = file[position.start_byte:position.end_byte]
    return SearchResult(start_byte=key_start_abs, end_byte=key_end_abs, found=True)


def search_value_position_in_db(
    all_file_bytes: bytes, key: str, glom_searching=True
) -> Tuple[int, int, bool]:
    """
    Locate the byte span of the VALUE for *key*, following a dotted
    ("glom") path such as ``"users.Ben.job"`` when *glom_searching* is on.

    Args:
        all_file_bytes (bytes): The JSON file content.
        key (str): The key, or dot-separated path of keys, to search for.
        glom_searching: When True, dots in *key* separate path segments.

    Returns:
        ``(start, end, found)`` — absolute byte offsets of the value plus a
        success flag; ``(-1, -1, False)`` when any path segment is missing.
    """
    window_start, window_end = 0, len(all_file_bytes)
    segments = key.split(".") if glom_searching else [key]
    for segment in segments:
        # Search only inside the current window (the parent segment's value).
        window = all_file_bytes[window_start:window_end]
        result = find_key_position_in_bytes(window, segment)
        if not result.found:
            return -1, -1, False
        # Translate the window-relative span back to absolute offsets.
        window_end = window_start + result.end_byte
        window_start += result.start_byte
    return window_start, window_end, True
6 changes: 3 additions & 3 deletions tests/benchmark/run_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ class Scenario:
ops: int = 10

def print(self):
    """Print a one-line summary of this scenario's configuration.

    Markers: one '*' per reader, one '#' per writer, plus flags for
    compression and big-file mode.
    """
    # The earlier emoji-based summary lines were dead code (immediately
    # shadowed by the ASCII version below) and have been removed.
    res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)"
    res += ", [] compression" if self.use_compression else ""
    res += ", {} big file" if self.big_file else ""
    print(res)


Expand Down
31 changes: 31 additions & 0 deletions tests/test_glom_like_searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import dictdatabase as DDB

# Fixture: a nested structure with "Ben" present at two different depths,
# so tests can distinguish glom (dotted-path) lookups from plain key lookups.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}


def test_glom_searching():
    """A dotted key drills into nested dicts segment by segment."""
    DDB.at("users").create(data, force_overwrite=True)
    job = DDB.at("users", key="users.Ben.job").read()
    assert job == "Software Engineer"


def test_without_glom_searching():
    """A plain key matches the outermost occurrence, without path traversal."""
    DDB.at("users").create(data, force_overwrite=True)
    expected = {"job": {"age": 30, "job": "Software Engineer"}}
    assert DDB.at("users", key="Ben").read() == expected


def test_glom_searching_if_key_not_exists():
    """A missing intermediate path segment makes the read return None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Job.Ben").read()
    assert result is None


def test_glom_searching_if_subkey_not_exists():
    """A missing final path segment makes the read return None."""
    DDB.at("users").create(data, force_overwrite=True)
    result = DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read()
    assert result is None
27 changes: 27 additions & 0 deletions tests/test_glom_writing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest

import dictdatabase as DDB

# Fixture: a nested structure with "Ben" present at two different depths,
# so the session tests exercise dotted-path (glom) addressing on writes.
data = {
    "users": {
        "Ben": {"age": 30, "job": "Software Engineer"},
        "Bob": {"age": 30, "job": "Plumbers"},
    },
    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
}


def test_glom_writing():
    """A session opened on a dotted key writes back into the nested dict."""
    DDB.at("users").create(data, force_overwrite=True)
    with DDB.at("users", key="users.Ben").session() as (session, ben):
        ben["status"] = "cancelled"
        session.write()
    status = DDB.at("users", key="users.Ben.status").read()
    assert status == "cancelled"


def test_glom_writing_sub_key_not_exists():
    """Opening a session on a missing path segment raises KeyError."""
    DDB.at("users").create(data, force_overwrite=True)
    with pytest.raises(KeyError):
        with DDB.at("users", key="users.SUBKEY").session() as (session, item):
            item["status"] = "cancelled"
            session.write()

0 comments on commit a25afcf

Please sign in to comment.