Skip to content

Commit

Permalink
add Index dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
Danil Tolmachev authored and mkrd committed Nov 27, 2022
1 parent 456aef2 commit 4d0c4b8
Show file tree
Hide file tree
Showing 4 changed files with 289 additions and 247 deletions.
11 changes: 11 additions & 0 deletions dictdatabase/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,14 @@ class SearchResult:
start_byte: int
end_byte: int
found: bool


@dataclasses.dataclass(frozen=True)
class Index:
    """
    Immutable record with the index information of one key of a database file.

    Instances are built by IndexManager.create_index and consumed by
    Indexer.write, which persists the first five positional values after
    `key` as the index entry of that key.
    """

    # The key inside the database json file that this entry describes.
    key: str
    # NOTE(review): despite the name, IndexManager.create_index passes the
    # start offset of the *value* here, and Indexer.write stores it as the
    # entry's start_index.
    key_start: int
    # NOTE(review): likewise receives the end offset of the *value*;
    # Indexer.write stores it as the entry's end_index.
    key_end: int
    # Indent level and indent string, as reported by
    # utils.detect_indentation_in_json_bytes for this key.
    indent_level: int
    indent_with: str
    # Hex digest of the sha256 hash of the value bytes.
    value_hash: str
    # End offset of the value before it was (re)written. Indexer.write shifts
    # every entry that starts after this offset by key_end - old_value_end.
    old_value_end: int
41 changes: 20 additions & 21 deletions dictdatabase/index_manager.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
import hashlib

from dictdatabase import utils
from dictdatabase.dataclasses import Index


class IndexManager:
    """Factory for the Index records that are stored by the Indexer."""

    @staticmethod
    def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> Index:
        """
        Build the Index record for a key whose value occupies
        all_file_bytes[start:end].

        Args:
            all_file_bytes (bytes): The entire file as a byte string.
            key (str): The key of the value we're indexing.
            start: The start of the value in the file.
            end: The end of the value in the file (used as the exclusive
                bound of the slice that is hashed).

        Returns:
            An Index holding the key, the start and end of the value, the
            indentation of the key (level and indent string), the sha256 hex
            digest of the value bytes, and `end` again as old_value_end.
        """
        # Only the start of the key is needed, to detect the indentation in
        # front of it; the end of the key is not used.
        key_start, _ = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
        indent_level, indent_with = utils.detect_indentation_in_json_bytes(
            all_file_bytes, key_start
        )
        value_hash = hashlib.sha256(all_file_bytes[start:end]).hexdigest()
        # Keyword arguments make it explicit that the Index fields named
        # key_start / key_end receive the offsets of the value, and that
        # old_value_end equals the current end (the value is indexed as-is).
        return Index(
            key=key,
            key_start=start,
            key_end=end,
            indent_level=indent_level,
            indent_with=indent_with,
            value_hash=value_hash,
            old_value_end=end,
        )
121 changes: 63 additions & 58 deletions dictdatabase/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import orjson

from . import config
from .dataclasses import Index


# Problem: Multiple read processes will concurrently read and write the same file
Expand All @@ -25,61 +26,65 @@


class Indexer:
    """
    The Indexer takes the name of a database file, and tries to load the .index
    file of the corresponding database file.

    The name of the index file is the name of the database file, with the
    extension .index and all "/" replaced with "___". It is stored in the
    ".ddb" directory inside config.storage_directory.

    The content of the index file is a json object, where the keys are keys
    inside the database json file, and the values are lists of 5 elements:
    - start_index: The start offset of the value of the key in the database file
    - end_index: The end offset of the value of the key in the database file
    - indent_level: The indent level of the key in the database file
    - indent_with: The indent string used.
    - value_hash: The hash of the value bytes
    """

    __slots__ = ("data", "path")

    def __init__(self, db_name: str):
        """
        Load the index of the database `db_name`.

        self.data becomes an empty dict when the index file does not exist or
        does not contain valid json.
        """
        # Make path of index file
        db_name = db_name.replace("/", "___")
        self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")

        os.makedirs(os.path.dirname(self.path), exist_ok=True)

        # Open the file directly instead of checking for its existence first:
        # another process may remove the index file between the check and the
        # open, and a missing file simply means that there is no index yet.
        try:
            with open(self.path, "rb") as f:
                self.data = orjson.loads(f.read())
        except (FileNotFoundError, orjson.JSONDecodeError):
            self.data = {}

    def get(self, key):
        """
        Returns a list of 5 elements for a key if it exists, otherwise None
        Elements: [start_index, end_index, indent_level, indent_with, value_hash]
        """
        return self.data.get(key)

    def write(self, index: Index):
        """
        Write index information for a key to the index file.

        If the key already has an entry, every entry that starts after
        index.old_value_end is shifted by index.key_end - index.old_value_end
        before the entry of the key itself is replaced. The complete index is
        then written to self.path.
        """
        if self.data.get(index.key) is not None:
            # The value of an already indexed key may have changed its length,
            # which moves everything stored behind it in the database file.
            delta = index.key_end - index.old_value_end
            for entry in self.data.values():
                if entry[0] > index.old_value_end:
                    entry[0] += delta
                    entry[1] += delta

        # Index.key_start / Index.key_end are stored as the start_index and
        # end_index of the entry.
        self.data[index.key] = [
            index.key_start,
            index.key_end,
            index.indent_level,
            index.indent_with,
            index.value_hash,
        ]
        with open(self.path, "wb") as f:
            f.write(orjson.dumps(self.data))
Loading

0 comments on commit 4d0c4b8

Please sign in to comment.