Skip to content

Commit

Permalink
improve data flow
Browse files — browse the repository at this point in the history
  • Loading branch information
mkrd committed Nov 10, 2022
1 parent e33fe59 commit 10390b0
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions dictdatabase/io_unsafe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations
from typing import Tuple
from dataclasses import dataclass
import orjson
import json
Expand Down Expand Up @@ -124,17 +125,26 @@ def write(db_name: str, data: dict):
################################################################################


def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key):
def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key) -> Tuple[PartialFileHandle | None, bytes | None]:
"""
Try to get a partial file handle by using the key entry in the index file.
If the data could be read from the index file, a tuple of the partial file
handle and None is returned.
If the data could not be read from the index file, a tuple of None and the file
bytes is returned, so that the file bytes can be searched for the key.
"""

if (index := indexer.get(key)) is None:
return None
return None, io_bytes.read(db_name)
value_start, value_end, indent_level, indent_with, value_hash = index

# If compression is enabled, all data has to be read from the file
if config.use_compression:
data_bytes = io_bytes.read(db_name)
value_bytes = data_bytes[value_start:value_end]
if value_hash != hashlib.sha256(value_bytes).hexdigest():
return None
return None, data_bytes
value_data = orjson.loads(value_bytes)
partial_dict = PartialDict(data_bytes[:value_start], key, value_data, value_start, data_bytes[value_end:])

Expand All @@ -144,11 +154,13 @@ def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key)
value_length = value_end - value_start
value_bytes = value_and_suffix_bytes[:value_length]
if value_hash != hashlib.sha256(value_bytes).hexdigest():
return None
# If the hashes don't match, read the prefix to concat the full file bytes
prefix_bytes = io_bytes.read(db_name, 0, value_start)
return None, prefix_bytes + value_and_suffix_bytes
value_data = orjson.loads(value_bytes)
partial_dict = PartialDict(None, key, value_data, value_start, value_and_suffix_bytes[value_length:])

return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None



Expand All @@ -163,30 +175,31 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
If the key is not found, a `KeyError` is raised.
"""

data = io_bytes.read(db_name)

# Search for key in the index file
indexer = indexing.Indexer(db_name)
if (partial_file_handle := try_get_parial_file_handle_by_index(indexer, db_name, key)) is not None:
return partial_file_handle
partial_handle, data_bytes = try_get_parial_file_handle_by_index(indexer, db_name, key)
if partial_handle is not None:
return partial_handle

# Not found in index file, search for key in the entire file
key_start, key_end = utils.find_outermost_key_in_json_bytes(data, key)
key_start, key_end = utils.find_outermost_key_in_json_bytes(data_bytes, key)

if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounds of the value
value_start = key_end + (1 if data[key_end] == byte_codes.SPACE else 0)
value_end = utils.seek_index_through_value_bytes(data, value_start)
value_start = key_end + (1 if data_bytes[key_end] == byte_codes.SPACE else 0)
value_end = utils.seek_index_through_value_bytes(data_bytes, value_start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(data, key_start)
partial_bytes = data[value_start:value_end]
indent_level, indent_with = utils.detect_indentation_in_json_bytes(data_bytes, key_start)
partial_bytes = data_bytes[value_start:value_end]

# Write key info to index file

partial_value = orjson.loads(partial_bytes)
partial_dict = PartialDict(data[:value_start], key, partial_value, value_start, data[value_end:])
prefix_bytes = data_bytes[:value_start] if config.use_compression else None
partial_dict = PartialDict(prefix_bytes, key, partial_value, value_start, data_bytes[value_end:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)


Expand Down

0 comments on commit 10390b0

Please sign in to comment.