From b05e6ee4cac7a170c8b95ce7a148e661cdcde3cc Mon Sep 17 00:00:00 2001
From: Danil Tolmachev
Date: Sun, 27 Nov 2022 18:00:20 +0300
Subject: [PATCH] add Index dataclass

---
 dictdatabase/dataclasses.py   |  11 ++
 dictdatabase/index_manager.py |  41 ++--
 dictdatabase/indexing.py      | 121 ++++++------
 dictdatabase/io_unsafe.py     | 363 ++++++++++++++++++----------------
 4 files changed, 289 insertions(+), 247 deletions(-)

diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py
index 2c54e19..9743ffb 100644
--- a/dictdatabase/dataclasses.py
+++ b/dictdatabase/dataclasses.py
@@ -6,3 +6,14 @@ class SearchResult:
 	start_byte: int
 	end_byte: int
 	found: bool
+
+
+@dataclasses.dataclass(frozen=True)
+class Index:
+    key: str
+    key_start: int  # start of the value's byte range in the db file
+    key_end: int  # end of the value's byte range in the db file
+    indent_level: int
+    indent_with: str
+    value_hash: str  # SHA-256 hex digest of the value bytes
+    old_value_end: int  # where the value ended before the current write

diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py
index a7dd8d3..579a97f 100644
--- a/dictdatabase/index_manager.py
+++ b/dictdatabase/index_manager.py
@@ -1,28 +1,27 @@
 import hashlib

 from dictdatabase import utils
+from dictdatabase.dataclasses import Index


-class IndexManager:
-	@staticmethod
-	def create_index(all_file_bytes: bytes, key: str, start, end):
-		"""
-		It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its
-		value
-
-		Args:
-			all_file_bytes (bytes): The entire file as a byte string.
-			key (str): The key of the value we're indexing.
-			start: the start of the value in the file
-			end: the end of the value in the file
-
-		Returns:
-			The key, start, end, indent_level, indent_with, value_hash, end
-		"""
-		key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
-		indent_level, indent_with = utils.detect_indentation_in_json_bytes(
-			all_file_bytes, key_start
-		)
-		value_bytes = all_file_bytes[start:end]
-		value_hash = hashlib.sha256(value_bytes).hexdigest()
-		return key, start, end, indent_level, indent_with, value_hash, end
+def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> Index:
+    """
+    Take the full JSON file as bytes, a key, and the start and end position of
+    the key's value, and return an Index describing the key and its value.
+
+    Args:
+        all_file_bytes (bytes): The entire file as a byte string.
+        key (str): The key of the value we're indexing.
+        start (int): The start of the value in the file.
+        end (int): The end of the value in the file.
+
+    Returns:
+        An Index of (key, start, end, indent_level, indent_with, value_hash, end).
+    """
+    key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
+    indent_level, indent_with = utils.detect_indentation_in_json_bytes(
+        all_file_bytes, key_start
+    )
+    value_bytes = all_file_bytes[start:end]
+    value_hash = hashlib.sha256(value_bytes).hexdigest()
+    return Index(key, start, end, indent_level, indent_with, value_hash, end)
diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py
index b147f92..bb8e36a 100644
--- a/dictdatabase/indexing.py
+++ b/dictdatabase/indexing.py
@@ -3,6 +3,7 @@
 import orjson

 from . import config
+from .dataclasses import Index


 # Problem: Multiple read processes will concurrently read and write the same file
@@ -25,61 +26,65 @@


 class Indexer:
-	"""
-	The Indexer takes the name of a database file, and tries to load the .index file
-	of the corresponding database file.
-
-	The name of the index file is the name of the database file, with the extension
-	.index and all "/" replaced with "___"
-
-	The content of the index file is a json object, where the keys are keys inside
-	the database json file, and the values are lists of 5 elements:
-	- start_index: The index of the first byte of the value of the key in the database file
-	- end_index: The index of the last byte of the value of the key in the database file
-	- indent_level: The indent level of the key in the database file
-	- indent_with: The indent string used.
-	- value_hash: The hash of the value bytes
-	"""
-
-	__slots__ = ("data", "path")
-
-	def __init__(self, db_name: str):
-		# Make path of index file
-		db_name = db_name.replace("/", "___")
-		self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
-
-		os.makedirs(os.path.dirname(self.path), exist_ok=True)
-		if not os.path.exists(self.path):
-			self.data = {}
-			return
-
-		try:
-			with open(self.path, "rb") as f:
-				self.data = orjson.loads(f.read())
-		except orjson.JSONDecodeError:
-			self.data = {}
-
-
-	def get(self, key):
-		"""
-		Returns a list of 5 elements for a key if it exists, otherwise None
-		Elements:[start_index, end_index, indent_level, indent_with, value_hash]
-		"""
-		return self.data.get(key, None)
-
-
-	def write(self, key, start_index, end_index, indent_level, indent_with, value_hash, old_value_end):
-		"""
-		Write index information for a key to the index file
-		"""
-
-		if self.data.get(key, None) is not None:
-			delta = end_index - old_value_end
-			for entry in self.data.values():
-				if entry[0] > old_value_end:
-					entry[0] += delta
-					entry[1] += delta
-
-		self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
-		with open(self.path, "wb") as f:
-			f.write(orjson.dumps(self.data))
+    """
+    The Indexer takes the name of a database file and tries to load the .index file
+    of the corresponding database file.
+
+    The name of the index file is the name of the database file, with the extension
+    .index and all "/" replaced with "___".
+
+    The content of the index file is a JSON object, where the keys are keys inside
+    the database JSON file, and the values are lists of 5 elements:
+    - start_index: The index of the first byte of the value of the key in the database file
+    - end_index: The index of the last byte of the value of the key in the database file
+    - indent_level: The indent level of the key in the database file
+    - indent_with: The indent string used.
+    - value_hash: The hash of the value bytes
+    """
+
+    __slots__ = ("data", "path")
+
+    def __init__(self, db_name: str):
+        # Make path of index file
+        db_name = db_name.replace("/", "___")
+        self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
+
+        os.makedirs(os.path.dirname(self.path), exist_ok=True)
+        if not os.path.exists(self.path):
+            self.data = {}
+            return
+
+        try:
+            with open(self.path, "rb") as f:
+                self.data = orjson.loads(f.read())
+        except orjson.JSONDecodeError:
+            self.data = {}
+
+    def get(self, key):
+        """
+        Returns a list of 5 elements for a key if it exists, otherwise None.
+        Elements: [start_index, end_index, indent_level, indent_with, value_hash]
+        """
+        return self.data.get(key, None)
+
+    def write(self, index: Index):
+        """
+        Write index information for a key to the index file.
+        """
+
+        if self.data.get(index.key, None) is not None:
+            delta = index.key_end - index.old_value_end  # how much the value grew or shrank
+            for entry in self.data.values():
+                if entry[0] > index.old_value_end:  # only entries after the old value shift
+                    entry[0] += delta
+                    entry[1] += delta
+
+        self.data[index.key] = [
+            index.key_start,
+            index.key_end,
+            index.indent_level,
+            index.indent_with,
+            index.value_hash,
+        ]
+        with open(self.path, "wb") as f:
+            f.write(orjson.dumps(self.data))
diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py
index 7c8cba2..cd77b70 100644
--- a/dictdatabase/io_unsafe.py
+++ b/dictdatabase/io_unsafe.py
@@ -13,26 +13,27 @@
 from . import io_bytes
 from . import searching
 from . import utils
-from .index_manager import IndexManager
+from .dataclasses import Index
+from .index_manager import create_index


 @dataclass(frozen=True)  # slots=True not supported by python 3.8 and 3.9
 class PartialDict:
-	prefix: bytes
-	key: str
-	value: dict
-	value_start: int
-	value_end: int
-	suffix: bytes
+    prefix: bytes
+    key: str
+    value: dict
+    value_start: int
+    value_end: int
+    suffix: bytes


 @dataclass(frozen=True)  # slots=True not supported by python 3.8 and 3.9
 class PartialFileHandle:
-	db_name: str
-	partial_dict: PartialDict
-	indent_level: int
-	indent_with: str
-	indexer: indexing.Indexer
+    db_name: str
+    partial_dict: PartialDict
+    indent_level: int
+    indent_with: str
+    indexer: indexing.Indexer


 ########################################################################################
@@ -41,13 +42,13 @@ class PartialFileHandle:


 def read(db_name: str) -> dict:
-	"""
-	Read the file at db_path from the configured storage directory.
-	Make sure the file exists. If it does notnot a FileNotFoundError is
-	raised.
-	"""
-	# Always use orjson to read the file, because it is faster
-	return orjson.loads(io_bytes.read(db_name))
+    """
+    Read the file at db_path from the configured storage directory.
+    Make sure the file exists. If it does not, a FileNotFoundError is
+    raised.
+    """
+    # Always use orjson to read the file, because it is faster
+    return orjson.loads(io_bytes.read(db_name))


 ########################################################################################
@@ -55,46 +56,48 @@ def read(db_name: str) -> dict:
 ########################################################################################


-def try_read_bytes_using_indexer(indexer: indexing.Indexer, db_name: str, key: str) -> bytes | None:
-	"""
-	Check if the key info is saved in the file's index file.
-	If it is and the value has not changed, return the value bytes.
-	Otherwise return None.
-	"""
+def try_read_bytes_using_indexer(
+    indexer: indexing.Indexer, db_name: str, key: str
+) -> bytes | None:
+    """
+    Check if the key info is saved in the file's index file.
+    If it is and the value has not changed, return the value bytes.
+    Otherwise return None.
+    """

-	if (index := indexer.get(key)) is None:
-		return None
-	start, end, _, _, value_hash = index
-	partial_bytes = io_bytes.read(db_name, start=start, end=end)
-	if value_hash != hashlib.sha256(partial_bytes).hexdigest():
-		return None
-	return partial_bytes
+    if (index := indexer.get(key)) is None:
+        return None
+    start, end, _, _, value_hash = index
+    partial_bytes = io_bytes.read(db_name, start=start, end=end)
+    if value_hash != hashlib.sha256(partial_bytes).hexdigest():
+        return None
+    return partial_bytes


 def partial_read_only(db_name: str, key: str) -> dict | None:
-	"""
-	Partially read a key from a db.
-	The key MUST be unique in the entire db, otherwise the behavior is undefined.
-	This is a lot faster than reading the entire db, because it does not parse
-	the entire file, but only the part part of the : pair.
-
-	If the key is not found, a `KeyError` is raised.
-	"""
-
-	# Search for key in the index file
-	indexer = indexing.Indexer(db_name)
-	if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None:
-		return orjson.loads(value_bytes)
-
-	# Not found in index file, search for key in the entire file
-	all_file_bytes = io_bytes.read(db_name)
-	start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
-	if not found:
-		return None
-	value_bytes = all_file_bytes[start:end]
-	# Write key info to index file
-	indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end))
-	return orjson.loads(value_bytes)
+    """
+    Partially read a key from a db.
+    The key MUST be unique in the entire db, otherwise the behavior is undefined.
+    This is a lot faster than reading the entire db, because it does not parse
+    the entire file, but only the relevant part of the key: value pair.
+
+    If the key is not found, None is returned.
+    """
+
+    # Search for key in the index file
+    indexer = indexing.Indexer(db_name)
+    if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None:
+        return orjson.loads(value_bytes)
+
+    # Not found in index file, search for key in the entire file
+    all_file_bytes = io_bytes.read(db_name)
+    start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
+    if not found:
+        return None
+    value_bytes = all_file_bytes[start:end]
+    # Write key info to index file
+    indexer.write(create_index(all_file_bytes, key, start, end))
+    return orjson.loads(value_bytes)


 ################################################################################
@@ -103,26 +106,26 @@ def partial_read_only(db_name: str, key: str) -> dict | None:


 def serialize_data_to_json_bytes(data: dict) -> bytes:
-	"""
-	Serialize the data as json bytes. Depending on the config,
-	this can be done with orjson or the standard json module.
-	Additionally config.indent is respected.
-	"""
-	if config.use_orjson:
-		option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS
-		return orjson.dumps(data, option=option)
-	else:
-		db_dump = json.dumps(data, indent=config.indent, sort_keys=True)
-		return db_dump.encode()
+    """
+    Serialize the data as JSON bytes. Depending on the config,
+    this is done with orjson or the standard json module.
+    Additionally, config.indent is respected.
+    """
+    if config.use_orjson:
+        option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS
+        return orjson.dumps(data, option=option)
+    else:
+        db_dump = json.dumps(data, indent=config.indent, sort_keys=True)
+        return db_dump.encode()


 def write(db_name: str, data: dict):
-	"""
-	Write the dict db dumped as a json string
-	to the file of the db_path.
-	"""
-	data_bytes = serialize_data_to_json_bytes(data)
-	io_bytes.write(db_name, data_bytes)
+    """
+    Write the dict db, dumped as a JSON string,
+    to the file at db_path.
+    """
+    data_bytes = serialize_data_to_json_bytes(data)
+    io_bytes.write(db_name, data_bytes)


 ################################################################################
@@ -130,105 +133,129 @@ def write(db_name: str, data: dict):
 ################################################################################


-def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key) -> Tuple[PartialFileHandle | None, bytes | None]:
-	"""
-	Try to get a partial file handle by using the key entry in the index file.
-
-	If the data could be read from the index file, a tuple of the partial file
-	handle and None is returned.
-	If the data could not be read from the index file, a tuple of None and the file
-	bytes is returned, so that the file bytes can be searched for the key.
-	"""
-
-	if (index := indexer.get(key)) is None:
-		return None, io_bytes.read(db_name)
-	start, end, indent_level, indent_with, value_hash = index
-
-	# If compression is enabled, all data has to be read from the file
-	if config.use_compression:
-		all_file_bytes = io_bytes.read(db_name)
-		value_bytes = all_file_bytes[start:end]
-		if value_hash != hashlib.sha256(value_bytes).hexdigest():
-			return None, all_file_bytes
-		value_data = orjson.loads(value_bytes)
-		partial_dict = PartialDict(all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:])
-
-	# If compression is disabled, only the value and suffix have to be read
-	else:
-		value_and_suffix_bytes = io_bytes.read(db_name, start=start)
-		value_length = end - start
-		value_bytes = value_and_suffix_bytes[:value_length]
-		if value_hash != hashlib.sha256(value_bytes).hexdigest():
-			# If the hashes don't match, read the prefix to concat the full file bytes
-			prefix_bytes = io_bytes.read(db_name, end=start)
-			return None, prefix_bytes + value_and_suffix_bytes
-		value_data = orjson.loads(value_bytes)
-		partial_dict = PartialDict(None, key, value_data, start, end, value_and_suffix_bytes[value_length:])
-
-	return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None
+def try_get_partial_file_handle_by_index(
+    indexer: indexing.Indexer, db_name: str, key: str
+) -> Tuple[PartialFileHandle | None, bytes | None]:
+    """
+    Try to get a partial file handle by using the key entry in the index file.
+
+    If the data could be read from the index file, a tuple of the partial file
+    handle and None is returned.
+    If the data could not be read from the index file, a tuple of None and the file
+    bytes is returned, so that the file bytes can be searched for the key.
+    """
+
+    if (index := indexer.get(key)) is None:
+        return None, io_bytes.read(db_name)
+    start, end, indent_level, indent_with, value_hash = index
+
+    # If compression is enabled, all data has to be read from the file
+    if config.use_compression:
+        all_file_bytes = io_bytes.read(db_name)
+        value_bytes = all_file_bytes[start:end]
+        if value_hash != hashlib.sha256(value_bytes).hexdigest():
+            return None, all_file_bytes
+        value_data = orjson.loads(value_bytes)
+        partial_dict = PartialDict(
+            all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:]
+        )

+    # If compression is disabled, only the value and suffix have to be read
+    else:
+        value_and_suffix_bytes = io_bytes.read(db_name, start=start)
+        value_length = end - start
+        value_bytes = value_and_suffix_bytes[:value_length]
+        if value_hash != hashlib.sha256(value_bytes).hexdigest():
+            # If the hashes don't match, read the prefix to concat the full file bytes
+            prefix_bytes = io_bytes.read(db_name, end=start)
+            return None, prefix_bytes + value_and_suffix_bytes
+        value_data = orjson.loads(value_bytes)
+        partial_dict = PartialDict(
+            None, key, value_data, start, end, value_and_suffix_bytes[value_length:]
+        )
+
+    return (
+        PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer),
+        None,
+    )


 def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
-	"""
-	Partially read a key from a db.
-	The key MUST be unique in the entire db, otherwise the behavior is undefined.
-	This is a lot faster than reading the entire db, because it does not parse
-	the entire file, but only the part part of the : pair.
-
-	If the key is not found, a `KeyError` is raised.
-	"""
-
-	# Search for key in the index file
-	indexer = indexing.Indexer(db_name)
-	partial_handle, all_file_bytes = try_get_parial_file_handle_by_index(indexer, db_name, key)
-	if partial_handle is not None:
-		return partial_handle
-
-	# Not found in index file, search for key in the entire file
-	position = searching.search_key_position_in_db(all_file_bytes, key)
-
-	if not position.found:
-		raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")
-
-	# Key found, now determine the bounding byte indices of the value
-	start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0)
-	end = utils.seek_index_through_value_bytes(all_file_bytes, start)
-
-	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte)
-
-	partial_value = orjson.loads(all_file_bytes[start:end])
-	prefix_bytes = all_file_bytes[:start] if config.use_compression else None
-	partial_dict = PartialDict(prefix_bytes, key, partial_value, start, end, all_file_bytes[end:])
-	return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)
+    """
+    Partially read a key from a db.
+    The key MUST be unique in the entire db, otherwise the behavior is undefined.
+    This is a lot faster than reading the entire db, because it does not parse
+    the entire file, but only the relevant part of the key: value pair.
+
+    If the key is not found, a `KeyError` is raised.
+    """
+
+    # Search for key in the index file
+    indexer = indexing.Indexer(db_name)
+    partial_handle, all_file_bytes = try_get_partial_file_handle_by_index(
+        indexer, db_name, key
+    )
+    if partial_handle is not None:
+        return partial_handle
+
+    # Not found in index file, search for key in the entire file
+    position = searching.search_key_position_in_db(all_file_bytes, key)
+
+    if not position.found:
+        raise KeyError(f'Key "{key}" not found in db "{db_name}"')
+
+    # Key found, now determine the bounding byte indices of the value
+    start = position.end_byte + (
+        1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0
+    )
+    end = utils.seek_index_through_value_bytes(all_file_bytes, start)
+
+    indent_level, indent_with = utils.detect_indentation_in_json_bytes(
+        all_file_bytes, position.start_byte
+    )
+
+    partial_value = orjson.loads(all_file_bytes[start:end])
+    prefix_bytes = all_file_bytes[:start] if config.use_compression else None
+    partial_dict = PartialDict(
+        prefix_bytes, key, partial_value, start, end, all_file_bytes[end:]
+    )
+    return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)


 def partial_write(pf: PartialFileHandle):
-	"""
-	Write a partial file handle to the db.
-	"""
-
-	partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value)
-
-	# Add indentation
-	if pf.indent_level > 0 and pf.indent_with:
-		replace_this = "\n".encode()
-		replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode()
-		partial_bytes = partial_bytes.replace(replace_this, replace_with)
-
-	# Write key info to index file
-	pf.indexer.write(
-		key=pf.partial_dict.key,
-		start_index=pf.partial_dict.value_start,
-		end_index=pf.partial_dict.value_start + len(partial_bytes),
-		indent_level=pf.indent_level,
-		indent_with=pf.indent_with,
-		value_hash=hashlib.sha256(partial_bytes).hexdigest(),
-		old_value_end=pf.partial_dict.value_end,
-	)
-
-	if pf.partial_dict.prefix is None:
-		# Prefix could not be determined due to compression, so write the entire file
-		io_bytes.write(pf.db_name, partial_bytes + pf.partial_dict.suffix, start=pf.partial_dict.value_start)
-	else:
-		# Prefix was determined, so only write the changed part and the suffix
-		io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix)
+    """
+    Write a partial file handle to the db.
+    """
+
+    partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value)
+
+    # Add indentation
+    if pf.indent_level > 0 and pf.indent_with:
+        replace_this = "\n".encode()
+        replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode()
+        partial_bytes = partial_bytes.replace(replace_this, replace_with)
+
+    # Write key info to index file
+    index = Index(
+        key=pf.partial_dict.key,
+        key_start=pf.partial_dict.value_start,  # the value's byte range, despite the field name
+        key_end=pf.partial_dict.value_start + len(partial_bytes),
+        indent_level=pf.indent_level,
+        indent_with=pf.indent_with,
+        value_hash=hashlib.sha256(partial_bytes).hexdigest(),
+        old_value_end=pf.partial_dict.value_end,
+    )
+    pf.indexer.write(index)
+
+    if pf.partial_dict.prefix is None:
+        # Prefix could not be determined due to compression, so write the entire file
+        io_bytes.write(
+            pf.db_name,
+            partial_bytes + pf.partial_dict.suffix,
+            start=pf.partial_dict.value_start,
+        )
+    else:
+        # Prefix was determined, so only write the changed part and the suffix
+        io_bytes.write(
+            pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix
+        )
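
Notes on the change, with minimal sketches. First, the patch replaces the seven positional parameters of Indexer.write() with a single frozen Index. A frozen dataclass rejects attribute assignment after construction, which suits a record that is persisted verbatim. The field values below are made up; the import assumes the patched package is installed:

import dataclasses

from dictdatabase.dataclasses import Index

index = Index(
    key="users",
    key_start=10,         # start of the value's byte range (see create_index)
    key_end=42,           # end of the value's byte range
    indent_level=1,
    indent_with="\t",
    value_hash="0" * 64,  # SHA-256 hex digest of the value bytes
    old_value_end=42,     # equals key_end when the value did not move
)

try:
    index.key_end = 50
except dataclasses.FrozenInstanceError:
    pass  # frozen=True makes the record immutable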
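The on-disk index is itself JSON, keyed by database keys, holding the 5-element lists described in the Indexer docstring. Per __init__, a database file users/profiles.json would get its index at <storage_directory>/.ddb/users___profiles.json.index. A sketch of what Indexer.data might hold (offsets and hashes are illustrative and truncated):

import orjson

index_data = {
    #         [start, end, indent_level, indent_with, value_hash]
    "alice": [12, 87, 1, "\t", "3a7f..."],
    "bob": [95, 160, 1, "\t", "c01d..."],
}
print(orjson.dumps(index_data).decode())  # what gets written to the .index file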
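Indexer.write() keeps the other entries consistent when a value changes size: every entry that starts after the old end of the rewritten value is shifted by the size delta. The arithmetic in isolation, as a sketch over plain [start, end] lists mirroring what the index file stores (shift_entries is a hypothetical helper name):

def shift_entries(entries: dict, old_value_end: int, new_value_end: int) -> None:
    # Positive delta: the rewritten value grew; negative: it shrank.
    delta = new_value_end - old_value_end
    for entry in entries.values():
        # Only entries located after the rewritten value move.
        if entry[0] > old_value_end:
            entry[0] += delta
            entry[1] += delta


entries = {"a": [0, 10], "b": [20, 30]}
shift_entries(entries, old_value_end=10, new_value_end=14)  # "a" grew by 4 bytes
assert entries["b"] == [24, 34]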
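try_read_bytes_using_indexer() only trusts an index entry if the stored SHA-256 still matches the bytes on disk; on a mismatch, the callers fall back to scanning the whole file and rebuilding the entry. The check reduces to the following sketch, with an in-memory buffer standing in for io_bytes.read and a hypothetical helper name:

from __future__ import annotations

import hashlib


def read_if_unchanged(file_bytes: bytes, start: int, end: int, value_hash: str) -> bytes | None:
    partial = file_bytes[start:end]
    # A stale entry (the file was rewritten elsewhere) fails the hash check.
    if hashlib.sha256(partial).hexdigest() != value_hash:
        return None
    return partial


data = b'{"a": 1, "b": 2}'
h = hashlib.sha256(b"1").hexdigest()
assert read_if_unchanged(data, 6, 7, h) == b"1"  # bytes 6:7 hold the value of "a"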
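Finally, partial_write() serializes only the changed value, so the resulting JSON is indented relative to the top level; the replace() on newlines re-nests it at the key's original depth before it is spliced between prefix and suffix. A worked example of that step (values illustrative):

value_bytes = b'{\n  "x": 1\n}'      # value serialized at top level
indent_level, indent_with = 2, "  "  # as detected from the surrounding file

replace_this = b"\n"
replace_with = b"\n" + (indent_level * indent_with).encode()
reindented = value_bytes.replace(replace_this, replace_with)

assert reindented == b'{\n      "x": 1\n    }'  # now nested two levels deep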