Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removed usage of key, now only generate checksum #10

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions src/metadata_archivist/archivist.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
# "add_description": control boolean to add schema description attributes to resulting metadata. Default False .
# "add_type": control boolean to add schema type attributes to resulting metadata. Default False .
# "output_format": string value of metadata file output format. Default "JSON" .
# "encoding_key": key used for secure pickle encoding, if None is provided key is autogenerated.
# Path to key file can be provided. Default None .
DEFAULT_CONFIG = {
"extraction_directory": ".",
"output_directory": ".",
Expand All @@ -47,7 +45,6 @@
"add_description": False,
"add_type": False,
"output_format": "JSON",
"encoding_key": None,
}


Expand Down
35 changes: 1 addition & 34 deletions src/metadata_archivist/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class Formatter:
add_parser: method to add Parser to list, updates internal schema file.
update_parser: method to update Parser in list, updates internal schema file.
remove_parser: method to remove Parser from list, updates internal schema file.
get_encoding_key: method to generate encoding key for encoded cache saving.
get_parser: method to retrieve Parser from list, uses Parser name for matching.
parse_files: method to trigger parsing procedures on a given list of input files.
compile_metadata: method to trigger structuring of parsing results.
Expand Down Expand Up @@ -112,17 +111,12 @@ def __init__(
# For parser result caching
self._cache = helpers.FormatterCache()

# For securing cached pickles
self._encoding_key = None

# Public
self.config = config
self.metadata = {}

self.combine = lambda formatter2, schema=None: _combine(formatter1=self, formatter2=formatter2, schema=schema)

self.get_encoding_key()

if parsers is not None:
if isinstance(parsers, AParser):
self.add_parser(parsers)
Expand Down Expand Up @@ -181,32 +175,6 @@ def schema(self, schema: dict) -> None:
for ex in self._parsers:
self._extend_json_schema(ex)

def get_encoding_key(self) -> bytes:
"""
Method to get or generate encoding key from self contained configuration.
If no encoding key is provided as a bytes object or as a path to a binary file,
the encoding key is generated from the SHA3-256 digest of the pickled configuration.
"""

if self._encoding_key is None:

encoding_key = self.config["encoding_key"]
if encoding_key is None:
self._encoding_key = sha3_256(p_dumps(self.config, protocol=HIGHEST_PROTOCOL)).digest()

elif isinstance(encoding_key, bytes):
self._encoding_key = encoding_key

elif isinstance(encoding_key, (str, Path)):
with Path(encoding_key).open("rb", encoding=None) as f:
self._encoding_key = f.read()

else:
LOG.debug("config encoding key value '%s'", str(encoding_key))
raise ValueError("No appropriate encoding key could be generated.")

return self._encoding_key

def export_schema(self) -> dict:
"""
Removes interpretation directives from schema, such that result respects JSONSchema standard.
Expand Down Expand Up @@ -406,7 +374,6 @@ def parse_files(
entry = self._cache[pid].add(explored_path, file_path)
entry.save_metadata(
metadata,
self._encoding_key,
overwrite=self.config.get("overwrite", True),
)
meta_files.append(entry.meta_path)
Expand Down Expand Up @@ -541,7 +508,7 @@ def compile_metadata(self) -> dict:
for cache_entry in parser_cache:
update_dict_with_parts(
self.metadata,
cache_entry.load_metadata(self._encoding_key),
cache_entry.load_metadata(),
list(cache_entry.rel_path.parts),
)
LOG.info("Done!")
Expand Down
2 changes: 1 addition & 1 deletion src/metadata_archivist/formatting_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _format_parser_id_rule(
parsed_metadata = []

# Lazy loading handling
metadata = cache_entry.load_metadata(formatter.get_encoding_key())
metadata = cache_entry.load_metadata()

# Compute additional directives if given
if parsing_context is not None and "keys" in parsing_context:
Expand Down
15 changes: 5 additions & 10 deletions src/metadata_archivist/helper_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from json import dumps
from pathlib import Path
from hashlib import sha3_256
from hmac import new, compare_digest
from typing import Optional, Dict, Union, Any
from collections.abc import Iterator, ItemsView
from pickle import loads as p_loads, dumps as p_dumps, HIGHEST_PROTOCOL
Expand Down Expand Up @@ -69,15 +68,12 @@ def __init__(self, explored_path: Path, file_path: Path, metadata: Optional[dict
self.meta_path = Path(str(file_path) + ".meta.pkl")
self._digest = None

def load_metadata(self, key: bytes) -> dict:
def load_metadata(self) -> dict:
"""
Loads cached metadata.
If no cache exists, this implies that lazy loading is enabled;
metadata is then loaded from the self-generated meta path.

Arguments:
key: bytes key used to secure pickled file.

Returns:
self contained parsed metadata dictionary.
"""
Expand All @@ -90,8 +86,8 @@ def load_metadata(self, key: bytes) -> dict:

with self.meta_path.open("rb", encoding=None) as f:
bytes_read = f.read()
new_digest = new(key, bytes_read, sha3_256).hexdigest()
if compare_digest(self._digest, new_digest):
new_digest = sha3_256(bytes_read).hexdigest()
if new_digest == self._digest:
self.metadata = p_loads(bytes_read)
else:
raise ValueError("Encoded pickle has been tampered with.")
Expand All @@ -103,13 +99,12 @@ def load_metadata(self, key: bytes) -> dict:

return self.metadata

def save_metadata(self, metadata: dict, key: bytes, overwrite: bool = True) -> None:
def save_metadata(self, metadata: dict, overwrite: bool = True) -> None:
"""
Saves metadata to file and releases object from memory.

Arguments:
metadata: dictionary to save.
key: bytes key used to secure pickled file.
overwrite: control boolean to enable overwriting of lazy load cache files.
"""

Expand All @@ -124,7 +119,7 @@ def save_metadata(self, metadata: dict, key: bytes, overwrite: bool = True) -> N
raise FileExistsError("Unable to save parsed metadata; overwriting not allowed.")

pickle_dump = p_dumps(metadata, protocol=HIGHEST_PROTOCOL)
self._digest = new(key, pickle_dump, sha3_256).hexdigest()
self._digest = sha3_256(pickle_dump).hexdigest()

with self.meta_path.open("wb", encoding=None) as f:
f.write(pickle_dump)
Expand Down
Loading