Skip to content

Commit

Permalink
Merge pull request #10 from INM-6/remove_key
Browse files Browse the repository at this point in the history
Removed usage of key, now only generate checksum
  • Loading branch information
JoseJVS authored Sep 3, 2024
2 parents 7c70a05 + 851b601 commit e8b15e1
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 48 deletions.
3 changes: 0 additions & 3 deletions src/metadata_archivist/archivist.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
# "add_description": control boolean to add schema description attributes to resulting metadata. Default False .
# "add_type": control boolean to add schema type attributes to resulting metadata. Default False .
# "output_format": string value of metadata file output format. Default "JSON" .
# "encoding_key": key used for secure pickle encoding, if None is provided key is autogenerated.
# Path to key file can be provided. Default None .
DEFAULT_CONFIG = {
"extraction_directory": ".",
"output_directory": ".",
Expand All @@ -47,7 +45,6 @@
"add_description": False,
"add_type": False,
"output_format": "JSON",
"encoding_key": None,
}


Expand Down
35 changes: 1 addition & 34 deletions src/metadata_archivist/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class Formatter:
add_parser: method to add Parser to list, updates internal schema file.
update_parser: method to update Parser in list, updates internal schema file.
remove_parser: method to remove Parser from list, updates internal schema file.
get_encoding_key: method to generate encoding key for encoded cache saving.
get_parser: method to retrieve Parser from list, uses Parser name for matching.
parse_files: method to trigger parsing procedures on a given list of input files.
compile_metadata: method to trigger structuring of parsing results.
Expand Down Expand Up @@ -112,17 +111,12 @@ def __init__(
# For parser result caching
self._cache = helpers.FormatterCache()

# For securing cached pickles
self._encoding_key = None

# Public
self.config = config
self.metadata = {}

self.combine = lambda formatter2, schema=None: _combine(formatter1=self, formatter2=formatter2, schema=schema)

self.get_encoding_key()

if parsers is not None:
if isinstance(parsers, AParser):
self.add_parser(parsers)
Expand Down Expand Up @@ -181,32 +175,6 @@ def schema(self, schema: dict) -> None:
for ex in self._parsers:
self._extend_json_schema(ex)

def get_encoding_key(self) -> bytes:
    """
    Method to get or generate the encoding key from the self contained configuration.
    The key is resolved only once and stored in self._encoding_key,
    subsequent calls return the stored value without reading the configuration again.

    The "encoding_key" configuration entry is interpreted as follows:
        missing or None: key is the SHA3-256 digest of the pickled configuration dictionary.
        bytes: value is used directly as key.
        str or Path: value is used as path to a binary file, whose whole content becomes the key.

    Returns:
        encoding key as bytes object.

    Raises:
        ValueError: when the configuration entry is of any other type.
    """

    # Key already resolved by a previous call
    if self._encoding_key is not None:
        return self._encoding_key

    # A configuration without the entry is treated the same as an explicit None,
    # i.e. "no key provided", instead of failing with a KeyError.
    encoding_key = self.config.get("encoding_key")

    if encoding_key is None:
        # NOTE: this key can be recomputed by anyone knowing the configuration,
        # it only ties the cache to the configuration, it is not a secret.
        self._encoding_key = sha3_256(p_dumps(self.config, protocol=HIGHEST_PROTOCOL)).digest()

    elif isinstance(encoding_key, bytes):
        self._encoding_key = encoding_key

    elif isinstance(encoding_key, (str, Path)):
        self._encoding_key = Path(encoding_key).read_bytes()

    else:
        LOG.debug("config encoding key value '%s'", str(encoding_key))
        raise ValueError("No appropriate encoding key could be generated.")

    return self._encoding_key

def export_schema(self) -> dict:
"""
Removes interpretation directives from schema, such that result respects JSONSchema standard.
Expand Down Expand Up @@ -406,7 +374,6 @@ def parse_files(
entry = self._cache[pid].add(explored_path, file_path)
entry.save_metadata(
metadata,
self._encoding_key,
overwrite=self.config.get("overwrite", True),
)
meta_files.append(entry.meta_path)
Expand Down Expand Up @@ -541,7 +508,7 @@ def compile_metadata(self) -> dict:
for cache_entry in parser_cache:
update_dict_with_parts(
self.metadata,
cache_entry.load_metadata(self._encoding_key),
cache_entry.load_metadata(),
list(cache_entry.rel_path.parts),
)
LOG.info("Done!")
Expand Down
2 changes: 1 addition & 1 deletion src/metadata_archivist/formatting_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _format_parser_id_rule(
parsed_metadata = []

# Lazy loading handling
metadata = cache_entry.load_metadata(formatter.get_encoding_key())
metadata = cache_entry.load_metadata()

# Compute additional directives if given
if parsing_context is not None and "keys" in parsing_context:
Expand Down
15 changes: 5 additions & 10 deletions src/metadata_archivist/helper_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from json import dumps
from pathlib import Path
from hashlib import sha3_256
from hmac import new, compare_digest
from typing import Optional, Dict, Union, Any
from collections.abc import Iterator, ItemsView
from pickle import loads as p_loads, dumps as p_dumps, HIGHEST_PROTOCOL
Expand Down Expand Up @@ -69,15 +68,12 @@ def __init__(self, explored_path: Path, file_path: Path, metadata: Optional[dict
self.meta_path = Path(str(file_path) + ".meta.pkl")
self._digest = None

def load_metadata(self, key: bytes) -> dict:
def load_metadata(self) -> dict:
"""
Loads cached metadata.
If no cache exists this implies that lazy loading is enabled,
metadata is then loaded from the self-generated meta path.
Arguments:
key: bytes key used to secure pickled file.
Returns:
self contained parsed metadata dictionary.
"""
Expand All @@ -90,8 +86,8 @@ def load_metadata(self, key: bytes) -> dict:

with self.meta_path.open("rb", encoding=None) as f:
bytes_read = f.read()
new_digest = new(key, bytes_read, sha3_256).hexdigest()
if compare_digest(self._digest, new_digest):
new_digest = sha3_256(bytes_read).hexdigest()
if new_digest == self._digest:
self.metadata = p_loads(bytes_read)
else:
raise ValueError("Encoded pickle has been tampered with.")
Expand All @@ -103,13 +99,12 @@ def load_metadata(self, key: bytes) -> dict:

return self.metadata

def save_metadata(self, metadata: dict, key: bytes, overwrite: bool = True) -> None:
def save_metadata(self, metadata: dict, overwrite: bool = True) -> None:
"""
Saves metadata to file and releases object from memory.
Arguments:
metadata: dictionary to save.
key: bytes key used to secure pickled file.
overwrite: control boolean to enable overwriting of lazy load cache files.
"""

Expand All @@ -124,7 +119,7 @@ def save_metadata(self, metadata: dict, key: bytes, overwrite: bool = True) -> N
raise FileExistsError("Unable to save parsed metadata; overwriting not allowed.")

pickle_dump = p_dumps(metadata, protocol=HIGHEST_PROTOCOL)
self._digest = new(key, pickle_dump, sha3_256).hexdigest()
self._digest = sha3_256(pickle_dump).hexdigest()

with self.meta_path.open("wb", encoding=None) as f:
f.write(pickle_dump)
Expand Down

0 comments on commit e8b15e1

Please sign in to comment.