From d9eca0446095449d896c3eb62daef4475dba5583 Mon Sep 17 00:00:00 2001 From: Robin Vobruba Date: Thu, 12 Dec 2024 07:25:17 +0100 Subject: [PATCH] Improves and externalizes OKH v1 YAML sanitization [fix] It is now done by a BASH script from the OKH-Tool. --- krawl/fetcher/util.py | 50 +++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/krawl/fetcher/util.py b/krawl/fetcher/util.py index 8faa4c0..7264e54 100644 --- a/krawl/fetcher/util.py +++ b/krawl/fetcher/util.py @@ -2,6 +2,7 @@ import os import re +import sys from pathlib import Path import subprocess import tempfile @@ -30,18 +31,36 @@ def is_binary(content: str | bytes) -> bool: return "\0" in content return b"\0" in content -def _recuperate_invalid_yaml_manifest(manifest_contents: bytes) -> bytes | None: +def _recuperate_invalid_yaml_manifest(manifest_contents: bytes) -> bytes: """Cleans up OKH v1 (YAMl) manifest content. Many manifests out there use bad syntax or invalid values, which we try to undo as much as possible in here.""" - mfst = b'' - null_pat = re.compile(r'^\s+-\s+null\s*$') - for line in manifest_contents.split(b'\n'): - line_str = str(line, "utf-8") - if not null_pat.match(line_str): - mfst = mfst + line + b'\n' - return mfst + (_, fn_v1) = tempfile.mkstemp() + # Target file should not yet exist when converting + os.remove(fn_v1) + + with open(fn_v1, "wb") as binary_file: + binary_file.write(manifest_contents) + + sanitize_okh_v1_yaml(fn_v1) + + with open(fn_v1, "rb") as binary_file: + manifest_contents = binary_file.read() + os.remove(fn_v1) + return manifest_contents + +def sanitize_okh_v1_yaml(manifest_file: Path): + """Sanitizes an OKH v1 (YAMl) manifest, + using the external software 'sanitize-v1-yaml' + from the okh-tool repo.""" + + conv_cmd = ['sanitize-v1-yaml', '--in-place', manifest_file] + # res = subprocess.run(conv_cmd) + try: + subprocess.check_output(conv_cmd, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as err: + raise ConversionError(f"Failed to sanitize OKH v1 manifest, exitcode: {err.returncode}, stderr: {err.stderr.decode(sys.getfilesystemencoding())}, stdout: {err.output.decode(sys.getfilesystemencoding())}", []) def convert_okh_v1_to_losh(manifest_contents: bytes) -> bytes | None: """Converts OKH v1 (YAMl) manifest contents to OKH LOSH (TOML) manifest contents, @@ -57,16 +76,15 @@ def convert_okh_v1_to_losh(manifest_contents: bytes) -> bytes | None: with open(fn_v1, "wb") as binary_file: binary_file.write(manifest_contents) - res = subprocess.run(['okh-tool', 'conv', fn_v1, fn_losh]) + conv_cmd = ['okh-tool', 'conv', fn_v1, fn_losh] + try: + subprocess.check_output(conv_cmd, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as err: + raise ConversionError(f"Failed to convert OKH v1 manifest to OKH LOSH, exitcode: {err.returncode}, stderr: {err.stderr.decode(sys.getfilesystemencoding())}, stdout: {err.output.decode(sys.getfilesystemencoding())}", []) # res.check_returncode() - ok = res.returncode == 0 - if ok: - with open(fn_losh, "rb") as binary_file: - manifest_contents = binary_file.read() - else: - raise ConversionError("Failed to convert OKH v1 manifest to OKH LOSH", []) - manifest_contents = None + with open(fn_losh, "rb") as binary_file: + manifest_contents = binary_file.read() if os.path.exists(fn_v1): os.remove(fn_v1)