Skip to content

Commit

Permalink
Improves and externalizes OKH v1 YAML sanitization [fix]
Browse files Browse the repository at this point in the history
It is now done by a BASH script from the OKH-Tool.
  • Loading branch information
hoijui committed Dec 12, 2024
1 parent 15e3ce1 commit d9eca04
Showing 1 changed file with 34 additions and 16 deletions.
50 changes: 34 additions & 16 deletions krawl/fetcher/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import re
import sys
from pathlib import Path
import subprocess
import tempfile
Expand Down Expand Up @@ -30,18 +31,36 @@ def is_binary(content: str | bytes) -> bool:
return "\0" in content
return b"\0" in content

def _recuperate_invalid_yaml_manifest(manifest_contents: bytes) -> bytes | None:
def _recuperate_invalid_yaml_manifest(manifest_contents: bytes) -> bytes:
"""Cleans up OKH v1 (YAMl) manifest content.
Many manifests out there use bad syntax or invalid values,
which we try to undo as much as possible in here."""

mfst = b''
null_pat = re.compile(r'^\s+-\s+null\s*$')
for line in manifest_contents.split(b'\n'):
line_str = str(line, "utf-8")
if not null_pat.match(line_str):
mfst = mfst + line + b'\n'
return mfst
(_, fn_v1) = tempfile.mkstemp()
# Target file should not yet exist when converting
os.remove(fn_v1)

with open(fn_v1, "wb") as binary_file:
binary_file.write(manifest_contents)

sanitize_okh_v1_yaml(fn_v1)

with open(fn_v1, "rb") as binary_file:
manifest_contents = binary_file.read()
os.remove(fn_v1)
return manifest_contents

def sanitize_okh_v1_yaml(manifest_file: Path):
"""Sanitizes an OKH v1 (YAMl) manifest,
using the external software 'sanitize-v1-yaml'
from the okh-tool repo."""

conv_cmd = ['sanitize-v1-yaml', '--in-place', manifest_file]
# res = subprocess.run(conv_cmd)
try:
subprocess.check_output(conv_cmd, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as err:
raise ConversionError(f"Failed to sanitize OKH v1 manifest, exitcode: {err.returncode}, stderr: {err.stderr.decode(sys.getfilesystemencoding())}, stdout: {err.output.decode(sys.getfilesystemencoding())}", [])

def convert_okh_v1_to_losh(manifest_contents: bytes) -> bytes | None:
"""Converts OKH v1 (YAMl) manifest contents to OKH LOSH (TOML) manifest contents,
Expand All @@ -57,16 +76,15 @@ def convert_okh_v1_to_losh(manifest_contents: bytes) -> bytes | None:
with open(fn_v1, "wb") as binary_file:
binary_file.write(manifest_contents)

res = subprocess.run(['okh-tool', 'conv', fn_v1, fn_losh])
conv_cmd = ['okh-tool', 'conv', fn_v1, fn_losh]
try:
subprocess.check_output(conv_cmd, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as err:
raise ConversionError(f"Failed to convert OKH v1 manifest to OKH LOSH, exitcode: {err.returncode}, stderr: {err.stderr.decode(sys.getfilesystemencoding())}, stdout: {err.output.decode(sys.getfilesystemencoding())}", [])

# res.check_returncode()
ok = res.returncode == 0
if ok:
with open(fn_losh, "rb") as binary_file:
manifest_contents = binary_file.read()
else:
raise ConversionError("Failed to convert OKH v1 manifest to OKH LOSH", [])
manifest_contents = None
with open(fn_losh, "rb") as binary_file:
manifest_contents = binary_file.read()

if os.path.exists(fn_v1):
os.remove(fn_v1)
Expand Down

0 comments on commit d9eca04

Please sign in to comment.