core/modules: switch to normalise from do_cleanup, remove uncompress boilerplate
karlicoss committed Oct 15, 2023
1 parent d7b3f53 commit bc80490
Showing 4 changed files with 17 additions and 33 deletions.
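For orientation: all four modules get the same treatment. The old do_cleanup took the original path plus an explicit working directory (wdir) and had to unwrap compressed inputs itself via self.unpacked; the new normalise receives an already-uncompressed path and writes its temporaries to the framework-provided self.tmp_dir. A minimal sketch of the two shapes, where the Normalised alias and the tmp_dir lifecycle are assumptions inferred from the diff rather than code copied from the repo:

from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

Normalised = Path  # assumption: alias for the cleaned-up output path

class SketchNormaliser:
    tmp_dir: Path  # assumption: created and torn down by the framework per run

    # old interface: caller supplies wdir, subclass handles unpacking
    @contextmanager
    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
        yield path

    # new interface: input arrives uncompressed, temporaries go to tmp_dir
    @contextmanager
    def normalise(self, *, path: Path) -> Iterator[Normalised]:
        yield path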
12 changes: 5 additions & 7 deletions src/bleanser/core/modules/extract.py
@@ -3,7 +3,7 @@
 from typing import Iterator, Any


-from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file
+from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file, Normalised


 class ExtractObjectsNormaliser(BaseNormaliser):
@@ -42,13 +42,11 @@ def _emit_history(self, upath: Path, cleaned) -> None:
             f.write("\n")

     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        with self.unpacked(path, wdir=wdir) as upath:
-            cleaned = unique_file_in_tempdir(input_filepath=path, wdir=wdir, suffix=path.suffix)
-            del path
+    def normalise(self, *, path: Path) -> Iterator[Normalised]:
+        cleaned = unique_file_in_tempdir(input_filepath=path, wdir=self.tmp_dir, suffix=path.suffix)

-            self._emit_history(upath, cleaned)
-            sort_file(cleaned)
+        self._emit_history(path, cleaned)
+        sort_file(cleaned)

         yield cleaned
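The rewritten body above is the whole pattern for this module: dump one extracted object per line into a temp file, then sort it so that later comparison between snapshots is order-insensitive. A standalone sketch of that emit-then-sort idea, where extract_objects is a hypothetical stand-in for whatever per-record hook subclasses implement:

import json
import tempfile
from pathlib import Path
from typing import Any, Iterator

def extract_objects(path: Path) -> Iterator[Any]:
    # hypothetical hook: yield one comparable object per record
    yield from json.loads(path.read_text())

def emit_sorted(path: Path) -> Path:
    cleaned = Path(tempfile.mkdtemp()) / (path.name + '.cleaned')
    with cleaned.open('w') as f:
        for obj in extract_objects(path):
            f.write(json.dumps(obj, sort_keys=True))
            f.write('\n')
    # sort lines so the output is stable regardless of input order
    cleaned.write_text(''.join(sorted(cleaned.read_text().splitlines(keepends=True))))
    return cleaned

if __name__ == '__main__':
    src = Path(tempfile.mkdtemp()) / 'input.json'
    src.write_text('[{"b": 2}, {"a": 1}]')
    print(emit_sorted(src).read_text())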
12 changes: 4 additions & 8 deletions src/bleanser/core/modules/json.py
@@ -5,7 +5,7 @@
 from typing import Iterator


-from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file
+from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file, Normalised
 from bleanser.core.utils import Json, delkeys, patch_atoms  # for convenience...
 from bleanser.core.utils import mime
@@ -23,11 +23,7 @@ def cleanup(self, j: Json) -> Json:
         return j

     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        with self.unpacked(path=path, wdir=wdir) as upath:
-            pass
-        del path  # just to prevent from using by accident
-
+    def normalise(self, *, path: Path) -> Iterator[Normalised]:
         # TODO maybe, later implement some sort of class variable instead of hardcoding
         # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process
         # otherwise it essentially blocks waiting for all mimes to compute..
@@ -38,11 +34,11 @@ def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
         #     'application/json',
         # }, mp

-        j = orjson.loads(upath.read_text())
+        j = orjson.loads(path.read_text())
         j = self.cleanup(j)

         # create a tempfile to write flattened data to
-        cleaned = unique_file_in_tempdir(input_filepath=upath, wdir=wdir, suffix='.json')
+        cleaned = unique_file_in_tempdir(input_filepath=path, wdir=self.tmp_dir, suffix='.json')

         with cleaned.open('w') as fo:
             if isinstance(j, list):
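Note that subclasses of the JSON normaliser only override cleanup(j); the utils imports (delkeys, patch_atoms) are kept around precisely for those hooks. A self-contained stand-in showing the intended style of use; the real delkeys lives in bleanser.core.utils and may differ in signature or behaviour:

from typing import Any, Set

def delkeys(j: Any, *, keys: Set[str]) -> None:
    # minimal stand-in: recursively drop the named keys in place
    if isinstance(j, dict):
        for k in keys:
            j.pop(k, None)
        for v in j.values():
            delkeys(v, keys=keys)
    elif isinstance(j, list):
        for v in j:
            delkeys(v, keys=keys)

data = {'id': 1, 'etag': 'abc', 'items': [{'x': 2, 'etag': 'def'}]}
delkeys(data, keys={'etag'})  # 'etag' is an illustrative volatile key
assert data == {'id': 1, 'items': [{'x': 2}]}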
12 changes: 4 additions & 8 deletions src/bleanser/core/modules/sqlite.py
@@ -8,14 +8,13 @@
 import shutil
 import sqlite3
 from sqlite3 import Connection
-from subprocess import check_call
-from typing import Dict, Any, Iterator, Sequence, ContextManager, Set, Tuple, ClassVar, Optional
+from typing import Dict, Any, Iterator, Sequence, Set, Tuple, Optional


 from ..common import parametrize
 from ..common import Keep, Prune
 from ..utils import mime
-from ..processor import compute_groups, compute_instructions, BaseNormaliser, unique_file_in_tempdir, sort_file
+from ..processor import compute_groups, compute_instructions, BaseNormaliser, unique_file_in_tempdir, sort_file, Normalised


 from plumbum import local  # type: ignore
@@ -225,8 +224,7 @@ def checked(cls, db: Path) -> Path:
     # need to decide where to log them...

     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        # NOTE: path here is the _original_ path passed to bleanser, so we can't modify in place
+    def normalise(self, *, path: Path) -> Iterator[Normalised]:
         # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process
         # otherwise it essentially blocks waiting for all mimes to compute..
         mp = mime(path)
@@ -237,8 +235,6 @@ def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
         ##

         # TODO handle compressed databases later... need to think how to work around checking for no wal etc..
-        # with self.unpacked(path=path, wdir=wdir) as upath:
-        #     pass
         upath = path
         del path  # just to prevent from using by accident

@@ -248,7 +244,7 @@ def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:

         assert upath.is_absolute(), f'{upath} is not an absolute path'

-        cleaned_db = unique_file_in_tempdir(input_filepath=upath, wdir=wdir, suffix='.db')
+        cleaned_db = unique_file_in_tempdir(input_filepath=upath, wdir=self.tmp_dir, suffix='.db')
         unique_tmp_dir = cleaned_db.parent

         from bleanser.core.ext.sqlite_dumben import run as dumben
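The comment about keeping the mime check inside the method is the subtle part of this file: work done inside normalise runs in a per-file worker process, while anything computed before dispatch runs serially in the parent and stalls the pipeline. A toy illustration of that trade-off; the names and paths are illustrative, not from the repo:

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

def mime(path: Path) -> str:
    # stand-in for bleanser.core.utils.mime; pretend this is expensive
    return 'application/x-sqlite3'

def normalise_one(path: Path) -> str:
    # checking mime here means the cost is paid inside the worker, in
    # parallel, instead of serially before any work is dispatched
    mp = mime(path)
    assert mp in {'application/x-sqlite3'}, mp
    return f'{path}: ok'

if __name__ == '__main__':
    paths = [Path(f'/tmp/db_{i}.sqlite') for i in range(4)]  # hypothetical inputs
    with ProcessPoolExecutor() as pool:
        for res in pool.map(normalise_one, paths):
            print(res)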
14 changes: 4 additions & 10 deletions src/bleanser/core/modules/xml.py
@@ -6,7 +6,7 @@
 from typing import Iterator


-from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file
+from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir, sort_file, Normalised


 class Normaliser(BaseNormaliser):
@@ -16,17 +16,11 @@ def cleanup(self, t: etree._Element) -> etree._Element:
         return t

     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        assert path.stat().st_size > 0, path  # just in case
-
-        with self.unpacked(path=path, wdir=wdir) as upath:
-            pass
-        del path  # just to prevent from using by accident
-
+    def normalise(self, *, path: Path) -> Iterator[Normalised]:
         # todo not sure if need to release some resources here...
         parser = etree.XMLParser(remove_blank_text=True)
         # TODO we seem to lose comments here... meh
-        et = etree.fromstring(upath.read_bytes(), parser=parser)
+        et = etree.fromstring(path.read_bytes(), parser=parser)
         # restore newlines just for the top level
         assert et.text is None, et.text
         et.text = '\n'
@@ -36,7 +30,7 @@ def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:

         et = self.cleanup(et)

-        cleaned = unique_file_in_tempdir(input_filepath=upath, wdir=wdir, suffix='.xml')
+        cleaned = unique_file_in_tempdir(input_filepath=path, wdir=self.tmp_dir, suffix='.xml')
         cleaned.write_text(etree.tounicode(et))

         # TODO what is the assumption about shape?
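As in the JSON module, concrete XML modules customise behaviour by overriding cleanup over the parsed tree before it is serialised back out. A self-contained sketch of the kind of pruning such a hook might do; the element and attribute names here are made up:

from lxml import etree

def cleanup(t: etree._Element) -> etree._Element:
    # drop volatile bits so consecutive snapshots diff cleanly;
    # 'last_seen' and 'debug' are hypothetical names
    for el in t.iter():
        el.attrib.pop('last_seen', None)
    for junk in t.findall('.//debug'):
        junk.getparent().remove(junk)
    return t

root = etree.fromstring('<r last_seen="1"><a/><debug/></r>')
print(etree.tounicode(cleanup(root)))  # prints <r><a/></r>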
