core: some refactoring and cleanup for bleanser interface
- rename do_cleanup to normalise (backwards compatible)
- no need to pass around wdir anymore; it's available via the self.tmp_dir attribute
- do_normalise now handles boilerplate like decompression, so it's no longer necessary to do it manually in child classes
- more consistent temporary directory handling everywhere
karlicoss committed Oct 15, 2023
1 parent 4d7d84e commit d7b3f53
Showing 8 changed files with 278 additions and 257 deletions.
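The gist of the interface change, distilled from the diffs below into a sketch (the output naming and cleanup body here are illustrative, not verbatim from the commit): modules used to implement `do_cleanup` and thread a working directory through every call; now they implement `normalise`, pick up the temporary directory from `self.tmp_dir`, and let `do_normalise` handle decompression.

```python
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

from bleanser.core.processor import BaseNormaliser

class MyNormaliser(BaseNormaliser):
    # old shape: def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path],
    # with decompression done manually via self.unpacked(path=path, wdir=wdir)
    @contextmanager
    def normalise(self, *, path: Path) -> Iterator[Path]:
        # new shape: 'path' arrives already decompressed (do_normalise handles that),
        # and the working directory is available as self.tmp_dir
        out = self.tmp_dir / (path.name + '.normalised')  # hypothetical output name
        out.write_text(path.read_text())  # per-module cleanup code goes here
        yield out
```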
25 changes: 12 additions & 13 deletions README.md
@@ -51,19 +51,18 @@ from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir
 class Normaliser(BaseNormaliser):
 
     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        # temporarily decompress if the data is stored as compressed on disk
-        with self.unpacked(path=path, wdir=wdir) as upath:
+    def normalise(self, *, path: Path) -> Iterator[Path]:
+        # if the input file was compressed, the "path" you receive here will be decompressed
 
-            # a temporary file we write 'clean' data to, that can be easily diffed/compared
-            cleaned = unique_file_in_tempdir(input_filepath=upath, wdir=wdir)
+        # a temporary file we write 'normalised' data to, that can be easily diffed/compared
+        normalised = unique_file_in_tempdir(input_filepath=path, wdir=self.tmp_dir)
 
-            # some custom code here per-module that writes to 'cleaned'
+        # some custom code here per-module that writes to 'normalised'
 
-            yield cleaned
+        yield normalised
 
 
-# this script could be run directly, or if its installed in a module like
+# this script should be run as a module like
 # python3 -m bleanser.modules.smscalls --glob ...
 if __name__ == "__main__":
     Normaliser.main()
@@ -73,11 +72,11 @@ This is **always** acting on the data loaded into memory/temporary files, it is
 
 There are particular normalisers for different filetypes, e.g. [`json`](./src/bleanser/core/modules/json.py), [`xml`](./src/bleanser/core/modules/xml_clean.py), [`sqlite`](./src/bleanser/core/modules/sqlite.py) which might work if your data is especially basic, but typically this requires subclassing one of those and writing some custom code to 'cleanup' the data, so it can be properly compared/diffed.
 
-### do_cleanup
+### normalise
 
-There are two ways you can think about `do_cleanup` (creating a 'cleaned'/normalised representation of an input file) -- by specifying an 'upper' or 'lower' bound:
+There are two ways you can think about `normalise` (creating a 'cleaned'/normalised representation of an input file) -- by specifying an 'upper' or 'lower' bound:
 
-- upper: specify which data you want to drop, dumping everything else to `cleaned`
+- upper: specify which data you want to drop, dumping everything else to `normalised`
 - lower: specify which keys/data you want to keep, e.g. only returning a few keys which uniquely identify events in the data
 
 As an example say you had a JSON export:
@@ -178,4 +177,4 @@ if __name__ == "__main__":
     Normaliser.main()
 ```
 
-Otherwise if you have some complex data source you need to handle yourself, you can override `do_cleanup` and `unpacked` (how the data gets uncompressed/pre-processed) methods yourself
+Otherwise if you have some complex data source you need to handle yourself, you can override `do_normalise` and `unpacked` (how the data gets uncompressed/pre-processed) methods yourself
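The README's worked JSON example is collapsed in this view; as a rough illustration of the upper/lower distinction described above (the data and field names here are made up):

```python
# suppose one item from a JSON export looks like this:
item = {
    'id': 123,
    'text': 'some event',
    'fetched_at': '2023-10-15T00:00:00',  # changes on every export
}

# upper bound: drop only the fields known to be volatile, keep everything else
upper = {k: v for k, v in item.items() if k != 'fetched_at'}

# lower bound: keep only the keys that uniquely identify the event
lower = {'id': item['id']}
```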
2 changes: 1 addition & 1 deletion doc/options.md
@@ -1,6 +1,6 @@
 An explanation of the `--multiway`/`--prune-dominated` options, modified from [zulip chat](https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/bleanser/near/258276779)
 
-Say you had a bunch of sqlite databases and mapped them onto text dumps using `do_cleanup`. The idea is to figure out which dumps are redundant.
+Say you had a bunch of sqlite databases and mapped them onto text dumps using `normalise`. The idea is to figure out which dumps are redundant.
 
 Say you've got dumps `C.sql` and `B.sql` -- and you diff them (like literally, [`diff`](https://man7.org/linux/man-pages/man1/diff.1.html))
 
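The rest of that explanation is truncated here, but the gist of `--prune-dominated` can be sketched as follows (a toy illustration, not bleanser's actual implementation): if diffing `B.sql` against `C.sql` produces only additions, then `B.sql` carries no unique data and can be pruned.

```python
import subprocess

# in diff's normal output format, lines starting with '<' come from the first
# file only; if there are none, everything in B.sql is also present in C.sql,
# i.e. B.sql is 'dominated' by C.sql
out = subprocess.run(['diff', 'B.sql', 'C.sql'], capture_output=True, text=True).stdout
dominated = not any(line.startswith('<') for line in out.splitlines())
print('B.sql is redundant' if dominated else 'B.sql has unique data')
```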
5 changes: 0 additions & 5 deletions src/bleanser/core/common.py
@@ -59,11 +59,6 @@ class Keep(Instruction):
     pass
 
 
-class Config(NamedTuple):
-    prune_dominated: bool = False
-    multiway       : bool = False
-
-
 ### helper to define paramertized tests in function's body
 from .utils import under_pytest
 if under_pytest:
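The removed `Config` tuple isn't replaced one-for-one; as the sqlite test changes further down show, its fields now live as class variables on the normaliser subclass, so the configuration travels with the module itself. A minimal sketch:

```python
from bleanser.core.modules.sqlite import SqliteNormaliser

# what used to be Config(prune_dominated=True, multiway=False) is now
# declared directly on the Normaliser subclass
class MyNormaliser(SqliteNormaliser):
    MULTIWAY = False
    PRUNE_DOMINATED = True
```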
26 changes: 12 additions & 14 deletions src/bleanser/core/main.py
@@ -5,7 +5,7 @@
 from typing import Optional, List
 
 from .common import logger, Dry, Move, Remove, Mode
-from .processor import compute_instructions, apply_instructions
+from .processor import compute_instructions, apply_instructions, bleanser_tmp_directory
 
 import click
 
@@ -54,23 +54,21 @@ def diff(path1: str, path2: Path, *, glob: bool, from_: Optional[int], to: Optio
     if difftool is not None:
         os.environ['DIFFTOOL'] = difftool
 
-
     for line in compute_diff(path1_, path2, Normaliser=Normaliser):
         print(line)
 
-# todo ugh, name sucks
-@call_main.command(name='cleaned', short_help='cleanup file and dump to stdout')
+@call_main.command(name='normalised', short_help='normalise file and dump to stdout')
 @click.argument('path', type=Path)
-@click.option('--stdout', is_flag=True)
-def cleaned(path: Path, stdout: bool) -> None:
-    n = Normaliser()
-    from tempfile import TemporaryDirectory
-    with TemporaryDirectory() as td, n.do_cleanup(path, wdir=Path(td)) as cleaned:
-        if stdout:
-            print(cleaned.read_text())
-        else:
-            click.secho(f'You can examine cleaned file: {cleaned}', fg='green')
-            click.pause(info="Press any key when you've finished")
+@click.option('--stdout', is_flag=True, help='print normalised files to stdout instead of printing the path to it')
+def normalised(path: Path, stdout: bool) -> None:
+    with bleanser_tmp_directory() as base_tmp_dir:
+        n = Normaliser(input=path, base_tmp_dir=base_tmp_dir)
+        with n.do_normalise() as cleaned:
+            if stdout:
+                print(cleaned.read_text())
+            else:
+                click.secho(f'You can examine normalised file: {cleaned}', fg='green')
+                click.pause(info="Press any key when you've finished")
 
 
 @call_main.command(name='prune', short_help='process & prune files')
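With this change, dumping a single file's normalised form from the command line would look something like the following (the module name is just the README's example module):

```
python3 -m bleanser.modules.smscalls normalised --stdout /path/to/export.xml
```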
2 changes: 0 additions & 2 deletions src/bleanser/core/modules/json.py
@@ -24,8 +24,6 @@ def cleanup(self, j: Json) -> Json:
 
     @contextmanager
     def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        assert path.stat().st_size > 0, path  # just in case
-
         with self.unpacked(path=path, wdir=wdir) as upath:
             pass
         del path  # just to prevent from using by accident
37 changes: 13 additions & 24 deletions src/bleanser/core/modules/sqlite.py
@@ -12,7 +12,7 @@
 from typing import Dict, Any, Iterator, Sequence, ContextManager, Set, Tuple, ClassVar, Optional
 
 
-from ..common import parametrize, Config
+from ..common import parametrize
 from ..common import Keep, Prune
 from ..utils import mime
 from ..processor import compute_groups, compute_instructions, BaseNormaliser, unique_file_in_tempdir, sort_file
@@ -71,20 +71,12 @@ def _dict2db(d: Dict, *, to: Path) -> Path:
     return to  # just for convenience
 
 
-def _test_aux(path: Path, *, wdir: Path) -> ContextManager[Path]:
-    # TODO this assumes they are already cleaned up?
-    n = SqliteNormaliser()
-    return n.do_cleanup(path=path, wdir=wdir)
-
-
 def test_sqlite_simple(tmp_path: Path) -> None:
-    config = Config(multiway=False, prune_dominated=True)
-    func = lambda paths: compute_groups(
-        paths,
-        cleanup=_test_aux,
-        diff_filter=SqliteNormaliser._DIFF_FILTER,
-        config=config,
-    )
+    class TestNormaliser(SqliteNormaliser):
+        MULTIWAY = False
+        PRUNE_DOMINATED = True
+
+    func = lambda paths: compute_groups(paths, Normaliser=TestNormaliser)
 
     d: Dict[str, Any] = {'tq': [['col1', 'col2']]}
     ### just one file
@@ -185,13 +177,12 @@ def test_sqlite_simple(tmp_path: Path) -> None:
 
 
 @parametrize('multiway', [False, True])
-def test_sqlite_many(multiway: bool, tmp_path: Path) -> None:
-    config = Config(multiway=multiway)
-    N = 2000
+def test_sqlite_many(tmp_path: Path, multiway: bool) -> None:
+    class TestNormaliser(SqliteNormaliser):
+        MULTIWAY = multiway
+        PRUNE_DOMINATED = True
 
-    def ident(path: Path, *, wdir: Path) -> ContextManager[Path]:
-        n = SqliteNormaliser()
-        return n.do_cleanup(path=path, wdir=wdir)
+    N = 2000
 
     paths = []
     d: Dict[str, Any] = {}
@@ -204,9 +195,9 @@ def ident(path: Path, *, wdir: Path) -> ContextManager[Path]:
         paths.append(p)
 
     # shouldn't crash
-    instrs = list(compute_instructions(
+    instructions = list(compute_instructions(
         paths,
-        Normaliser=SqliteNormaliser,
+        Normaliser=TestNormaliser,
         threads=None,
     ))
 
@@ -236,8 +227,6 @@ def checked(cls, db: Path) -> Path:
     @contextmanager
     def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
         # NOTE: path here is the _original_ path passed to bleanser, so we can't modify in place
-        assert path.stat().st_size > 0, path  # just in case
-        # TODO maybe, later implement some sort of class variable instead of hardcoding
         # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process
         # otherwise it essentially blocks waiting for all mimes to compute..
         mp = mime(path)