core: some refactoring and cleanup for bleanser interface
- rename do_cleanup to normalise (backwards compatible)
- no need to pass around wdir anymore; it's available via the self.tmp_dir attribute
- do_normalise now handles boilerplate like decompression, so it's no longer necessary to do it manually in child classes
- more consistent temporary directory handling everywhere
karlicoss committed Oct 15, 2023
1 parent 4d7d84e commit d7b3f53
Showing 8 changed files with 278 additions and 257 deletions.
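The gist of the interface change, distilled from the diffs below into a sketch (the output naming and cleanup body here are illustrative, not verbatim from the commit): modules used to implement `do_cleanup` and thread a working directory through every call; now they implement `normalise`, pick up the temporary directory from `self.tmp_dir`, and let `do_normalise` handle decompression.

```python
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

from bleanser.core.processor import BaseNormaliser

class MyNormaliser(BaseNormaliser):
    # old shape: def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path],
    # with decompression done manually via self.unpacked(path=path, wdir=wdir)
    @contextmanager
    def normalise(self, *, path: Path) -> Iterator[Path]:
        # new shape: 'path' arrives already decompressed (do_normalise handles that),
        # and the working directory is available as self.tmp_dir
        out = self.tmp_dir / (path.name + '.normalised')  # hypothetical output name
        out.write_text(path.read_text())  # per-module cleanup code goes here
        yield out
```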
25 changes: 12 additions & 13 deletions README.md
@@ -51,19 +51,18 @@ from bleanser.core.processor import BaseNormaliser, unique_file_in_tempdir
 class Normaliser(BaseNormaliser):
 
     @contextmanager
-    def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        # temporarily decompress if the data is stored as compressed on disk
-        with self.unpacked(path=path, wdir=wdir) as upath:
+    def normalise(self, *, path: Path) -> Iterator[Path]:
+        # if the input file was compressed, the "path" you receive here will be decompressed
 
-            # a temporary file we write 'clean' data to, that can be easily diffed/compared
-            cleaned = unique_file_in_tempdir(input_filepath=upath, wdir=wdir)
+        # a temporary file we write 'normalised' data to, that can be easily diffed/compared
+        normalised = unique_file_in_tempdir(input_filepath=path, wdir=self.tmp_dir)
 
-            # some custom code here per-module that writes to 'cleaned'
+        # some custom code here per-module that writes to 'normalised'
 
-            yield cleaned
+        yield normalised
 
 
-# this script could be run directly, or if its installed in a module like
+# this script should be run as a module like
 # python3 -m bleanser.modules.smscalls --glob ...
 if __name__ == "__main__":
     Normaliser.main()
@@ -73,11 +72,11 @@ This is **always** acting on the data loaded into memory/temporary files, it is
 
 There are particular normalisers for different filetypes, e.g. [`json`](./src/bleanser/core/modules/json.py), [`xml`](./src/bleanser/core/modules/xml_clean.py), [`sqlite`](./src/bleanser/core/modules/sqlite.py) which might work if your data is especially basic, but typically this requires subclassing one of those and writing some custom code to 'cleanup' the data, so it can be properly compared/diffed.
 
-### do_cleanup
+### normalise
 
-There are two ways you can think about `do_cleanup` (creating a 'cleaned'/normalised representation of an input file) -- by specifying an 'upper' or 'lower' bound:
+There are two ways you can think about `normalise` (creating a 'cleaned'/normalised representation of an input file) -- by specifying an 'upper' or 'lower' bound:
 
-- upper: specify which data you want to drop, dumping everything else to `cleaned`
+- upper: specify which data you want to drop, dumping everything else to `normalised`
 - lower: specify which keys/data you want to keep, e.g. only returning a few keys which uniquely identify events in the data
 
 As an example say you had a JSON export:
@@ -178,4 +177,4 @@ if __name__ == "__main__":
     Normaliser.main()
 ```
 
-Otherwise if you have some complex data source you need to handle yourself, you can override `do_cleanup` and `unpacked` (how the data gets uncompressed/pre-processed) methods yourself
+Otherwise if you have some complex data source you need to handle yourself, you can override `do_normalise` and `unpacked` (how the data gets uncompressed/pre-processed) methods yourself
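The README's worked JSON example is collapsed in this view; as a rough illustration of the upper/lower distinction described above (the data and field names here are made up):

```python
# suppose one item from a JSON export looks like this:
item = {
    'id': 123,
    'text': 'some event',
    'fetched_at': '2023-10-15T00:00:00',  # changes on every export
}

# upper bound: drop only the fields known to be volatile, keep everything else
upper = {k: v for k, v in item.items() if k != 'fetched_at'}

# lower bound: keep only the keys that uniquely identify the event
lower = {'id': item['id']}
```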
2 changes: 1 addition & 1 deletion doc/options.md
@@ -1,6 +1,6 @@
 An explanation of the `--multiway`/`--prune-dominated` options, modified from [zulip chat](https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/bleanser/near/258276779)
 
-Say you had a bunch of sqlite databases and mapped them onto text dumps using `do_cleanup`. The idea is to figure out which dumps are redundant.
+Say you had a bunch of sqlite databases and mapped them onto text dumps using `normalise`. The idea is to figure out which dumps are redundant.
 
 Say you've got dumps `C.sql` and `B.sql` -- and you diff them (like literally, [`diff`](https://man7.org/linux/man-pages/man1/diff.1.html))
 
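The rest of that explanation is truncated here, but the gist of `--prune-dominated` can be sketched as follows (a toy illustration, not bleanser's actual implementation): if diffing `B.sql` against `C.sql` produces only additions, then `B.sql` carries no unique data and can be pruned.

```python
import subprocess

# in diff's normal output format, lines starting with '<' come from the first
# file only; if there are none, everything in B.sql is also present in C.sql,
# i.e. B.sql is 'dominated' by C.sql
out = subprocess.run(['diff', 'B.sql', 'C.sql'], capture_output=True, text=True).stdout
dominated = not any(line.startswith('<') for line in out.splitlines())
print('B.sql is redundant' if dominated else 'B.sql has unique data')
```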
5 changes: 0 additions & 5 deletions src/bleanser/core/common.py
@@ -59,11 +59,6 @@ class Keep(Instruction):
     pass
 
 
-class Config(NamedTuple):
-    prune_dominated: bool = False
-    multiway       : bool = False
-
-
 ### helper to define paramertized tests in function's body
 from .utils import under_pytest
 if under_pytest:
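The removed `Config` tuple isn't replaced one-for-one; as the sqlite test changes further down show, its fields now live as class variables on the normaliser subclass, so the configuration travels with the module itself. A minimal sketch:

```python
from bleanser.core.modules.sqlite import SqliteNormaliser

# what used to be Config(prune_dominated=True, multiway=False) is now
# declared directly on the Normaliser subclass
class MyNormaliser(SqliteNormaliser):
    MULTIWAY = False
    PRUNE_DOMINATED = True
```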
26 changes: 12 additions & 14 deletions src/bleanser/core/main.py
@@ -5,7 +5,7 @@
 from typing import Optional, List
 
 from .common import logger, Dry, Move, Remove, Mode
-from .processor import compute_instructions, apply_instructions
+from .processor import compute_instructions, apply_instructions, bleanser_tmp_directory
 
 import click
 
@@ -54,23 +54,21 @@ def diff(path1: str, path2: Path, *, glob: bool, from_: Optional[int], to: Optio
     if difftool is not None:
         os.environ['DIFFTOOL'] = difftool
 
-
     for line in compute_diff(path1_, path2, Normaliser=Normaliser):
         print(line)
 
-# todo ugh, name sucks
-@call_main.command(name='cleaned', short_help='cleanup file and dump to stdout')
+@call_main.command(name='normalised', short_help='normalise file and dump to stdout')
 @click.argument('path', type=Path)
-@click.option('--stdout', is_flag=True)
-def cleaned(path: Path, stdout: bool) -> None:
-    n = Normaliser()
-    from tempfile import TemporaryDirectory
-    with TemporaryDirectory() as td, n.do_cleanup(path, wdir=Path(td)) as cleaned:
-        if stdout:
-            print(cleaned.read_text())
-        else:
-            click.secho(f'You can examine cleaned file: {cleaned}', fg='green')
-            click.pause(info="Press any key when you've finished")
+@click.option('--stdout', is_flag=True, help='print normalised files to stdout instead of printing the path to it')
+def normalised(path: Path, stdout: bool) -> None:
+    with bleanser_tmp_directory() as base_tmp_dir:
+        n = Normaliser(input=path, base_tmp_dir=base_tmp_dir)
+        with n.do_normalise() as cleaned:
+            if stdout:
+                print(cleaned.read_text())
+            else:
+                click.secho(f'You can examine normalised file: {cleaned}', fg='green')
+                click.pause(info="Press any key when you've finished")
 
 
 @call_main.command(name='prune', short_help='process & prune files')
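With this change, dumping a single file's normalised form from the command line would look something like the following (the module name is just the README's example module):

```
python3 -m bleanser.modules.smscalls normalised --stdout /path/to/export.xml
```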
2 changes: 0 additions & 2 deletions src/bleanser/core/modules/json.py
@@ -24,8 +24,6 @@ def cleanup(self, j: Json) -> Json:
 
     @contextmanager
     def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
-        assert path.stat().st_size > 0, path  # just in case
-
         with self.unpacked(path=path, wdir=wdir) as upath:
             pass
         del path  # just to prevent from using by accident
37 changes: 13 additions & 24 deletions src/bleanser/core/modules/sqlite.py
@@ -12,7 +12,7 @@
 from typing import Dict, Any, Iterator, Sequence, ContextManager, Set, Tuple, ClassVar, Optional
 
 
-from ..common import parametrize, Config
+from ..common import parametrize
 from ..common import Keep, Prune
 from ..utils import mime
 from ..processor import compute_groups, compute_instructions, BaseNormaliser, unique_file_in_tempdir, sort_file
@@ -71,20 +71,12 @@ def _dict2db(d: Dict, *, to: Path) -> Path:
     return to  # just for convenience
 
 
-def _test_aux(path: Path, *, wdir: Path) -> ContextManager[Path]:
-    # TODO this assumes they are already cleaned up?
-    n = SqliteNormaliser()
-    return n.do_cleanup(path=path, wdir=wdir)
-
-
 def test_sqlite_simple(tmp_path: Path) -> None:
-    config = Config(multiway=False, prune_dominated=True)
-    func = lambda paths: compute_groups(
-        paths,
-        cleanup=_test_aux,
-        diff_filter=SqliteNormaliser._DIFF_FILTER,
-        config=config,
-    )
+    class TestNormaliser(SqliteNormaliser):
+        MULTIWAY = False
+        PRUNE_DOMINATED = True
+
+    func = lambda paths: compute_groups(paths, Normaliser=TestNormaliser)
 
     d: Dict[str, Any] = {'tq': [['col1', 'col2']]}
     ### just one file
@@ -185,13 +177,12 @@ def test_sqlite_simple(tmp_path: Path) -> None:
 
 
 @parametrize('multiway', [False, True])
-def test_sqlite_many(multiway: bool, tmp_path: Path) -> None:
-    config = Config(multiway=multiway)
-    N = 2000
+def test_sqlite_many(tmp_path: Path, multiway: bool) -> None:
+    class TestNormaliser(SqliteNormaliser):
+        MULTIWAY = multiway
+        PRUNE_DOMINATED = True
 
-    def ident(path: Path, *, wdir: Path) -> ContextManager[Path]:
-        n = SqliteNormaliser()
-        return n.do_cleanup(path=path, wdir=wdir)
+    N = 2000
 
     paths = []
     d: Dict[str, Any] = {}
@@ -204,9 +195,9 @@ def ident(path: Path, *, wdir: Path) -> ContextManager[Path]:
         paths.append(p)
 
     # shouldn't crash
-    instrs = list(compute_instructions(
+    instructions = list(compute_instructions(
         paths,
-        Normaliser=SqliteNormaliser,
+        Normaliser=TestNormaliser,
         threads=None,
     ))
 
@@ -236,8 +227,6 @@ def checked(cls, db: Path) -> Path:
     @contextmanager
     def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]:
         # NOTE: path here is the _original_ path passed to bleanser, so we can't modify in place
-        assert path.stat().st_size > 0, path  # just in case
-        # TODO maybe, later implement some sort of class variable instead of hardcoding
         # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process
         # otherwise it essentially blocks waiting for all mimes to compute..
         mp = mime(path)