Commit e016ce3
karlicoss committed Oct 15, 2023
1 parent 4f64d7f commit e016ce3
Showing 3 changed files with 38 additions and 36 deletions.
5 changes: 0 additions & 5 deletions src/bleanser/core/common.py
@@ -59,11 +59,6 @@ class Keep(Instruction):
     pass
 
 
-class Config(NamedTuple):
-    prune_dominated: bool = False
-    multiway       : bool = False
-
-
 ### helper to define paramertized tests in function's body
 from .utils import under_pytest
 if under_pytest:
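
For orientation: the Config NamedTuple deleted above carried the same two flags that BaseNormaliser exposes as class variables in the processor.py hunks below, so configuration moves from a value threaded through calls to attributes overridden on each normaliser subclass. A minimal sketch of the new style, assuming only what the diff shows (MyNormaliser is a hypothetical subclass, not part of the commit):

    # sketch only -- mirrors the pattern the tests in this commit use
    from bleanser.core.processor import BaseNormaliser

    class MyNormaliser(BaseNormaliser):
        # formerly Config(prune_dominated=True, multiway=True)
        PRUNE_DOMINATED = True
        MULTIWAY = True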
5 changes: 3 additions & 2 deletions src/bleanser/core/modules/sqlite.py
@@ -12,7 +12,7 @@
 from typing import Dict, Any, Iterator, Sequence, ContextManager, Set, Tuple, ClassVar, Optional
 
 
-from ..common import parametrize, Config
+from ..common import parametrize
 from ..common import Keep, Prune
 from ..utils import mime
 from ..processor import compute_groups, compute_instructions, BaseNormaliser, unique_file_in_tempdir, sort_file
@@ -180,6 +180,7 @@ class TestNormaliser(SqliteNormaliser):
 def test_sqlite_many(tmp_path: Path, multiway: bool) -> None:
     class TestNormaliser(SqliteNormaliser):
         MULTIWAY = multiway
+        PRUNE_DOMINATED = True
 
     N = 2000
 
@@ -194,7 +195,7 @@ class TestNormaliser(SqliteNormaliser):
         paths.append(p)
 
     # shouldn't crash
-    instrs = list(compute_instructions(
+    instructions = list(compute_instructions(
         paths,
         Normaliser=TestNormaliser,
         threads=None,
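
The test above doubles as a usage example for the public entry point; a trimmed sketch of driving it directly (the backup directory and MyNormaliser subclass are placeholders, the keyword-only signature is as shown in the diff):

    from pathlib import Path

    from bleanser.core.modules.sqlite import SqliteNormaliser
    from bleanser.core.processor import compute_instructions

    class MyNormaliser(SqliteNormaliser):  # hypothetical subclass
        MULTIWAY = True
        PRUNE_DOMINATED = True

    paths = sorted(Path('/tmp/backups').glob('*.db'))  # placeholder input
    instructions = list(compute_instructions(paths, Normaliser=MyNormaliser, threads=None))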
64 changes: 35 additions & 29 deletions src/bleanser/core/processor.py
@@ -9,10 +9,10 @@
 from subprocess import check_call
 from tempfile import TemporaryDirectory, gettempdir, NamedTemporaryFile
 from time import time
-from typing import Dict, Iterator, Sequence, Optional, Tuple, Optional, Union, ContextManager, Protocol, List, Set, ClassVar, Type, Iterable, NoReturn, Any, Callable
+from typing import Dict, Iterator, Sequence, Optional, Tuple, Optional, Union, ContextManager, List, Set, ClassVar, Type, Iterable, NoReturn, Any, Callable
 import warnings
 
-from .common import Group, logger, Config, parametrize
+from .common import Group, logger, parametrize
 from .common import Instruction, Keep, Prune
 from .common import divide_by_size
 from .utils import total_dir_size
@@ -21,7 +21,7 @@
 
 from kompress import CPath
 import more_itertools
-from plumbum import local # type: ignore
+from plumbum import local  # type: ignore
 
 
 @contextmanager
@@ -59,6 +59,7 @@ def unique_file_in_tempdir(*, input_filepath: Path, wdir: Path, suffix: Optional
     cleaned_path.parent.mkdir(parents=True, exist_ok=True)
     return cleaned_path
 
+
 # meh... see Fileset._union
 # this gives it a bit of a speedup when comparing
 def sort_file(filepath: Union[str, Path]) -> None:
@@ -74,7 +75,7 @@ def sort_file(filepath: Union[str, Path]) -> None:
 class BaseNormaliser:
     ## user overridable configs
     PRUNE_DOMINATED: ClassVar[bool] = False
-    MULTIWAY       : ClassVar[bool] = False
+    MULTIWAY: ClassVar[bool] = False
     ##
 
     # todo maybe get rid of it? might be overridden by subclasses but probs. shouldn't
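
This is the replacement surface for the Config NamedTuple deleted from common.py: both flags now live directly on BaseNormaliser as ClassVars, overridable per subclass, as in the sketch after the common.py hunk above.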
@@ -143,7 +144,9 @@ def do_normalise(self) -> Iterator[Cleaned]:
         if len(parameters) == 0:
             kwargs = {}
         elif parameters.keys() == {'path', 'wdir'}:
-            warnings.warn("path and wdir arguments on do_cleanup method are deprecated. Please remove them and switch to using self.input and self.tmp_dir")
+            warnings.warn(
+                "path and wdir arguments on do_cleanup method are deprecated. Please remove them and switch to using self.input and self.tmp_dir"
+            )
             kwargs = {
                 'path': self.input,
                 'wdir': self.tmp_dir,
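
The parameters mapping checked here presumably comes from introspecting the subclass's do_cleanup signature. A generic, self-contained sketch of that deprecation-shim pattern (call_cleanup and obj are illustrative, not bleanser API; obj.input / obj.tmp_dir mirror the attributes named in the warning):

    import inspect
    import warnings

    def call_cleanup(obj):
        # look at which arguments the (possibly overridden) method declares
        parameters = dict(inspect.signature(obj.do_cleanup).parameters)
        if len(parameters) == 0:
            kwargs = {}
        elif parameters.keys() == {'path', 'wdir'}:
            # legacy signature: keep working, but nudge towards the new attributes
            warnings.warn("path/wdir arguments are deprecated, use obj.input/obj.tmp_dir")
            kwargs = {'path': obj.input, 'wdir': obj.tmp_dir}
        else:
            raise RuntimeError(f"unexpected do_cleanup signature: {parameters}")
        return obj.do_cleanup(**kwargs)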
@@ -198,10 +201,10 @@ def main(cls) -> None:
 
 
 def compute_groups(
-        paths: Sequence[Path],
-        *,
-        Normaliser: Type[BaseNormaliser],
-        threads: Optional[int]=None,
+    paths: Sequence[Path],
+    *,
+    Normaliser: Type[BaseNormaliser],
+    threads: Optional[int] = None,
 ) -> Iterator[Group]:
     assert len(paths) == len(set(paths)), paths  # just in case
     assert len(paths) > 0  # just in case
@@ -228,12 +231,14 @@ def compute_groups(
                 func = _compute_groups_serial_as_list
             else:
                 func = _compute_groups_serial
-            futures.append(pool.submit(
-                func,
-                paths=pp,
-                Normaliser=Normaliser,
-                base_tmp_dir=base_tmp_dir,
-            ))
+            futures.append(
+                pool.submit(
+                    func,
+                    paths=pp,
+                    Normaliser=Normaliser,
+                    base_tmp_dir=base_tmp_dir,
+                )
+            )
     emitted: Set[Path] = set()
     for chunk, f in zip(chunks, futures):
         last = chunk[0]
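
Aside from re-indenting the pool.submit call, this hunk shows the concurrency structure: input paths are split into chunks, one future is submitted per chunk, and zip(chunks, futures) consumes results in submission order, keeping the output deterministic. A stripped-down sketch of the same fan-out (names are illustrative; the real code sizes chunks with divide_by_size):

    from concurrent.futures import ThreadPoolExecutor

    def process_in_chunks(items, *, nchunks, work):
        # contiguous chunks, so reading futures back in order preserves input order
        size = max(1, (len(items) + nchunks - 1) // nchunks)
        chunks = [items[i:i + size] for i in range(0, len(items), size)]
        with ThreadPoolExecutor(max_workers=nchunks) as pool:
            futures = [pool.submit(work, chunk) for chunk in chunks]
            for chunk, future in zip(chunks, futures):
                yield from future.result()  # in-order, unlike as_completed()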
@@ -466,10 +471,10 @@ def _compute_groups_serial_as_list(*args: Any, **kwargs: Any) -> Iterable[Group]
 # todo these are already normalized paths?
 # although then harder to handle exceptions... ugh
 def _compute_groups_serial(
-        paths: Sequence[Path],
-        *,
-        Normaliser: Type[BaseNormaliser],
-        base_tmp_dir: Path,
+    paths: Sequence[Path],
+    *,
+    Normaliser: Type[BaseNormaliser],
+    base_tmp_dir: Path,
 ) -> Iterable[Group]:
     assert len(paths) > 0
 
@@ -510,6 +515,7 @@ def fset(*paths: Path) -> FileSet:
 
     def unlink_tmp_output(cleaned: Path) -> None:
         # meh. unlink is a bit manual, but bounds the filesystem use by two dumps
+        # todo maybe unlink whole tmp_dir for normaliser?
         orig = cleaned2orig[cleaned]
         if orig == cleaned:
             # handle 'identity' cleanup -- shouldn't try to remove user files
@@ -656,8 +662,8 @@ def group(rm_last: bool) -> Group:
     assert cached == total, 'Iterator should be fully processed!'
 
     stale_files = [p for p in base_tmp_dir.rglob('*') if p.is_file()]
-    # TODO not sure if should keep it as assert of make a warning?
-    # assert len(stale_files) == 0, stale_files  # FIXME
+    # TODO at the moment this assert fails sometimes -- need to investigate
+    # assert len(stale_files) == 0, stale_files
 
 
 # note: also some tests in sqlite.py
@@ -700,6 +706,7 @@ def test_bounded_resources(tmp_path: Path, multiway: bool, randomize: bool) -> None:
 
     idx = 0
     tmp_dir_spaces = []
+
     def check_tmp_dir_space(tmp_dir: Path) -> None:
         nonlocal idx
         # logger.warning('ITERATION: %s', idx)
@@ -721,7 +728,6 @@ def check_tmp_dir_space(tmp_dir: Path) -> None:
         tmp_dir_spaces.append(ds)
         idx += 1
 
-
     class TestNormaliser(BaseNormaliser):
         MULTIWAY = multiway
         PRUNE_DOMINATED = True
@@ -734,7 +740,6 @@ def do_cleanup(self) -> Iterator[Path]:
             check_tmp_dir_space(self._base_tmp_dir)
             yield normalised
 
-
     func = lambda paths: compute_groups(paths, Normaliser=TestNormaliser)
 
     # force it to compute
@@ -765,7 +770,6 @@ class TestNormaliser(BaseNormaliser):
         MULTIWAY = multiway
         PRUNE_DOMINATED = True
 
-
     paths = []
     for i in range(N):
         p = tmp_path / f'{i:05}'
@@ -999,7 +1003,7 @@ def groups_to_instructions(groups: Iterable[Group]) -> Iterator[Instruction]:
 
 
 def test_groups_to_instructions() -> None:
-    def do(*pp: Sequence[str], config=Config()):
+    def do(*pp: Sequence[str]):
         ppp = [list(map(Path, s)) for s in pp]
         # for this test we assume pivots are just at the edges
         grit = (
@@ -1100,10 +1104,10 @@ def do(*pp: Sequence[str], config=Config()):
 
 
 def compute_instructions(
-        paths: Sequence[Path],
-        *,
-        Normaliser: Type[BaseNormaliser],
-        threads: Optional[int],
+    paths: Sequence[Path],
+    *,
+    Normaliser: Type[BaseNormaliser],
+    threads: Optional[int],
 ) -> Iterator[Instruction]:
     groups: Iterable[Group] = compute_groups(
         paths=paths,
@@ -1137,9 +1141,11 @@ def apply_instructions(instructions: Iterable[Instruction], *, mode: Mode=Dry(),
         totals = '???'
 
     rm_action = {
+        # fmt: off
         Dry   : click.style('REMOVE (dry mode)', fg='yellow'),
        Move  : click.style('MOVE             ', fg='yellow'),
         Remove: click.style('REMOVE           ', fg='red'   ),
+        # fmt: on
     }[type(mode)]
 
     tot_files = 0
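
The new # fmt: off / # fmt: on pair tells black to leave this hand-aligned dict literal alone; without it the next formatter run would collapse the padded Dry / Move / Remove keys (the trailing spaces inside the styled strings, which keep log output columns aligned, are untouched by formatters either way).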