From 32c26e8ffa81e33c5db6a7a71913ebd1d5fc5683 Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Tue, 17 Oct 2023 01:20:49 +0100
Subject: [PATCH 1/3] experiment with hypothesis normaliser via HPI extraction and the corresponding 'cleanup' normaliser

---
Summary: adds hpi/hypothesis.py (an ExtractObjectsNormaliser subclass that
points a temporary config's hypothesis.export_path at the input file and
yields my.hypothesis.highlights()) and hypothesis.py (a JsonNormaliser
subclass whose cleanup() returns list inputs unchanged and otherwise deletes
j['profile']['features']).

NOTE(review): this series was recovered from a copy whose line breaks had been
collapsed. Leading indentation of the '+' lines and the column alignment were
reconstructed from Python syntax -- confirm against the original commits before
applying. Hunk line counts match the @@ headers.

 src/bleanser/modules/hpi/hypothesis.py | 32 ++++++++++++++++++++++++++
 src/bleanser/modules/hypothesis.py     | 13 +++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 src/bleanser/modules/hpi/hypothesis.py
 create mode 100644 src/bleanser/modules/hypothesis.py

diff --git a/src/bleanser/modules/hpi/hypothesis.py b/src/bleanser/modules/hpi/hypothesis.py
new file mode 100644
index 0000000..3dc02c2
--- /dev/null
+++ b/src/bleanser/modules/hpi/hypothesis.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+from pathlib import Path
+from typing import Any, Iterator
+
+from bleanser.core.modules.extract import ExtractObjectsNormaliser
+
+from my.core.cfg import tmp_config
+import my.hypothesis
+
+
+# FIXME need to disable cachew when using normalising via HPI
+# otherwise will mess up the cache all the time
+# or even potentially can give inconsistent results if there is a bug in cache key
+
+
+class Normaliser(ExtractObjectsNormaliser):
+    def extract_objects(self, path: Path) -> Iterator[Any]:
+        class config:
+            class hypothesis:
+                export_path = path
+
+        with tmp_config(modules=my.hypothesis.__name__, config=config):
+            ## sanity check to make sure tmp_config worked as expected
+            # for most modules should be able to use module.inputs() directly though
+            dal = my.hypothesis._dal()
+            assert len(dal.sources) == 1
+            ##
+            yield from my.hypothesis.highlights()
+
+
+if __name__ == '__main__':
+    Normaliser.main()
diff --git a/src/bleanser/modules/hypothesis.py b/src/bleanser/modules/hypothesis.py
new file mode 100644
index 0000000..d9e7798
--- /dev/null
+++ b/src/bleanser/modules/hypothesis.py
@@ -0,0 +1,13 @@
+from bleanser.core.modules.json import JsonNormaliser, Json
+
+
+class Normaliser(JsonNormaliser):
+    def cleanup(self, j: Json) -> Json:
+        if isinstance(j, list):
+            # old export format
+            return j
+        del j['profile']['features']  # flaky
+        return j
+
+if __name__ == '__main__':
+    Normaliser.main()

From 50a8a121eda77730f8e7bce8e969cfdd0c0b0200 Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Tue, 17 Oct 2023 23:31:24 +0100
Subject: [PATCH 2/3] kobo

---
Summary: drops the shebang line from hpi/hypothesis.py and adds hpi/kobo.py,
an ExtractObjectsNormaliser subclass that points a temporary config's
kobo.export_path at the input file, asserts my.kobo.DATABASES has exactly one
entry and yields from my.kobo._iter_highlights().

NOTE(review): indentation was reconstructed. Placing the second 'yield from'
and the trailing comment block inside the 'with tmp_config(...)' body is an
assumption -- confirm against the original commit.

 src/bleanser/modules/hpi/hypothesis.py |  1 -
 src/bleanser/modules/hpi/kobo.py       | 33 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 src/bleanser/modules/hpi/kobo.py

diff --git a/src/bleanser/modules/hpi/hypothesis.py b/src/bleanser/modules/hpi/hypothesis.py
index 3dc02c2..f25845b 100644
--- a/src/bleanser/modules/hpi/hypothesis.py
+++ b/src/bleanser/modules/hpi/hypothesis.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 from pathlib import Path
 from typing import Any, Iterator
 
diff --git a/src/bleanser/modules/hpi/kobo.py b/src/bleanser/modules/hpi/kobo.py
new file mode 100644
index 0000000..a7d3d78
--- /dev/null
+++ b/src/bleanser/modules/hpi/kobo.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+from typing import Any, Iterator
+
+from bleanser.core.modules.extract import ExtractObjectsNormaliser
+
+from my.core.cfg import tmp_config
+import my.kobo
+
+
+class Normaliser(ExtractObjectsNormaliser):
+    def extract_objects(self, path: Path) -> Iterator[Any]:
+        class config:
+            class kobo:
+                export_path = path
+
+        with tmp_config(modules=my.kobo.__name__, config=config):
+            assert len(my.kobo.DATABASES) == 1
+            yield from []
+
+            yield from my.kobo._iter_highlights()
+            # iter_highlights
+            # iter_events
+            #
+            ## sanity check to make sure tmp_config worked as expected
+            # for most modules should be able to use module.inputs() directly though
+            # dal = my.hypothesis._dal()
+            # assert len(dal.sources) == 1
+            ##
+            # yield from my.hypothesis.highlights()
+
+
+if __name__ == '__main__':
+    Normaliser.main()

From 30f94a3fdaa62ad6a597f6487d729c7987e094f7 Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Wed, 18 Oct 2023 22:30:14 +0100
Subject: [PATCH 3/3] reddit normalising via hpi

---
Summary: adds hpi/reddit.py. At import time it sets CACHEW_DISABLE='*',
removes ENLIGHTEN_ENABLE from the environment and sets
LOGGING_LEVEL_rexport_dal='WARNING' before importing my.reddit.rexport. Its
Normaliser runs each listed rexport accessor through a Freezer, sets .raw to
None on every frozen item and yields it as {type name: item}.

NOTE(review): indentation and the column alignment of the '# fmt: off' table
were reconstructed. Placing the loop inside the 'with tmp_config(...)' body is
an assumption -- confirm against the original commit.

 src/bleanser/modules/hpi/reddit.py | 54 ++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 src/bleanser/modules/hpi/reddit.py

diff --git a/src/bleanser/modules/hpi/reddit.py b/src/bleanser/modules/hpi/reddit.py
new file mode 100644
index 0000000..2c7230b
--- /dev/null
+++ b/src/bleanser/modules/hpi/reddit.py
@@ -0,0 +1,54 @@
+import os
+from pathlib import Path
+from typing import Any, Iterator
+
+from bleanser.core.modules.extract import ExtractObjectsNormaliser
+
+from my.core.cfg import tmp_config
+from my.core.freezer import Freezer
+
+
+# disable cache, otherwise it's gonna flush it all the time
+os.environ['CACHEW_DISABLE'] = '*'
+os.environ.pop('ENLIGHTEN_ENABLE', None)
+os.environ['LOGGING_LEVEL_rexport_dal'] = 'WARNING'
+# os.environ['LOGGING_LEVEL_my_reddit_rexport'] = 'WARNING'
+
+import my.reddit.rexport as reddit
+
+
+class Normaliser(ExtractObjectsNormaliser):
+    def extract_objects(self, path: Path) -> Iterator[Any]:
+        class config:
+            class reddit:
+                # FIXME need to put in reddit.rexport
+                export_path = path
+
+        with tmp_config(modules=reddit.__name__, config=config):
+            ## sanity check to make sure tmp_config worked as expected
+            # for most modules should be able to use module.inputs() directly though
+            assert len(reddit.inputs()) == 1
+
+            reddit_profile = lambda: [reddit.profile()]
+            for (method, type_) in [
+                # fmt: off
+                (reddit.saved       , reddit.Save       ),
+                (reddit.comments    , reddit.Comment    ),
+                (reddit.submissions , reddit.Submission ),
+                (reddit.upvoted     , reddit.Upvote     ),
+                (reddit.subreddits  , reddit.Subreddit  ),
+                (reddit.multireddits, reddit.Multireddit),
+                (reddit_profile     , reddit.Profile    ),
+                # fmt: on
+            ]:
+                # need to run it past freezer so it's dumped as dataclass
+                freezer = Freezer(Orig=type_)
+                for x in map(freezer.freeze, method()):
+                    # raw data might be too noisy
+                    x.raw = None  # type: ignore
+                    # FIXME currently freezer hardcodes RRR for dataclass name
+                    yield {type_.__name__: x}
+
+
+if __name__ == '__main__':
+    Normaliser.main()