experiment with hypothesis normaliser via HPI extraction and the corresponding 'cleanup' normaliser #28

Draft · wants to merge 3 commits into master
31 changes: 31 additions & 0 deletions src/bleanser/modules/hpi/hypothesis.py
@@ -0,0 +1,31 @@
from pathlib import Path
from typing import Any, Iterator

from bleanser.core.modules.extract import ExtractObjectsNormaliser

from my.core.cfg import tmp_config
import my.hypothesis


# FIXME: need to disable cachew when normalising via HPI,
# otherwise it will thrash the cache all the time,
# and could even give inconsistent results if there is a bug in the cache key


class Normaliser(ExtractObjectsNormaliser):
    def extract_objects(self, path: Path) -> Iterator[Any]:
        class config:
            class hypothesis:
                export_path = path

        with tmp_config(modules=my.hypothesis.__name__, config=config):
            ## sanity check to make sure tmp_config worked as expected
            # (most modules should be able to use module.inputs() directly instead)
            dal = my.hypothesis._dal()
            assert len(dal.sources) == 1
            ##
            yield from my.hypothesis.highlights()


if __name__ == '__main__':
    Normaliser.main()
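A possible fix for the FIXME above, mirroring what reddit.py below already does: disable cachew via its environment variable before the HPI module is imported, so normalising files one at a time neither pollutes nor reads from the shared cache. A minimal sketch, assuming cachew honours CACHEW_DISABLE the same way reddit.py relies on it:

import os

# turn cachew off before my.hypothesis is imported, so caching is off from the start
os.environ['CACHEW_DISABLE'] = '*'

import my.hypothesis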
33 changes: 33 additions & 0 deletions src/bleanser/modules/hpi/kobo.py
@@ -0,0 +1,33 @@
from pathlib import Path
from typing import Any, Iterator

from bleanser.core.modules.extract import ExtractObjectsNormaliser

from my.core.cfg import tmp_config
import my.kobo


class Normaliser(ExtractObjectsNormaliser):
    def extract_objects(self, path: Path) -> Iterator[Any]:
        class config:
            class kobo:
                export_path = path

        with tmp_config(modules=my.kobo.__name__, config=config):
            ## sanity check to make sure tmp_config worked as expected
            assert len(my.kobo.DATABASES) == 1
            ##
            yield from my.kobo._iter_highlights()
            # TODO: events could presumably be extracted too (see the sketch below)


if __name__ == '__main__':
    Normaliser.main()
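As the TODO above notes, Kobo events could presumably be extracted alongside highlights with one more yield from inside the same tmp_config block. A sketch, where my.kobo._iter_events is an assumed accessor mirroring _iter_highlights (not verified against my.kobo):

from pathlib import Path
from typing import Any, Iterator

from my.core.cfg import tmp_config
import my.kobo


def extract_objects(path: Path) -> Iterator[Any]:
    class config:
        class kobo:
            export_path = path

    with tmp_config(modules=my.kobo.__name__, config=config):
        assert len(my.kobo.DATABASES) == 1
        yield from my.kobo._iter_highlights()
        yield from my.kobo._iter_events()  # assumed accessor, not verified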
54 changes: 54 additions & 0 deletions src/bleanser/modules/hpi/reddit.py
@@ -0,0 +1,54 @@
import os
from pathlib import Path
from typing import Any, Iterator

from bleanser.core.modules.extract import ExtractObjectsNormaliser

from my.core.cfg import tmp_config
from my.core.freezer import Freezer


# disable the cache, otherwise it's going to get flushed all the time
os.environ['CACHEW_DISABLE'] = '*'
os.environ.pop('ENLIGHTEN_ENABLE', None)
os.environ['LOGGING_LEVEL_rexport_dal'] = 'WARNING'
# os.environ['LOGGING_LEVEL_my_reddit_rexport'] = 'WARNING'

import my.reddit.rexport as reddit


class Normaliser(ExtractObjectsNormaliser):
    def extract_objects(self, path: Path) -> Iterator[Any]:
        class config:
            class reddit:
                # FIXME: need to put this in reddit.rexport
                export_path = path

        with tmp_config(modules=reddit.__name__, config=config):
            ## sanity check to make sure tmp_config worked as expected
            # (most modules should be able to use module.inputs() directly instead)
            assert len(reddit.inputs()) == 1

            reddit_profile = lambda: [reddit.profile()]
            for (method, type_) in [
                # fmt: off
                (reddit.saved       , reddit.Save       ),
                (reddit.comments    , reddit.Comment    ),
                (reddit.submissions , reddit.Submission ),
                (reddit.upvoted     , reddit.Upvote     ),
                (reddit.subreddits  , reddit.Subreddit  ),
                (reddit.multireddits, reddit.Multireddit),
                (reddit_profile     , reddit.Profile    ),
                # fmt: on
            ]:
                # run objects through the freezer so they are dumped as dataclasses
                freezer = Freezer(Orig=type_)
                for x in map(freezer.freeze, method()):
                    # raw data might be too noisy to compare
                    x.raw = None  # type: ignore
                    # FIXME: the freezer currently hardcodes 'RRR' as the dataclass name,
                    # so wrap each object in a dict keyed by its real type name
                    yield {type_.__name__: x}


if __name__ == '__main__':
    Normaliser.main()
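A toy illustration (hypothetical data, with a plain dataclass standing in for the freezer output) of why each frozen object is wrapped as {type name: object}: since the freezer currently hardcodes the dataclass name, a Save and a Comment with identical fields would otherwise look the same once dumped, and the normaliser could wrongly collapse them:

from dataclasses import dataclass


@dataclass
class RRR:  # stand-in for the hardcoded dataclass name the FIXME mentions
    id: str


save, comment = RRR(id='abc'), RRR(id='abc')
assert save == comment                         # indistinguishable on their own
assert {'Save': save} != {'Comment': comment}  # the dict wrapper keeps the type visible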
13 changes: 13 additions & 0 deletions src/bleanser/modules/hypothesis.py
@@ -0,0 +1,13 @@
from bleanser.core.modules.json import JsonNormaliser, Json


class Normaliser(JsonNormaliser):
    def cleanup(self, j: Json) -> Json:
        if isinstance(j, list):
            # old export format
            return j
        del j['profile']['features']  # flaky, changes between exports
        return j


if __name__ == '__main__':
    Normaliser.main()
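For reference, the intended effect of cleanup on a new-format export, shown on a hypothetical, heavily trimmed payload: the flaky profile.features block is dropped and everything else passes through untouched:

# hypothetical, heavily trimmed new-format export
j = {
    'profile': {'userid': 'acct:someone@hypothes.is', 'features': {'some_flag': True}},
    'annotations': [{'id': 'a1', 'text': 'some highlight'}],
}
del j['profile']['features']  # what cleanup() does for dict-shaped (new) exports
assert j == {
    'profile': {'userid': 'acct:someone@hypothes.is'},
    'annotations': [{'id': 'a1', 'text': 'some highlight'}],
}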