Skip to content

Commit

Permalink
general: refactor things here and there
Browse files Browse the repository at this point in the history
- get rid of legacy History class
- make everything more iterative
- minor cli enhancements (fix --no-serve)
  • Loading branch information
karlicoss committed Nov 9, 2020
1 parent 0b27ae3 commit eb7ae12
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 235 deletions.
69 changes: 35 additions & 34 deletions src/promnesia/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import inspect
import sys
from typing import List, Tuple, Optional, Dict, Sequence, Iterable
from typing import List, Tuple, Optional, Dict, Sequence, Iterable, Iterator
from pathlib import Path
from datetime import datetime
from subprocess import check_call
Expand All @@ -13,9 +13,10 @@
from . import config
from . import server
from .misc import install_server
from .common import PathIsh, History, make_filter, get_logger, get_tmpdir
from .common import previsits_to_history, Source, appdirs, python3, get_system_zone
from .dump import dump_histories
from .common import PathIsh, get_logger, get_tmpdir, DbVisit, Res
from .common import Source, appdirs, python3, get_system_zone
from .dump import visits_to_sqlite
from .extract import extract_visits, make_filter


def _do_index() -> Iterable[Exception]:
Expand All @@ -30,38 +31,35 @@ def _do_index() -> Iterable[Exception]:
logger.warning("OUTPUT_DIR '%s' didn't exist, creating", output_dir)
output_dir.mkdir(exist_ok=True, parents=True)

filters = [make_filter(f) for f in cfg.FILTERS]
for f in filters:
History.add_filter(f) # meh..


all_histories = []
errors = []

for idx in indexers:
if isinstance(idx, Exception):
errors.append(idx)
continue
# TODO more defensive! e.g. might not have __module__
einfo = f'{idx.ff.__module__}:{idx.ff.__name__} {idx.args} {idx.kwargs}'

hist, err = previsits_to_history(idx, src=idx.name)
errors.extend(err)
all_histories.append((einfo, hist))

# TODO perhaps it's better to open connection and dump as we collect so it consumes less memory?
dump_histories(all_histories)

# also keep & return errors for further display
errors: List[Exception] = []

def iter_all_visits() -> Iterator[Res[DbVisit]]:
for idx in indexers:
if isinstance(idx, Exception):
errors.append(idx)
yield idx
continue
# todo use this context? not sure where to attach...
einfo = f'{getattr(idx.ff, "__module__", None)}:{getattr(idx.ff, "__name__", None)} {idx.args} {idx.kwargs}'
for v in extract_visits(idx, src=idx.name):
if isinstance(v, Exception):
errors.append(v)
yield v

visits_to_sqlite(iter_all_visits())
return errors


def do_index(config_file: Path) -> None:
    """Load the user config from *config_file*, run the indexers and dump visits.

    Exits the process with code 1 if any indexer reported errors
    (the errors themselves are logged by _do_index).
    """
    logger = get_logger()
    config.load_from(config_file)  # meh.. should be cleaner
    try:
        errors = list(_do_index())
    finally:
        # always reset the global config, even if indexing blew up
        config.reset()
    if len(errors) > 0:
        logger.error('%d errors, exit code 1', len(errors))
        sys.exit(1)


Expand Down Expand Up @@ -106,14 +104,17 @@ def do_demo(*, index_as: str, params: Sequence[str], port: Optional[str], config
)
config.instance = cfg

errors = _do_index()
errors = list(_do_index())
if len(errors) > 0:
logger.error('%d errors during indexing (see logs above for backtraces)', len(errors))
for e in errors:
logger.error(e)

dbp = config.get().output_dir / 'promnesia.sqlite'
if port is None:
logger.warning("Port isn't specified, not serving!")
logger.warning(f"Port isn't specified, not serving!\nYou can inspect the database in the meantime, e.g. 'sqlitebrowser {dbp}'")
else:
server._run(port=port, db=config.get().output_dir / 'promnesia.sqlite', timezone=get_system_zone(), quiet=False)
server._run(port=port, db=dbp, timezone=get_system_zone(), quiet=False)

if sys.stdin.isatty():
input("Press any key when ready")
Expand Down Expand Up @@ -202,7 +203,7 @@ def main() -> None:
#

ap.add_argument('--port', type=str, default='13131' , help='Port to serve on')
ap.add_argument('--no-serve', action='store_false', dest='server', help='Pass to only index without running server')
ap.add_argument('--no-serve', action='store_const', const=None, dest='port', help='Pass to only index without running server')
ap.add_argument('--config', type=Path, required=False , help='Config to run against. If omitted, will use empty base config')
ap.add_argument(
'--as',
Expand Down
158 changes: 5 additions & 153 deletions src/promnesia/common.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from collections.abc import Sized
from datetime import datetime, date
import os
import re
from typing import NamedTuple, Set, Iterable, Dict, TypeVar, Callable, List, Optional, Union, Any, Collection, Sequence, Tuple, TypeVar
from pathlib import Path
from glob import glob
import itertools
import logging
from functools import lru_cache
import shutil
import traceback
import pytz
import warnings

from .cannon import CanonifyException, canonify
from .cannon import canonify


T = TypeVar('T')
Expand Down Expand Up @@ -171,15 +169,6 @@ def make(p: Visit, src: SourceName) -> Res['DbVisit']:

Filter = Callable[[Url], bool]

def make_filter(thing) -> Filter:
    """Coerce *thing* into a URL filter predicate.

    A string is treated as a regex pattern; the resulting predicate is
    True when the pattern matches anywhere in the url. Anything else is
    assumed to already be a predicate and is returned unchanged.
    """
    if not isinstance(thing, str):
        # must be predicate
        return thing
    pattern = re.compile(thing)

    def regex_filter(u: str) -> bool:
        return pattern.search(u) is not None

    return regex_filter


from .logging import LazyLogger
logger = LazyLogger('promnesia', level='DEBUG')
Expand All @@ -188,100 +177,6 @@ def get_logger() -> logging.Logger:
return logger


# TODO need to get rid of this.. just some legacy stuff...
class History(Sized):
    """Legacy accumulator: collects Visits for one source, converting them to
    DbVisits while deduplicating and applying the class-level URL filters.
    """

    # TODO I guess instead filter on DbVisit making site?
    # Class-level (shared across all instances) list of URL predicates;
    # a visit whose url matches any of them is silently dropped.
    FILTERS: List[Filter] = [
        make_filter(f) for f in
        [
            r'^chrome-devtools://',
            r'^chrome-extension://',
            r'^chrome-error://',
            r'^chrome-native://',
            r'^chrome-search://',

            r'chrome://newtab',
            r'chrome://apps',
            r'chrome://history',

            r'^about:',
            r'^blob:',
            r'^view-source:',

            r'^content:',

            # TODO maybe file:// too?
            # chrome-search:
        ]
    ]

    @classmethod
    def add_filter(cls, filterish):
        """Append a filter (string pattern or predicate) to the shared FILTERS list."""
        cls.FILTERS.append(make_filter(filterish))

    def __init__(self, *, src: SourceName):
        # maps each raw Visit to its converted DbVisit; also deduplicates raw visits
        self.vmap: Dict[Visit, DbVisit] = {}
        # TODO err... why does it map from previsit???
        self.logger = get_logger()
        self.src = src

    # TODO mm. maybe history should get filters from some global config?
    # wonder how okay is it to set class attribute..

    @classmethod
    def filtered(cls, url: Url) -> bool:
        """Return True if *url* matches any of the shared FILTERS (i.e. should be dropped)."""
        for f in cls.FILTERS:
            if f(url):
                return True
        return False

    @property
    def visits(self) -> List[DbVisit]:
        """All successfully registered visits, in insertion order."""
        return list(self.vmap.values())

    def register(self, v: Visit) -> Optional[Exception]:
        """Convert *v* to a DbVisit and store it.

        Returns None on success, when the visit is filtered out, or when it is
        a duplicate; returns the Exception from DbVisit.make on conversion
        failure (except CanonifyException, which is only logged).
        """
        # TODO should we filter before normalising? not sure...
        if History.filtered(v.url):
            return None

        # TODO perhaps take normalised into account here??
        if v in self.vmap:
            return None

        res = DbVisit.make(v, src=self.src)
        if isinstance(res, CanonifyException):
            # todo need to be a warning, so it's only happening once on a specific url
            self.logger.error('error while canonnifying %s... ignoring', v)
            self.logger.exception(res)
            return None
        elif isinstance(res, Exception):
            return res
        else:
            db_visit = res

        self.vmap[v] = db_visit
        return None
        # TODO hmm some filters make sense before stripping off protocol...

    ## only used in tests?..
    def _nmap(self):
        """Group the registered visits by their normalised url."""
        from itertools import groupby
        key = lambda x: x.norm_url
        return {k: list(g) for k, g in groupby(sorted(self.visits, key=key), key=key)}

    def __len__(self) -> int:
        # number of distinct normalised urls, not raw visits
        return len(self._nmap())

    def __contains__(self, url) -> bool:
        return url in self._nmap()

    def __getitem__(self, url: Url):
        return self._nmap()[url]
    #

    def __repr__(self):
        return 'History{' + repr(self.visits) + '}'


# kinda singleton
@lru_cache(1)
Expand Down Expand Up @@ -396,53 +291,6 @@ def name(self) -> str:
Indexer = Source


# TODO do we really need it?
def previsits_to_history(extractor, *, src: SourceName) -> Tuple[List[DbVisit], List[Exception]]:
    """Run *extractor*, feed its visits through a History for *src*, and
    return (successful DbVisits, errors encountered along the way).

    If the extractor itself raises, the result is ([], [that exception]).
    """
    # TODO isinstance wrapper?
    # TODO make more defensive?
    logger = get_logger()

    if isinstance(extractor, Indexer):
        log_info = f'{extractor.ff.__module__}:{extractor.ff.__name__} {extractor.args} {extractor.kwargs} ...'

        def run():
            return extractor.ff(*extractor.args, **extractor.kwargs)
    else:
        # TODO if it's a lambda?
        log_info = f'{extractor.__module__}:{extractor.__name__}'
        run = extractor

    logger.info('extracting via %s ...', log_info)

    history = History(src=src)
    errors: List[Exception] = []
    try:
        previsits = list(run())
    except Exception as e:
        logger.exception(e)
        return [], [e]

    for pv in previsits:
        if isinstance(pv, Exception):
            errors.append(pv)
            msg_parts = ['indexer emitted exception\n']
            # eh, exception type is ignored by format_exception completely, apparently??
            msg_parts.extend(traceback.format_exception(Exception, pv, pv.__traceback__))
            logger.error(''.join(msg_parts))
            continue

        # TODO check whether it's filtered before construction? probably doesn't really impact
        outcome = history.register(pv)
        if isinstance(outcome, Exception):
            logger.exception(outcome)
            errors.append(outcome)

    # TODO should handle filtering properly?
    logger.info('extracting via %s: got %d visits', log_info, len(history))
    return history.visits, errors


# not sure if necessary anymore?
# NOTE: used in configs...
def last(path: PathIsh, *parts: str) -> Path:
Expand Down Expand Up @@ -582,3 +430,7 @@ def get_system_tz():
def file_mtime(path: PathIsh) -> datetime:
    """Return the file's modification time as a timezone-aware datetime in the system timezone."""
    mtime = Path(path).stat().st_mtime
    return datetime.fromtimestamp(mtime, tz=get_system_tz())


def now_tz() -> datetime:
    """Current time as a timezone-aware datetime in the system timezone."""
    tz = get_system_tz()
    return datetime.now(tz=tz)
43 changes: 29 additions & 14 deletions src/promnesia/dump.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from pathlib import Path
import shutil
from typing import Dict, List, Tuple, Set
from typing import Dict, List, Tuple, Set, Iterable

from more_itertools import chunked

Expand All @@ -10,7 +10,7 @@

from cachew import NTBinder

from .common import get_logger, DbVisit, get_tmpdir
from .common import get_logger, DbVisit, get_tmpdir, Res, now_tz, Loc
from . import config


Expand All @@ -27,18 +27,33 @@
_CHUNK_BY = 10


def dump_histories(all_histories: List[Tuple[str, List[DbVisit]]]) -> None:
def visits_to_sqlite(vit: Iterable[Res[DbVisit]]) -> None:
logger = get_logger()
output_dir = Path(config.get().output_dir)
db_path = output_dir / 'promnesia.sqlite'

def iter_visits():
for e, h in all_histories:
# TODO sort them somehow for determinism?
# TODO what do we do with errors?
# TODO maybe conform them to schema and dump too?
# TODO or, dump to a separate table?
yield from h
now = now_tz()
ok = 0
errors = 0
def vit_ok() -> Iterable[DbVisit]:
nonlocal errors, ok
for v in vit:
if isinstance(v, DbVisit):
ok += 1
yield v
else:
errors += 1
# conform to the schema and dump. can't hurt anyway
ev = DbVisit(
norm_url='<error>',
orig_url='<error>',
dt=now,
locator=Loc.make('<errror>'),
src='error',
# todo attach backtrace?
context=repr(v),
)
yield ev

tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
if not policy_update:
Expand All @@ -53,8 +68,8 @@ def iter_visits():

cleared: Set[str] = set()
with engine.begin() as conn:
for chunk in chunked(iter_visits(), n=_CHUNK_BY):
srcs = set(v.src for v in chunk)
for chunk in chunked(vit_ok(), n=_CHUNK_BY):
srcs = set(v.src or '' for v in chunk)
new = srcs.difference(cleared)
for src in new:
conn.execute(table.delete().where(table.c.src == src))
Expand All @@ -67,5 +82,5 @@ def iter_visits():
if not policy_update:
shutil.move(str(tpath), str(db_path))

logger.info('saved database to %s', db_path)
# TODO log error count
errs = '' if errors == 0 else f', {errors} ERRORS'
logger.info('saved database to "%s". %d total (%d OK%s)', db_path, ok + errors, ok, errs)
Loading

0 comments on commit eb7ae12

Please sign in to comment.