From c442081c3ac46859462ccd10f65dc89e39a2f44d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 9 Nov 2020 05:00:24 +0000 Subject: [PATCH] make python-magic optional, rely on builtin mimetypes module first https://github.com/karlicoss/promnesia/issues/116 --- setup.py | 15 +++++++-------- src/promnesia/common.py | 33 ++++++++++++++++++++++++++++---- src/promnesia/sources/auto.py | 17 +++++----------- src/promnesia/sources/browser.py | 5 ++--- 4 files changed, 43 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 86b18be6..ba7ae5a0 100644 --- a/setup.py +++ b/setup.py @@ -29,12 +29,15 @@ def main(): python_requires='>=3.6', install_requires=[ - *DEPS_INDEXER, - *DEPS_SERVER, + 'appdirs', # for portable user directories detection + 'tzlocal', 'more_itertools', 'pytz', 'sqlalchemy', # DB api 'cachew>=0.8.0', # caching with type hints + + *DEPS_INDEXER, + *DEPS_SERVER, ], extras_require={ 'testing': [ @@ -61,17 +64,12 @@ def main(): } ) +# todo might be nice to ensure they are installable in separation? DEPS_INDEXER = [ - 'appdirs', # for portable user directories detection - 'urlextract', - - # TODO could be optional? - 'python-magic', # for detecting mime types ] DEPS_SERVER = [ - 'tzlocal', 'hug', ] @@ -80,6 +78,7 @@ def main(): # althrough server uses it so not sure... 
('optional', 'dependencies that bring some bells & whistles'): [ 'logzero', # pretty colored logging + 'python-magic', # better mimetype detection ], ('HPI' , 'dependencies for [[https://github.com/karlicoss/HPI][HPI]]'): [ 'HPI', # pypi version diff --git a/src/promnesia/common.py b/src/promnesia/common.py index 38d485a0..b6d345c5 100644 --- a/src/promnesia/common.py +++ b/src/promnesia/common.py @@ -352,12 +352,37 @@ def python3() -> str: # ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows @lru_cache(1) def _magic(): - import magic # type: ignore - return magic.Magic(mime=True) + logger = get_logger() + try: + import magic # type: ignore + except ModuleNotFoundError as me: + logger.exception(me) + msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation" + logger.warning(msg) + warnings.warn(msg) + return lambda path: None # stub + else: + mm = magic.Magic(mime=True) + return mm.from_file -def mime(path: PathIsh) -> str: - return _magic().from_file(str(path)) +@lru_cache(1) +def _mimetypes(): + import mimetypes + mimetypes.init() + return mimetypes + + +def mime(path: PathIsh) -> Optional[str]: + ps = str(path) + mimetypes = _mimetypes() + # first try mimetypes, it's only using the filename without opening the file + pm, _ = mimetypes.guess_type(ps) + if pm is not None: + return pm + # next, libmagic, it might access the file, so a bit slower + magic = _magic() + return magic(ps) def find_args(root: Path, follow: bool) -> List[str]: diff --git a/src/promnesia/sources/auto.py b/src/promnesia/sources/auto.py index 3bf174d4..b7fb31a0 100644 --- a/src/promnesia/sources/auto.py +++ b/src/promnesia/sources/auto.py @@ -252,25 +252,18 @@ def rit() -> Iterable[Path]: yield from r -import mimetypes -mimetypes.init() - - def by_path(pp: Path): suf = pp.suffix.lower() # firt check suffixes, it's faster s = type2idx(suf) if 
s is not None: return s, None - # then try mimetypes, it's only using the filename - pm, _ = mimetypes.guess_type(str(pp)) - if pm is not None: - s = type2idx(pm) - if s is not None: - return s, pm - # lastly, use libmagic, it's the slowest + # then try with mime pm = mime(pp) - return type2idx(pm), pm + if pm is not None: + return type2idx(pm), pm + else: + return None, None def _index_file(pp: Path, opts: Options) -> Results: diff --git a/src/promnesia/sources/browser.py b/src/promnesia/sources/browser.py index 1c8269e9..b5838c7e 100644 --- a/src/promnesia/sources/browser.py +++ b/src/promnesia/sources/browser.py @@ -6,7 +6,7 @@ import pytz -from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, _magic +from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime from .. import config # todo mcachew? @@ -16,12 +16,11 @@ def index(p: PathIsh) -> Results: - mime = _magic() pp = Path(p) assert pp.exists() # just in case of broken symlinks # is_file check because it also returns dirs - is_db = lambda x: x.is_file() and mime.from_file(str(x)) in ['application/x-sqlite3'] + is_db = lambda x: x.is_file() and mime(x) in ['application/x-sqlite3'] # todo warn if filtered out too many? # todo wonder how quickly mimes can be computed?