diff --git a/README.rst b/README.rst index 9b9e80f69b120..94df109e1c2c4 100644 --- a/README.rst +++ b/README.rst @@ -50,6 +50,7 @@ scikit-learn requires: - Python (>= 3.5) - NumPy (>= 1.11.0) - SciPy (>= 0.17.0) +- joblib (>= 0.11) **Scikit-learn 0.20 was the last version to support Python2.7.** Scikit-learn 0.21 and later require Python 3.5 or newer. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b66219d5f8c2..3a950325812dd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,7 +11,6 @@ jobs: py35_np_atlas: DISTRIB: 'ubuntu' PYTHON_VERSION: '3.5' - SKLEARN_SITE_JOBLIB: '1' JOBLIB_VERSION: '0.11' SKLEARN_NO_OPENMP: 'True' # Linux + Python 3.5 build with OpenBLAS and without SITE_JOBLIB @@ -23,10 +22,11 @@ jobs: SCIPY_VERSION: '0.17.0' CYTHON_VERSION: '*' PILLOW_VERSION: '4.0.0' + # later versions of joblib are not packaged in conda for Python 3.5 + JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # Linux environment to test the latest available dependencies and MKL. # It runs tests requiring pandas and PyAMG. - # It also runs with the site joblib instead of the vendored copy of joblib. pylatest_conda: DISTRIB: 'conda' PYTHON_VERSION: '*' @@ -41,7 +41,6 @@ jobs: COVERAGE: 'true' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' TEST_DOCSTRINGS: 'true' - SKLEARN_SITE_JOBLIB: '1' CHECK_WARNINGS: 'true' - template: build_tools/azure/posix.yml diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 2ae3a8d400d25..97f5cb4f7e465 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -11,11 +11,11 @@ IF "%PYTHON_ARCH%"=="64" ( call deactivate @rem Clean up any left-over from a previous build conda remove --all -q -y -n %VIRTUALENV% - conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow + conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow joblib call activate %VIRTUALENV% ) else ( - pip install numpy scipy cython pytest wheel pillow + pip install numpy scipy cython pytest wheel pillow joblib ) if "%COVERAGE%" == "true" ( pip install coverage codecov pytest-cov diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 472cb61a171bf..519f7de1e5037 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -27,7 +27,7 @@ make_conda() { if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="python=$PYTHON_VERSION pip pytest pytest-cov \ numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ - cython=$CYTHON_VERSION" + cython=$CYTHON_VERSION joblib=$JOBLIB_VERSION" if [[ "$INSTALL_MKL" == "true" ]]; then TO_INSTALL="$TO_INSTALL mkl" @@ -47,10 +47,6 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL pillow=$PILLOW_VERSION" fi - if [[ -n "$JOBLIB_VERSION" ]]; then - TO_INSTALL="$TO_INSTALL joblib=$JOBLIB_VERSION" - fi - make_conda $TO_INSTALL elif [[ "$DISTRIB" == "ubuntu" ]]; then diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index e3cd159db7ebc..c38ea5bafdc4a 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -113,7 +113,6 @@ elif [[ "$DISTRIB" == "scipy-dev" ]]; then pip install --pre --upgrade --timeout=60 -f $dev_url numpy scipy pandas cython echo "Installing joblib master" pip install https://github.com/joblib/joblib/archive/master.zip - export SKLEARN_SITE_JOBLIB=1 echo "Installing pillow master" pip install https://github.com/python-pillow/Pillow/archive/master.zip pip install pytest pytest-cov diff --git a/conftest.py b/conftest.py index 
45a5a8af29d20..71f3be3192cba 100644 --- a/conftest.py +++ b/conftest.py @@ -61,3 +61,13 @@ def pytest_collection_modifyitems(config, items): for item in items: if isinstance(item, DoctestItem): item.add_marker(skip_marker) + + +def pytest_configure(config): + import sys + sys._is_pytest_session = True + + +def pytest_unconfigure(config): + import sys + del sys._is_pytest_session diff --git a/doc/modules/computing.rst b/doc/modules/computing.rst index d25a339c5b77a..46676ac8aa5ee 100644 --- a/doc/modules/computing.rst +++ b/doc/modules/computing.rst @@ -553,6 +553,11 @@ These environment variables should be set before importing scikit-learn. is supported. In addition, dumps from joblib.Memory might be incompatible, and you might lose some caches and have to redownload some datasets. + .. deprecated:: 0.21 + + As of version 0.21 this parameter has no effect: the vendored joblib was + removed and the site joblib is always used. + :SKLEARN_ASSUME_FINITE: Sets the default value for the `assume_finite` argument of diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 42b7e2b4d37d3..0a102864e4e1b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -634,6 +634,16 @@ Multiple modules :issue:`13422` by :user:`Madhura Parikh ` and :user:`Clément Doumouro `. + +Dependencies +............ + +- |Enhancement| Joblib is no longer vendored in scikit-learn and is now a + dependency. The minimal supported version is joblib 0.11; however, using + version >= 0.13 is strongly recommended. + :issue:`13531` by :user:`Roman Yurchak `. + + Changes to estimator checks --------------------------- diff --git a/setup.py b/setup.py index 7535eb3550040..daaf5dbd22acd 100755 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ SCIPY_MIN_VERSION = '0.17.0' NUMPY_MIN_VERSION = '1.11.0' +JOBLIB_MIN_VERSION = '0.11' # Optional setuptools features # We need to import setuptools early, if we want setuptools features, @@ -226,7 +227,8 @@ def setup_package(): cmdclass=cmdclass, install_requires=[ 'numpy>={}'.format(NUMPY_MIN_VERSION), - 'scipy>={}'.format(SCIPY_MIN_VERSION) + 'scipy>={}'.format(SCIPY_MIN_VERSION), + 'joblib>={}'.format(JOBLIB_MIN_VERSION) ], **extra_setuptools_args) diff --git a/sklearn/externals/README b/sklearn/externals/README index 38859bb488fc4..eef7ba7dd652e 100644 --- a/sklearn/externals/README +++ b/sklearn/externals/README @@ -1,9 +1,6 @@ This directory contains bundled external dependencies that are updated every once in a while. -Note to developers and advanced users: setting the SKLEARN_SITE_JOBLIB to -a non null value will force scikit-learn to use the site joblib. - Note for distribution packagers: if you want to remove the duplicated code and depend on a packaged version, we suggest that you simply do a symbolic link in this directory. 
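For context, a minimal migration sketch (illustrative only, not part of the patch): with the vendored copy removed and joblib declared as a regular dependency (>= 0.11, see setup.py above), downstream code should import joblib directly instead of going through sklearn.externals.joblib, which only remains as a deprecated alias until 0.23:

    # Before (deprecated in 0.21, scheduled for removal in 0.23):
    # from sklearn.externals.joblib import Parallel, delayed

    # After (joblib installed separately, e.g. pip install "joblib>=0.11"):
    from joblib import Parallel, delayed

    # Same Parallel/delayed API as before, now provided by the site joblib.
    squares = Parallel(n_jobs=2)(delayed(pow)(i, 2) for i in range(5))
    print(squares)  # [0, 1, 4, 9, 16]
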
diff --git a/sklearn/externals/copy_joblib.sh b/sklearn/externals/copy_joblib.sh deleted file mode 100755 index f2c4ab3ed359b..0000000000000 --- a/sklearn/externals/copy_joblib.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/sh -# Script to do a local install of joblib -set +x -export LC_ALL=C -INSTALL_FOLDER=tmp/joblib_install -rm -rf joblib $INSTALL_FOLDER -if [ -z "$1" ] -then - JOBLIB=joblib -else - JOBLIB=$1 -fi - -pip install --no-cache $JOBLIB --target $INSTALL_FOLDER -cp -r $INSTALL_FOLDER/joblib joblib -rm -rf $INSTALL_FOLDER - -# Needed to rewrite the doctests -# Note: BSD sed -i needs an argument unders OSX -# so first renaming to .bak and then deleting backup files -find joblib -name "*.py" | xargs sed -i.bak "s/from joblib/from sklearn.externals.joblib/" -find joblib -name "*.bak" | xargs rm - -# Remove the tests folders to speed-up test time for scikit-learn. -# joblib is already tested on its own CI infrastructure upstream. -rm -r joblib/test diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py index e74f874639bf4..d024ec80c5a2b 100644 --- a/sklearn/externals/joblib/__init__.py +++ b/sklearn/externals/joblib/__init__.py @@ -1,133 +1,15 @@ -"""Joblib is a set of tools to provide **lightweight pipelining in -Python**. In particular: +# Import necessary to preserve backward compatibility of pickles +import sys +import warnings -1. transparent disk-caching of functions and lazy re-evaluation - (memoize pattern) +from joblib import * -2. easy simple parallel computing -Joblib is optimized to be **fast** and **robust** in particular on large -data and has specific optimizations for `numpy` arrays. It is -**BSD-licensed**. +msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be removed " + "in 0.23. Please import this functionality directly from joblib, " + "which can be installed with: pip install joblib. If this warning is " + "raised when loading pickled models, you may need to re-serialize " + "those models with scikit-learn 0.21+.") - - ==================== =============================================== - **Documentation:** https://joblib.readthedocs.io - - **Download:** http://pypi.python.org/pypi/joblib#downloads - - **Source code:** http://github.com/joblib/joblib - - **Report issues:** http://github.com/joblib/joblib/issues - ==================== =============================================== - - -Vision --------- - -The vision is to provide tools to easily achieve better performance and -reproducibility when working with long running jobs. - - * **Avoid computing twice the same thing**: code is rerun over an - over, for instance when prototyping computational-heavy jobs (as in - scientific development), but hand-crafted solution to alleviate this - issue is error-prone and often leads to unreproducible results - - * **Persist to disk transparently**: persisting in an efficient way - arbitrary objects containing large data is hard. Using - joblib's caching mechanism avoids hand-written persistence and - implicitly links the file on disk to the execution context of - the original Python object. As a result, joblib's persistence is - good for resuming an application status or computational job, eg - after a crash. - -Joblib addresses these problems while **leaving your code and your flow -control as unmodified as possible** (no framework, no new paradigms). 
- -Main features ------------------- - -1) **Transparent and fast disk-caching of output value:** a memoize or - make-like functionality for Python functions that works well for - arbitrary Python objects, including very large numpy arrays. Separate - persistence and flow-execution logic from domain logic or algorithmic - code by writing the operations as a set of steps with well-defined - inputs and outputs: Python functions. Joblib can save their - computation to disk and rerun it only if necessary:: - - >>> from sklearn.externals.joblib import Memory - >>> cachedir = 'your_cache_dir_goes_here' - >>> mem = Memory(cachedir) - >>> import numpy as np - >>> a = np.vander(np.arange(3)).astype(np.float) - >>> square = mem.cache(np.square) - >>> b = square(a) # doctest: +ELLIPSIS - ________________________________________________________________________________ - [Memory] Calling square... - square(array([[0., 0., 1.], - [1., 1., 1.], - [4., 2., 1.]])) - ___________________________________________________________square - 0...s, 0.0min - - >>> c = square(a) - >>> # The above call did not trigger an evaluation - -2) **Embarrassingly parallel helper:** to make it easy to write readable - parallel code and debug it quickly:: - - >>> from sklearn.externals.joblib import Parallel, delayed - >>> from math import sqrt - >>> Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10)) - [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] - - -3) **Fast compressed Persistence**: a replacement for pickle to work - efficiently on Python objects containing large data ( - *joblib.dump* & *joblib.load* ). - -.. - >>> import shutil ; shutil.rmtree(cachedir) - -""" - -# PEP0440 compatible formatted version, see: -# https://www.python.org/dev/peps/pep-0440/ -# -# Generic release markers: -# X.Y -# X.Y.Z # For bugfix releases -# -# Admissible pre-release markers: -# X.YaN # Alpha release -# X.YbN # Beta release -# X.YrcN # Release Candidate -# X.Y # Final release -# -# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
-# 'X.Y.dev0' is the canonical version of 'X.Y.dev' -# -__version__ = '0.13.0' - - -from .memory import Memory, MemorizedResult, register_store_backend -from .logger import PrintTime -from .logger import Logger -from .hashing import hash -from .numpy_pickle import dump -from .numpy_pickle import load -from .compressor import register_compressor -from .parallel import Parallel -from .parallel import delayed -from .parallel import cpu_count -from .parallel import register_parallel_backend -from .parallel import parallel_backend -from .parallel import effective_n_jobs - -from .externals.loky import wrap_non_picklable_objects - - -__all__ = ['Memory', 'MemorizedResult', 'PrintTime', 'Logger', 'hash', 'dump', - 'load', 'Parallel', 'delayed', 'cpu_count', 'effective_n_jobs', - 'register_parallel_backend', 'parallel_backend', - 'register_store_backend', 'register_compressor', - 'wrap_non_picklable_objects'] +if not hasattr(sys, "_is_pytest_session"): + warnings.warn(msg, category=DeprecationWarning) diff --git a/sklearn/externals/joblib/_compat.py b/sklearn/externals/joblib/_compat.py deleted file mode 100644 index 0c6e752478f01..0000000000000 --- a/sklearn/externals/joblib/_compat.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Compatibility layer for Python 3/Python 2 single codebase -""" -import sys - -PY3_OR_LATER = sys.version_info[0] >= 3 -PY27 = sys.version_info[:2] == (2, 7) - -try: - _basestring = basestring - _bytes_or_unicode = (str, unicode) -except NameError: - _basestring = str - _bytes_or_unicode = (bytes, str) - - -def with_metaclass(meta, *bases): - """Create a base class with a metaclass.""" - return meta("NewBase", bases, {}) diff --git a/sklearn/externals/joblib/_dask.py b/sklearn/externals/joblib/_dask.py deleted file mode 100644 index 98f8a65db3263..0000000000000 --- a/sklearn/externals/joblib/_dask.py +++ /dev/null @@ -1,259 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import contextlib - -from uuid import uuid4 -import weakref - -from .parallel import AutoBatchingMixin, ParallelBackendBase, BatchedCalls -from .parallel import parallel_backend - -try: - import distributed -except ImportError: - distributed = None - -if distributed is not None: - from distributed.client import Client, _wait - from distributed.utils import funcname, itemgetter - from distributed import get_client, secede, rejoin - from distributed.worker import thread_state - from distributed.sizeof import sizeof - from tornado import gen - - -def is_weakrefable(obj): - try: - weakref.ref(obj) - return True - except TypeError: - return False - - -class _WeakKeyDictionary: - """A variant of weakref.WeakKeyDictionary for unhashable objects. - - This datastructure is used to store futures for broadcasted data objects - such as large numpy arrays or pandas dataframes that are not hashable and - therefore cannot be used as keys of traditional python dicts. - - Futhermore using a dict with id(array) as key is not safe because the - Python is likely to reuse id of recently collected arrays. - """ - - def __init__(self): - self._data = {} - - def __getitem__(self, obj): - ref, val = self._data[id(obj)] - if ref() is not obj: - # In case of a race condition with on_destroy. - raise KeyError(obj) - return val - - def __setitem__(self, obj, value): - key = id(obj) - try: - ref, _ = self._data[key] - if ref() is not obj: - # In case of race condition with on_destroy. 
- raise KeyError(obj) - except KeyError: - # Insert the new entry in the mapping along with a weakref - # callback to automatically delete the entry from the mapping - # as soon as the object used as key is garbage collected. - def on_destroy(_): - del self._data[key] - ref = weakref.ref(obj, on_destroy) - self._data[key] = ref, value - - def __len__(self): - return len(self._data) - - def clear(self): - self._data.clear() - - -def _funcname(x): - try: - if isinstance(x, BatchedCalls): - x = x.items[0][0] - except Exception: - pass - return funcname(x) - - -class Batch(object): - def __init__(self, tasks): - self.tasks = tasks - - def __call__(self, *data): - results = [] - with parallel_backend('dask'): - for func, args, kwargs in self.tasks: - args = [a(data) if isinstance(a, itemgetter) else a - for a in args] - kwargs = {k: v(data) if isinstance(v, itemgetter) else v - for (k, v) in kwargs.items()} - results.append(func(*args, **kwargs)) - return results - - def __reduce__(self): - return Batch, (self.tasks,) - - -class DaskDistributedBackend(ParallelBackendBase, AutoBatchingMixin): - MIN_IDEAL_BATCH_DURATION = 0.2 - MAX_IDEAL_BATCH_DURATION = 1.0 - - def __init__(self, scheduler_host=None, scatter=None, - client=None, loop=None, **submit_kwargs): - if client is None: - if scheduler_host: - client = Client(scheduler_host, loop=loop, - set_as_default=False) - else: - try: - client = get_client() - except ValueError: - msg = ("To use Joblib with Dask first create a Dask Client" - "\n\n" - " from dask.distributed import Client\n" - " client = Client()\n" - "or\n" - " client = Client('scheduler-address:8786')") - raise ValueError(msg) - - self.client = client - - if scatter is not None and not isinstance(scatter, (list, tuple)): - raise TypeError("scatter must be a list/tuple, got " - "`%s`" % type(scatter).__name__) - - if scatter is not None and len(scatter) > 0: - # Keep a reference to the scattered data to keep the ids the same - self._scatter = list(scatter) - scattered = self.client.scatter(scatter, broadcast=True) - self.data_futures = {id(x): f for x, f in zip(scatter, scattered)} - else: - self._scatter = [] - self.data_futures = {} - self.task_futures = set() - self.submit_kwargs = submit_kwargs - - def __reduce__(self): - return (DaskDistributedBackend, ()) - - def get_nested_backend(self): - return DaskDistributedBackend(client=self.client), -1 - - def configure(self, n_jobs=1, parallel=None, **backend_args): - return self.effective_n_jobs(n_jobs) - - def start_call(self): - self.call_data_futures = _WeakKeyDictionary() - - def stop_call(self): - # The explicit call to clear is required to break a cycling reference - # to the futures. - self.call_data_futures.clear() - - def effective_n_jobs(self, n_jobs): - return sum(self.client.ncores().values()) - - def _to_func_args(self, func): - collected_futures = [] - itemgetters = dict() - - # Futures that are dynamically generated during a single call to - # Parallel.__call__. - call_data_futures = getattr(self, 'call_data_futures', None) - - def maybe_to_futures(args): - for arg in args: - arg_id = id(arg) - if arg_id in itemgetters: - yield itemgetters[arg_id] - continue - - f = self.data_futures.get(arg_id, None) - if f is None and call_data_futures is not None: - try: - f = call_data_futures[arg] - except KeyError: - if is_weakrefable(arg) and sizeof(arg) > 1e3: - # Automatically scatter large objects to some of - # the workers to avoid duplicated data transfers. 
- # Rely on automated inter-worker data stealing if - # more workers need to reuse this data - # concurrently. - [f] = self.client.scatter([arg]) - call_data_futures[arg] = f - - if f is not None: - getter = itemgetter(len(collected_futures)) - collected_futures.append(f) - itemgetters[arg_id] = getter - arg = getter - yield arg - - tasks = [] - for f, args, kwargs in func.items: - args = list(maybe_to_futures(args)) - kwargs = dict(zip(kwargs.keys(), - maybe_to_futures(kwargs.values()))) - tasks.append((f, args, kwargs)) - - if not collected_futures: - return func, () - return (Batch(tasks), collected_futures) - - def apply_async(self, func, callback=None): - key = '%s-batch-%s' % (_funcname(func), uuid4().hex) - func, args = self._to_func_args(func) - - future = self.client.submit(func, *args, key=key, **self.submit_kwargs) - self.task_futures.add(future) - - @gen.coroutine - def callback_wrapper(): - result = yield _wait([future]) - self.task_futures.remove(future) - if callback is not None: - callback(result) # gets called in separate thread - - self.client.loop.add_callback(callback_wrapper) - - ref = weakref.ref(future) # avoid reference cycle - - def get(): - return ref().result() - - future.get = get # monkey patch to achieve AsyncResult API - return future - - def abort_everything(self, ensure_ready=True): - """ Tell the client to cancel any task submitted via this instance - - joblib.Parallel will never access those results - """ - self.client.cancel(self.task_futures) - self.task_futures.clear() - - @contextlib.contextmanager - def retrieval_context(self): - """Override ParallelBackendBase.retrieval_context to avoid deadlocks. - - This removes thread from the worker's thread pool (using 'secede'). - Seceding avoids deadlock in nested parallelism settings. - """ - # See 'joblib.Parallel.__call__' and 'joblib.Parallel.retrieve' for how - # this is used. - if hasattr(thread_state, 'execution_state'): - # we are in a worker. Secede to avoid deadlock. - secede() - - yield - - if hasattr(thread_state, 'execution_state'): - rejoin() diff --git a/sklearn/externals/joblib/_memmapping_reducer.py b/sklearn/externals/joblib/_memmapping_reducer.py deleted file mode 100644 index 5ba78195b22cd..0000000000000 --- a/sklearn/externals/joblib/_memmapping_reducer.py +++ /dev/null @@ -1,434 +0,0 @@ -""" -Reducer using memory mapping for numpy arrays -""" -# Author: Thomas Moreau -# Copyright: 2017, Thomas Moreau -# License: BSD 3 clause - -from mmap import mmap -import errno -import os -import stat -import threading -import atexit -import tempfile -import warnings -import weakref -from uuid import uuid4 - -try: - WindowsError -except NameError: - WindowsError = type(None) - -from pickle import whichmodule -try: - # Python 2 compat - from cPickle import loads - from cPickle import dumps -except ImportError: - from pickle import loads - from pickle import dumps - -from pickle import HIGHEST_PROTOCOL, PicklingError - -try: - import numpy as np - from numpy.lib.stride_tricks import as_strided -except ImportError: - np = None - -from .numpy_pickle import load -from .numpy_pickle import dump -from .backports import make_memmap -from .disk import delete_folder - -# Some system have a ramdisk mounted by default, we can use it instead of /tmp -# as the default folder to dump big arrays to share with subprocesses. -SYSTEM_SHARED_MEM_FS = '/dev/shm' - -# Minimal number of bytes available on SYSTEM_SHARED_MEM_FS to consider using -# it as the default folder to dump big arrays to share with subprocesses. 
-SYSTEM_SHARED_MEM_FS_MIN_SIZE = int(2e9) - -# Folder and file permissions to chmod temporary files generated by the -# memmapping pool. Only the owner of the Python process can access the -# temporary files and folder. -FOLDER_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR -FILE_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR - - -class _WeakArrayKeyMap: - """A variant of weakref.WeakKeyDictionary for unhashable numpy arrays. - - This datastructure will be used with numpy arrays as obj keys, therefore we - do not use the __get__ / __set__ methods to avoid any conflict with the - numpy fancy indexing syntax. - """ - - def __init__(self): - self._data = {} - - def get(self, obj): - ref, val = self._data[id(obj)] - if ref() is not obj: - # In case of race condition with on_destroy: could never be - # triggered by the joblib tests with CPython. - raise KeyError(obj) - return val - - def set(self, obj, value): - key = id(obj) - try: - ref, _ = self._data[key] - if ref() is not obj: - # In case of race condition with on_destroy: could never be - # triggered by the joblib tests with CPython. - raise KeyError(obj) - except KeyError: - # Insert the new entry in the mapping along with a weakref - # callback to automatically delete the entry from the mapping - # as soon as the object used as key is garbage collected. - def on_destroy(_): - del self._data[key] - ref = weakref.ref(obj, on_destroy) - self._data[key] = ref, value - - def __getstate__(self): - raise PicklingError("_WeakArrayKeyMap is not pickleable") - - -############################################################################### -# Support for efficient transient pickling of numpy data structures - - -def _get_backing_memmap(a): - """Recursively look up the original np.memmap instance base if any.""" - b = getattr(a, 'base', None) - if b is None: - # TODO: check scipy sparse datastructure if scipy is installed - # a nor its descendants do not have a memmap base - return None - - elif isinstance(b, mmap): - # a is already a real memmap instance. - return a - - else: - # Recursive exploration of the base ancestry - return _get_backing_memmap(b) - - -def _get_temp_dir(pool_folder_name, temp_folder=None): - """Get the full path to a subfolder inside the temporary folder. - - Parameters - ---------- - pool_folder_name : str - Sub-folder name used for the serialization of a pool instance. - - temp_folder: str, optional - Folder to be used by the pool for memmapping large arrays - for sharing memory with worker processes. If None, this will try in - order: - - - a folder pointed by the JOBLIB_TEMP_FOLDER environment - variable, - - /dev/shm if the folder exists and is writable: this is a - RAMdisk filesystem available by default on modern Linux - distributions, - - the default system temporary folder that can be - overridden with TMP, TMPDIR or TEMP environment - variables, typically /tmp under Unix operating systems. - - Returns - ------- - pool_folder : str - full path to the temporary folder - use_shared_mem : bool - whether the temporary folder is written to the system shared memory - folder or some other temporary folder. 
- """ - use_shared_mem = False - if temp_folder is None: - temp_folder = os.environ.get('JOBLIB_TEMP_FOLDER', None) - if temp_folder is None: - if os.path.exists(SYSTEM_SHARED_MEM_FS): - try: - shm_stats = os.statvfs(SYSTEM_SHARED_MEM_FS) - available_nbytes = shm_stats.f_bsize * shm_stats.f_bavail - if available_nbytes > SYSTEM_SHARED_MEM_FS_MIN_SIZE: - # Try to see if we have write access to the shared mem - # folder only if it is reasonably large (that is 2GB or - # more). - temp_folder = SYSTEM_SHARED_MEM_FS - pool_folder = os.path.join(temp_folder, pool_folder_name) - if not os.path.exists(pool_folder): - os.makedirs(pool_folder) - use_shared_mem = True - except (IOError, OSError): - # Missing rights in the /dev/shm partition, fallback to regular - # temp folder. - temp_folder = None - if temp_folder is None: - # Fallback to the default tmp folder, typically /tmp - temp_folder = tempfile.gettempdir() - temp_folder = os.path.abspath(os.path.expanduser(temp_folder)) - pool_folder = os.path.join(temp_folder, pool_folder_name) - return pool_folder, use_shared_mem - - -def has_shareable_memory(a): - """Return True if a is backed by some mmap buffer directly or not.""" - return _get_backing_memmap(a) is not None - - -def _strided_from_memmap(filename, dtype, mode, offset, order, shape, strides, - total_buffer_len): - """Reconstruct an array view on a memory mapped file.""" - if mode == 'w+': - # Do not zero the original data when unpickling - mode = 'r+' - - if strides is None: - # Simple, contiguous memmap - return make_memmap(filename, dtype=dtype, shape=shape, mode=mode, - offset=offset, order=order) - else: - # For non-contiguous data, memmap the total enclosing buffer and then - # extract the non-contiguous view with the stride-tricks API - base = make_memmap(filename, dtype=dtype, shape=total_buffer_len, - mode=mode, offset=offset, order=order) - return as_strided(base, shape=shape, strides=strides) - - -def _reduce_memmap_backed(a, m): - """Pickling reduction for memmap backed arrays. - - a is expected to be an instance of np.ndarray (or np.memmap) - m is expected to be an instance of np.memmap on the top of the ``base`` - attribute ancestry of a. ``m.base`` should be the real python mmap object. - """ - # offset that comes from the striding differences between a and m - a_start, a_end = np.byte_bounds(a) - m_start = np.byte_bounds(m)[0] - offset = a_start - m_start - - # offset from the backing memmap - offset += m.offset - - if m.flags['F_CONTIGUOUS']: - order = 'F' - else: - # The backing memmap buffer is necessarily contiguous hence C if not - # Fortran - order = 'C' - - if a.flags['F_CONTIGUOUS'] or a.flags['C_CONTIGUOUS']: - # If the array is a contiguous view, no need to pass the strides - strides = None - total_buffer_len = None - else: - # Compute the total number of items to map from which the strided - # view will be extracted. 
- strides = a.strides - total_buffer_len = (a_end - a_start) // a.itemsize - return (_strided_from_memmap, - (m.filename, a.dtype, m.mode, offset, order, a.shape, strides, - total_buffer_len)) - - -def reduce_memmap(a): - """Pickle the descriptors of a memmap instance to reopen on same file.""" - m = _get_backing_memmap(a) - if m is not None: - # m is a real mmap backed memmap instance, reduce a preserving striding - # information - return _reduce_memmap_backed(a, m) - else: - # This memmap instance is actually backed by a regular in-memory - # buffer: this can happen when using binary operators on numpy.memmap - # instances - return (loads, (dumps(np.asarray(a), protocol=HIGHEST_PROTOCOL),)) - - -class ArrayMemmapReducer(object): - """Reducer callable to dump large arrays to memmap files. - - Parameters - ---------- - max_nbytes: int - Threshold to trigger memmapping of large arrays to files created - a folder. - temp_folder: str - Path of a folder where files for backing memmapped arrays are created. - mmap_mode: 'r', 'r+' or 'c' - Mode for the created memmap datastructure. See the documentation of - numpy.memmap for more details. Note: 'w+' is coerced to 'r+' - automatically to avoid zeroing the data on unpickling. - verbose: int, optional, 0 by default - If verbose > 0, memmap creations are logged. - If verbose > 1, both memmap creations, reuse and array pickling are - logged. - prewarm: bool, optional, False by default. - Force a read on newly memmapped array to make sure that OS pre-cache it - memory. This can be useful to avoid concurrent disk access when the - same data array is passed to different worker processes. - """ - - def __init__(self, max_nbytes, temp_folder, mmap_mode, verbose=0, - prewarm=True): - self._max_nbytes = max_nbytes - self._temp_folder = temp_folder - self._mmap_mode = mmap_mode - self.verbose = int(verbose) - self._prewarm = prewarm - self._memmaped_arrays = _WeakArrayKeyMap() - - def __reduce__(self): - # The ArrayMemmapReducer is passed to the children processes: it needs - # to be pickled but the _WeakArrayKeyMap need to be skipped as it's - # only guaranteed to be consistent with the parent process memory - # garbage collection. - args = (self._max_nbytes, self._temp_folder, self._mmap_mode) - kwargs = { - 'verbose': self.verbose, - 'prewarm': self._prewarm, - } - return ArrayMemmapReducer, args, kwargs - - def __call__(self, a): - m = _get_backing_memmap(a) - if m is not None: - # a is already backed by a memmap file, let's reuse it directly - return _reduce_memmap_backed(a, m) - - if (not a.dtype.hasobject and self._max_nbytes is not None and - a.nbytes > self._max_nbytes): - # check that the folder exists (lazily create the pool temp folder - # if required) - try: - os.makedirs(self._temp_folder) - os.chmod(self._temp_folder, FOLDER_PERMISSIONS) - except OSError as e: - if e.errno != errno.EEXIST: - raise e - - try: - basename = self._memmaped_arrays.get(a) - except KeyError: - # Generate a new unique random filename. The process and thread - # ids are only useful for debugging purpose and to make it - # easier to cleanup orphaned files in case of hard process - # kill (e.g. by "kill -9" or segfault). 
- basename = "{}-{}-{}.pkl".format( - os.getpid(), id(threading.current_thread()), uuid4().hex) - self._memmaped_arrays.set(a, basename) - filename = os.path.join(self._temp_folder, basename) - - # In case the same array with the same content is passed several - # times to the pool subprocess children, serialize it only once - - # XXX: implement an explicit reference counting scheme to make it - # possible to delete temporary files as soon as the workers are - # done processing this data. - if not os.path.exists(filename): - if self.verbose > 0: - print("Memmapping (shape={}, dtype={}) to new file {}" - .format(a.shape, a.dtype, filename)) - for dumped_filename in dump(a, filename): - os.chmod(dumped_filename, FILE_PERMISSIONS) - - if self._prewarm: - # Warm up the data by accessing it. This operation ensures - # that the disk access required to create the memmapping - # file are performed in the reducing process and avoids - # concurrent memmap creation in multiple children - # processes. - load(filename, mmap_mode=self._mmap_mode).max() - elif self.verbose > 1: - print("Memmapping (shape={}, dtype={}) to old file {}" - .format(a.shape, a.dtype, filename)) - - # The worker process will use joblib.load to memmap the data - return (load, (filename, self._mmap_mode)) - else: - # do not convert a into memmap, let pickler do its usual copy with - # the default system pickler - if self.verbose > 1: - print("Pickling array (shape={}, dtype={})." - .format(a.shape, a.dtype)) - return (loads, (dumps(a, protocol=HIGHEST_PROTOCOL),)) - - -def get_memmapping_reducers( - pool_id, forward_reducers=None, backward_reducers=None, - temp_folder=None, max_nbytes=1e6, mmap_mode='r', verbose=0, - prewarm=False, **kwargs): - """Construct a pair of memmapping reducer linked to a tmpdir. - - This function manage the creation and the clean up of the temporary folders - underlying the memory maps and should be use to get the reducers necessary - to construct joblib pool or executor. - """ - if forward_reducers is None: - forward_reducers = dict() - if backward_reducers is None: - backward_reducers = dict() - - # Prepare a sub-folder name for the serialization of this particular - # pool instance (do not create in advance to spare FS write access if - # no array is to be dumped): - pool_folder_name = "joblib_memmapping_folder_{}_{}".format( - os.getpid(), pool_id) - pool_folder, use_shared_mem = _get_temp_dir(pool_folder_name, - temp_folder) - - # Register the garbage collector at program exit in case caller forgets - # to call terminate explicitly: note we do not pass any reference to - # self to ensure that this callback won't prevent garbage collection of - # the pool instance and related file handler resources such as POSIX - # semaphores and pipes - pool_module_name = whichmodule(delete_folder, 'delete_folder') - - def _cleanup(): - # In some cases the Python runtime seems to set delete_folder to - # None just before exiting when accessing the delete_folder - # function from the closure namespace. So instead we reimport - # the delete_folder function explicitly. - # https://github.com/joblib/joblib/issues/328 - # We cannot just use from 'joblib.pool import delete_folder' - # because joblib should only use relative imports to allow - # easy vendoring. 
- delete_folder = __import__( - pool_module_name, fromlist=['delete_folder']).delete_folder - try: - delete_folder(pool_folder) - except WindowsError: - warnings.warn("Failed to clean temporary folder: {}" - .format(pool_folder)) - - atexit.register(_cleanup) - - if np is not None: - # Register smart numpy.ndarray reducers that detects memmap backed - # arrays and that is also able to dump to memmap large in-memory - # arrays over the max_nbytes threshold - if prewarm == "auto": - prewarm = not use_shared_mem - forward_reduce_ndarray = ArrayMemmapReducer( - max_nbytes, pool_folder, mmap_mode, verbose, - prewarm=prewarm) - forward_reducers[np.ndarray] = forward_reduce_ndarray - forward_reducers[np.memmap] = reduce_memmap - - # Communication from child process to the parent process always - # pickles in-memory numpy.ndarray without dumping them as memmap - # to avoid confusing the caller and make it tricky to collect the - # temporary folder - backward_reduce_ndarray = ArrayMemmapReducer( - None, pool_folder, mmap_mode, verbose) - backward_reducers[np.ndarray] = backward_reduce_ndarray - backward_reducers[np.memmap] = reduce_memmap - - return forward_reducers, backward_reducers, pool_folder diff --git a/sklearn/externals/joblib/_memory_helpers.py b/sklearn/externals/joblib/_memory_helpers.py deleted file mode 100644 index 56a8b93f012d7..0000000000000 --- a/sklearn/externals/joblib/_memory_helpers.py +++ /dev/null @@ -1,105 +0,0 @@ -try: - # Available in Python 3 - from tokenize import open as open_py_source - -except ImportError: - # Copied from python3 tokenize - from codecs import lookup, BOM_UTF8 - import re - from io import TextIOWrapper, open - cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)") - - def _get_normal_name(orig_enc): - """Imitates get_normal_name in tokenizer.c.""" - # Only care about the first 12 characters. - enc = orig_enc[:12].lower().replace("_", "-") - if enc == "utf-8" or enc.startswith("utf-8-"): - return "utf-8" - if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ - enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): - return "iso-8859-1" - return orig_enc - - def _detect_encoding(readline): - """ - The detect_encoding() function is used to detect the encoding that - should be used to decode a Python source file. It requires one - argment, readline, in the same way as the tokenize() generator. - - It will call readline a maximum of twice, and return the encoding used - (as a string) and a list of any lines (left as bytes) it has read in. - - It detects the encoding from the presence of a utf-8 bom or an encoding - cookie as specified in pep-0263. If both a bom and a cookie are - present, but disagree, a SyntaxError will be raised. If the encoding - cookie is an invalid charset, raise a SyntaxError. Note that if a - utf-8 bom is found, 'utf-8-sig' is returned. - - If no encoding is specified, then the default of 'utf-8' will be - returned. 
- """ - bom_found = False - encoding = None - default = 'utf-8' - - def read_or_stop(): - try: - return readline() - except StopIteration: - return b'' - - def find_cookie(line): - try: - line_string = line.decode('ascii') - except UnicodeDecodeError: - return None - - matches = cookie_re.findall(line_string) - if not matches: - return None - encoding = _get_normal_name(matches[0]) - try: - codec = lookup(encoding) - except LookupError: - # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) - - if bom_found: - if codec.name != 'utf-8': - # This behaviour mimics the Python interpreter - raise SyntaxError('encoding problem: utf-8') - encoding += '-sig' - return encoding - - first = read_or_stop() - if first.startswith(BOM_UTF8): - bom_found = True - first = first[3:] - default = 'utf-8-sig' - if not first: - return default, [] - - encoding = find_cookie(first) - if encoding: - return encoding, [first] - - second = read_or_stop() - if not second: - return default, [first] - - encoding = find_cookie(second) - if encoding: - return encoding, [first, second] - - return default, [first, second] - - def open_py_source(filename): - """Open a file in read only mode using the encoding detected by - detect_encoding(). - """ - buffer = open(filename, 'rb') - encoding, lines = _detect_encoding(buffer.readline) - buffer.seek(0) - text = TextIOWrapper(buffer, encoding, line_buffering=True) - text.mode = 'r' - return text diff --git a/sklearn/externals/joblib/_multiprocessing_helpers.py b/sklearn/externals/joblib/_multiprocessing_helpers.py deleted file mode 100644 index be642b869febe..0000000000000 --- a/sklearn/externals/joblib/_multiprocessing_helpers.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Helper module to factorize the conditional multiprocessing import logic - -We use a distinct module to simplify import statements and avoid introducing -circular dependencies (for instance for the assert_spawning name). -""" -import os -import sys -import warnings - - -# Obtain possible configuration from the environment, assuming 1 (on) -# by default, upon 0 set to None. Should instructively fail if some non -# 0/1 value is set. -mp = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None -if mp: - try: - import multiprocessing as mp - except ImportError: - mp = None - -# 2nd stage: validate that locking is available on the system and -# issue a warning if not -if mp is not None: - try: - # Use the spawn context - if sys.version_info < (3, 3): - Semaphore = mp.Semaphore - else: - # Using mp.Semaphore has a border effect and set the default - # backend for multiprocessing. To avoid that, we use the 'spawn' - # context which is available on all supported platforms. - ctx = mp.get_context('spawn') - Semaphore = ctx.Semaphore - _sem = Semaphore() - del _sem # cleanup - except (ImportError, OSError) as e: - mp = None - warnings.warn('%s. joblib will operate in serial mode' % (e,)) - - -# 3rd stage: backward compat for the assert_spawning helper -if mp is not None: - try: - # Python 3.4+ - from multiprocessing.context import assert_spawning - except ImportError: - from multiprocessing.forking import assert_spawning -else: - assert_spawning = None diff --git a/sklearn/externals/joblib/_parallel_backends.py b/sklearn/externals/joblib/_parallel_backends.py deleted file mode 100644 index 0f0bcf0ab4213..0000000000000 --- a/sklearn/externals/joblib/_parallel_backends.py +++ /dev/null @@ -1,590 +0,0 @@ -""" -Backends for embarrassingly parallel code. 
-""" - -import gc -import os -import sys -import warnings -import threading -import functools -import contextlib -from abc import ABCMeta, abstractmethod - -from .format_stack import format_exc -from .my_exceptions import WorkerInterrupt, TransportableException -from ._multiprocessing_helpers import mp -from ._compat import with_metaclass, PY27 -if mp is not None: - from .disk import delete_folder - from .pool import MemmappingPool - from multiprocessing.pool import ThreadPool - from .executor import get_memmapping_executor - - # Compat between concurrent.futures and multiprocessing TimeoutError - from multiprocessing import TimeoutError - from .externals.loky._base import TimeoutError as LokyTimeoutError - from .externals.loky import process_executor, cpu_count - - -class ParallelBackendBase(with_metaclass(ABCMeta)): - """Helper abc which defines all methods a ParallelBackend must implement""" - - supports_timeout = False - nesting_level = 0 - - def __init__(self, nesting_level=0): - self.nesting_level = nesting_level - - SUPPORTED_CLIB_VARS = [ - 'OMP_NUM_THREADS', 'OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS', - 'VECLIB_MAXIMUM_THREADS', 'NUMEXPR_NUM_THREADS' - ] - - @abstractmethod - def effective_n_jobs(self, n_jobs): - """Determine the number of jobs that can actually run in parallel - - n_jobs is the number of workers requested by the callers. Passing - n_jobs=-1 means requesting all available workers for instance matching - the number of CPU cores on the worker host(s). - - This method should return a guesstimate of the number of workers that - can actually perform work concurrently. The primary use case is to make - it possible for the caller to know in how many chunks to slice the - work. - - In general working on larger data chunks is more efficient (less - scheduling overhead and better use of CPU cache prefetching heuristics) - as long as all the workers have enough work to do. - """ - - @abstractmethod - def apply_async(self, func, callback=None): - """Schedule a func to be run""" - - def configure(self, n_jobs=1, parallel=None, prefer=None, require=None, - **backend_args): - """Reconfigure the backend and return the number of workers. - - This makes it possible to reuse an existing backend instance for - successive independent calls to Parallel with different parameters. - """ - self.parallel = parallel - return self.effective_n_jobs(n_jobs) - - def start_call(self): - """Call-back method called at the beginning of a Parallel call""" - - def stop_call(self): - """Call-back method called at the end of a Parallel call""" - - def terminate(self): - """Shutdown the workers and free the shared memory.""" - - def compute_batch_size(self): - """Determine the optimal batch size""" - return 1 - - def batch_completed(self, batch_size, duration): - """Callback indicate how long it took to run a batch""" - - def get_exceptions(self): - """List of exception types to be captured.""" - return [] - - def abort_everything(self, ensure_ready=True): - """Abort any running tasks - - This is called when an exception has been raised when executing a tasks - and all the remaining tasks will be ignored and can therefore be - aborted to spare computation resources. - - If ensure_ready is True, the backend should be left in an operating - state as future tasks might be re-submitted via that same backend - instance. - - If ensure_ready is False, the implementer of this method can decide - to leave the backend in a closed / terminated state as no new task - are expected to be submitted to this backend. 
- - Setting ensure_ready to False is an optimization that can be leveraged - when aborting tasks via killing processes from a local process pool - managed by the backend it-self: if we expect no new tasks, there is no - point in re-creating new workers. - """ - # Does nothing by default: to be overridden in subclasses when - # canceling tasks is possible. - pass - - def get_nested_backend(self): - """Backend instance to be used by nested Parallel calls. - - By default a thread-based backend is used for the first level of - nesting. Beyond, switch to sequential backend to avoid spawning too - many threads on the host. - """ - nesting_level = getattr(self, 'nesting_level', 0) + 1 - if nesting_level > 1: - return SequentialBackend(nesting_level=nesting_level), None - else: - return ThreadingBackend(nesting_level=nesting_level), None - - @contextlib.contextmanager - def retrieval_context(self): - """Context manager to manage an execution context. - - Calls to Parallel.retrieve will be made inside this context. - - By default, this does nothing. It may be useful for subclasses to - handle nested parallelism. In particular, it may be required to avoid - deadlocks if a backend manages a fixed number of workers, when those - workers may be asked to do nested Parallel calls. Without - 'retrieval_context' this could lead to deadlock, as all the workers - managed by the backend may be "busy" waiting for the nested parallel - calls to finish, but the backend has no free workers to execute those - tasks. - """ - yield - - @classmethod - def limit_clib_threads(cls, n_threads=1): - """Initializer to limit the number of threads used by some C-libraries. - - This function set the number of threads to `n_threads` for OpenMP, MKL, - Accelerated and OpenBLAS libraries, that can be used with scientific - computing tools like numpy. - """ - for var in cls.SUPPORTED_CLIB_VARS: - var_value = os.environ.get(var, None) - if var_value is None: - os.environ[var] = str(n_threads) - - -class SequentialBackend(ParallelBackendBase): - """A ParallelBackend which will execute all batches sequentially. - - Does not use/create any threading objects, and hence has minimal - overhead. Used when n_jobs == 1. - """ - - uses_threads = True - supports_sharedmem = True - - def effective_n_jobs(self, n_jobs): - """Determine the number of jobs which are going to run in parallel""" - if n_jobs == 0: - raise ValueError('n_jobs == 0 in Parallel has no meaning') - return 1 - - def apply_async(self, func, callback=None): - """Schedule a func to be run""" - result = ImmediateResult(func) - if callback: - callback(result) - return result - - def get_nested_backend(self): - # import is not top level to avoid cyclic import errors. - from .parallel import get_active_backend - - # SequentialBackend should neither change the nesting level, the - # default backend or the number of jobs. Just return the current one. 
- return get_active_backend() - - -class PoolManagerMixin(object): - """A helper class for managing pool of workers.""" - - _pool = None - - def effective_n_jobs(self, n_jobs): - """Determine the number of jobs which are going to run in parallel""" - if n_jobs == 0: - raise ValueError('n_jobs == 0 in Parallel has no meaning') - elif mp is None or n_jobs is None: - # multiprocessing is not available or disabled, fallback - # to sequential mode - return 1 - elif n_jobs < 0: - n_jobs = max(cpu_count() + 1 + n_jobs, 1) - return n_jobs - - def terminate(self): - """Shutdown the process or thread pool""" - if self._pool is not None: - self._pool.close() - self._pool.terminate() # terminate does a join() - self._pool = None - - def _get_pool(self): - """Used by apply_async to make it possible to implement lazy init""" - return self._pool - - def apply_async(self, func, callback=None): - """Schedule a func to be run""" - return self._get_pool().apply_async( - SafeFunction(func), callback=callback) - - def abort_everything(self, ensure_ready=True): - """Shutdown the pool and restart a new one with the same parameters""" - self.terminate() - if ensure_ready: - self.configure(n_jobs=self.parallel.n_jobs, parallel=self.parallel, - **self.parallel._backend_args) - - -class AutoBatchingMixin(object): - """A helper class for automagically batching jobs.""" - - # In seconds, should be big enough to hide multiprocessing dispatching - # overhead. - # This settings was found by running benchmarks/bench_auto_batching.py - # with various parameters on various platforms. - MIN_IDEAL_BATCH_DURATION = .2 - - # Should not be too high to avoid stragglers: long jobs running alone - # on a single worker while other workers have no work to process any more. - MAX_IDEAL_BATCH_DURATION = 2 - - # Batching counters default values - _DEFAULT_EFFECTIVE_BATCH_SIZE = 1 - _DEFAULT_SMOOTHED_BATCH_DURATION = 0.0 - - def __init__(self): - self._effective_batch_size = self._DEFAULT_EFFECTIVE_BATCH_SIZE - self._smoothed_batch_duration = self._DEFAULT_SMOOTHED_BATCH_DURATION - - def compute_batch_size(self): - """Determine the optimal batch size""" - old_batch_size = self._effective_batch_size - batch_duration = self._smoothed_batch_duration - if (batch_duration > 0 and - batch_duration < self.MIN_IDEAL_BATCH_DURATION): - # The current batch size is too small: the duration of the - # processing of a batch of task is not large enough to hide - # the scheduling overhead. - ideal_batch_size = int(old_batch_size * - self.MIN_IDEAL_BATCH_DURATION / - batch_duration) - # Multiply by two to limit oscilations between min and max. - batch_size = max(2 * ideal_batch_size, 1) - self._effective_batch_size = batch_size - if self.parallel.verbose >= 10: - self.parallel._print( - "Batch computation too fast (%.4fs.) " - "Setting batch_size=%d.", (batch_duration, batch_size)) - elif (batch_duration > self.MAX_IDEAL_BATCH_DURATION and - old_batch_size >= 2): - # The current batch size is too big. If we schedule overly long - # running batches some CPUs might wait with nothing left to do - # while a couple of CPUs a left processing a few long running - # batches. Better reduce the batch size a bit to limit the - # likelihood of scheduling such stragglers. - batch_size = old_batch_size // 2 - self._effective_batch_size = batch_size - if self.parallel.verbose >= 10: - self.parallel._print( - "Batch computation too slow (%.4fs.) 
" - "Setting batch_size=%d.", (batch_duration, batch_size)) - else: - # No batch size adjustment - batch_size = old_batch_size - - if batch_size != old_batch_size: - # Reset estimation of the smoothed mean batch duration: this - # estimate is updated in the multiprocessing apply_async - # CallBack as long as the batch_size is constant. Therefore - # we need to reset the estimate whenever we re-tune the batch - # size. - self._smoothed_batch_duration = \ - self._DEFAULT_SMOOTHED_BATCH_DURATION - - return batch_size - - def batch_completed(self, batch_size, duration): - """Callback indicate how long it took to run a batch""" - if batch_size == self._effective_batch_size: - # Update the smoothed streaming estimate of the duration of a batch - # from dispatch to completion - old_duration = self._smoothed_batch_duration - if old_duration == self._DEFAULT_SMOOTHED_BATCH_DURATION: - # First record of duration for this batch size after the last - # reset. - new_duration = duration - else: - # Update the exponentially weighted average of the duration of - # batch for the current effective size. - new_duration = 0.8 * old_duration + 0.2 * duration - self._smoothed_batch_duration = new_duration - - def reset_batch_stats(self): - """Reset batch statistics to default values. - - This avoids interferences with future jobs. - """ - self._effective_batch_size = self._DEFAULT_EFFECTIVE_BATCH_SIZE - self._smoothed_batch_duration = self._DEFAULT_SMOOTHED_BATCH_DURATION - - -class ThreadingBackend(PoolManagerMixin, ParallelBackendBase): - """A ParallelBackend which will use a thread pool to execute batches in. - - This is a low-overhead backend but it suffers from the Python Global - Interpreter Lock if the called function relies a lot on Python objects. - Mostly useful when the execution bottleneck is a compiled extension that - explicitly releases the GIL (for instance a Cython loop wrapped in a "with - nogil" block or an expensive call to a library such as NumPy). - - The actual thread pool is lazily initialized: the actual thread pool - construction is delayed to the first call to apply_async. - - ThreadingBackend is used as the default backend for nested calls. - """ - - supports_timeout = True - uses_threads = True - supports_sharedmem = True - - def configure(self, n_jobs=1, parallel=None, **backend_args): - """Build a process or thread pool and return the number of workers""" - n_jobs = self.effective_n_jobs(n_jobs) - if n_jobs == 1: - # Avoid unnecessary overhead and use sequential backend instead. - raise FallbackToBackend( - SequentialBackend(nesting_level=self.nesting_level)) - self.parallel = parallel - self._n_jobs = n_jobs - return n_jobs - - def _get_pool(self): - """Lazily initialize the thread pool - - The actual pool of worker threads is only initialized at the first - call to apply_async. - """ - if self._pool is None: - self._pool = ThreadPool(self._n_jobs) - return self._pool - - -class MultiprocessingBackend(PoolManagerMixin, AutoBatchingMixin, - ParallelBackendBase): - """A ParallelBackend which will use a multiprocessing.Pool. - - Will introduce some communication and memory overhead when exchanging - input and output data with the with the worker Python processes. - However, does not suffer from the Python Global Interpreter Lock. 
- """ - - # Environment variables to protect against bad situations when nesting - JOBLIB_SPAWNED_PROCESS = "__JOBLIB_SPAWNED_PARALLEL__" - - supports_timeout = True - - def effective_n_jobs(self, n_jobs): - """Determine the number of jobs which are going to run in parallel. - - This also checks if we are attempting to create a nested parallel - loop. - """ - if mp is None: - return 1 - - if mp.current_process().daemon: - # Daemonic processes cannot have children - if n_jobs != 1: - warnings.warn( - 'Multiprocessing-backed parallel loops cannot be nested,' - ' setting n_jobs=1', - stacklevel=3) - return 1 - - if process_executor._CURRENT_DEPTH > 0: - # Mixing loky and multiprocessing in nested loop is not supported - if n_jobs != 1: - warnings.warn( - 'Multiprocessing-backed parallel loops cannot be nested,' - ' below loky, setting n_jobs=1', - stacklevel=3) - return 1 - - if not isinstance(threading.current_thread(), threading._MainThread): - # Prevent posix fork inside in non-main posix threads - if n_jobs != 1: - warnings.warn( - 'Multiprocessing-backed parallel loops cannot be nested' - ' below threads, setting n_jobs=1', - stacklevel=3) - return 1 - - return super(MultiprocessingBackend, self).effective_n_jobs(n_jobs) - - def configure(self, n_jobs=1, parallel=None, prefer=None, require=None, - **memmappingpool_args): - """Build a process or thread pool and return the number of workers""" - n_jobs = self.effective_n_jobs(n_jobs) - if n_jobs == 1: - raise FallbackToBackend( - SequentialBackend(nesting_level=self.nesting_level)) - - already_forked = int(os.environ.get(self.JOBLIB_SPAWNED_PROCESS, 0)) - if already_forked: - raise ImportError( - '[joblib] Attempting to do parallel computing ' - 'without protecting your import on a system that does ' - 'not support forking. To use parallel-computing in a ' - 'script, you must protect your main loop using "if ' - "__name__ == '__main__'" - '". 
Please see the joblib documentation on Parallel ' - 'for more information') - # Set an environment variable to avoid infinite loops - os.environ[self.JOBLIB_SPAWNED_PROCESS] = '1' - - # Make sure to free as much memory as possible before forking - gc.collect() - self._pool = MemmappingPool( - n_jobs, initializer=self.limit_clib_threads, **memmappingpool_args) - self.parallel = parallel - return n_jobs - - def terminate(self): - """Shutdown the process or thread pool""" - super(MultiprocessingBackend, self).terminate() - if self.JOBLIB_SPAWNED_PROCESS in os.environ: - del os.environ[self.JOBLIB_SPAWNED_PROCESS] - - self.reset_batch_stats() - - -class LokyBackend(AutoBatchingMixin, ParallelBackendBase): - """Managing pool of workers with loky instead of multiprocessing.""" - - supports_timeout = True - - def configure(self, n_jobs=1, parallel=None, prefer=None, require=None, - idle_worker_timeout=300, **memmappingexecutor_args): - """Build a process executor and return the number of workers""" - n_jobs = self.effective_n_jobs(n_jobs) - if n_jobs == 1: - raise FallbackToBackend( - SequentialBackend(nesting_level=self.nesting_level)) - - self._workers = get_memmapping_executor( - n_jobs, timeout=idle_worker_timeout, - initializer=self.limit_clib_threads, - **memmappingexecutor_args) - self.parallel = parallel - return n_jobs - - def effective_n_jobs(self, n_jobs): - """Determine the number of jobs which are going to run in parallel""" - if n_jobs == 0: - raise ValueError('n_jobs == 0 in Parallel has no meaning') - elif mp is None or n_jobs is None: - # multiprocessing is not available or disabled, fallback - # to sequential mode - return 1 - elif mp.current_process().daemon: - # Daemonic processes cannot have children - if n_jobs != 1: - warnings.warn( - 'Loky-backed parallel loops cannot be called in a' - ' multiprocessing, setting n_jobs=1', - stacklevel=3) - return 1 - elif not isinstance(threading.current_thread(), threading._MainThread): - # Prevent posix fork inside in non-main posix threads - if n_jobs != 1: - warnings.warn( - 'Loky-backed parallel loops cannot be nested below ' - 'threads, setting n_jobs=1', - stacklevel=3) - return 1 - elif n_jobs < 0: - n_jobs = max(cpu_count() + 1 + n_jobs, 1) - return n_jobs - - def apply_async(self, func, callback=None): - """Schedule a func to be run""" - future = self._workers.submit(SafeFunction(func)) - future.get = functools.partial(self.wrap_future_result, future) - if callback is not None: - future.add_done_callback(callback) - return future - - @staticmethod - def wrap_future_result(future, timeout=None): - """Wrapper for Future.result to implement the same behaviour as - AsyncResults.get from multiprocessing.""" - try: - return future.result(timeout=timeout) - except LokyTimeoutError: - raise TimeoutError() - - def terminate(self): - if self._workers is not None: - # Terminate does not shutdown the workers as we want to reuse them - # in latter calls but we free as much memory as we can by deleting - # the shared memory - delete_folder(self._workers._temp_folder) - self._workers = None - - self.reset_batch_stats() - - def abort_everything(self, ensure_ready=True): - """Shutdown the workers and restart a new one with the same parameters - """ - self._workers.shutdown(kill_workers=True) - delete_folder(self._workers._temp_folder) - self._workers = None - if ensure_ready: - self.configure(n_jobs=self.parallel.n_jobs, parallel=self.parallel) - - -class ImmediateResult(object): - def __init__(self, batch): - # Don't delay the application, to 
avoid keeping the input - # arguments in memory - self.results = batch() - - def get(self): - return self.results - - -class SafeFunction(object): - """Wrapper that handles the serialization of exception tracebacks. - - If an exception is triggered when calling the inner function, a copy of - the full traceback is captured to make it possible to serialize - it so that it can be rendered in a different Python process. - """ - def __init__(self, func): - self.func = func - - def __call__(self, *args, **kwargs): - try: - return self.func(*args, **kwargs) - except KeyboardInterrupt: - # We capture the KeyboardInterrupt and reraise it as - # something different, as multiprocessing does not - # interrupt processing for a KeyboardInterrupt - raise WorkerInterrupt() - except BaseException: - if PY27: - # Capture the traceback of the worker to make it part of - # the final exception message. - e_type, e_value, e_tb = sys.exc_info() - text = format_exc(e_type, e_value, e_tb, context=10, - tb_offset=1) - raise TransportableException(text, e_type) - else: - # Rely on Python 3 built-in Remote Traceback reporting - raise - - -class FallbackToBackend(Exception): - """Raised when configuration should fallback to another backend""" - - def __init__(self, backend): - self.backend = backend diff --git a/sklearn/externals/joblib/_store_backends.py b/sklearn/externals/joblib/_store_backends.py deleted file mode 100644 index 9196f0a7746a1..0000000000000 --- a/sklearn/externals/joblib/_store_backends.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Storage providers backends for Memory caching.""" - -import re -import os -import os.path -import datetime -import json -import shutil -import warnings -import collections -import operator -import threading -from abc import ABCMeta, abstractmethod - -from ._compat import with_metaclass, _basestring -from .backports import concurrency_safe_rename -from .disk import mkdirp, memstr_to_bytes, rm_subdirs -from . import numpy_pickle - -CacheItemInfo = collections.namedtuple('CacheItemInfo', - 'path size last_access') - - -def concurrency_safe_write(object_to_write, filename, write_func): - """Writes an object into a unique file in a concurrency-safe way.""" - thread_id = id(threading.current_thread()) - temporary_filename = '{}.thread-{}-pid-{}'.format( - filename, thread_id, os.getpid()) - write_func(object_to_write, temporary_filename) - - return temporary_filename - - -class StoreBackendBase(with_metaclass(ABCMeta)): - """Helper Abstract Base Class which defines all methods that - a StorageBackend must implement.""" - - location = None - - @abstractmethod - def _open_item(self, f, mode): - """Opens an item on the store and return a file-like object. - - This method is private and only used by the StoreBackendMixin object. - - Parameters - ---------- - f: a file-like object - The file-like object where an item is stored and retrieved - mode: string, optional - the mode in which the file-like object is opened allowed valued are - 'rb', 'wb' - - Returns - ------- - a file-like object - """ - - @abstractmethod - def _item_exists(self, location): - """Checks if an item location exists in the store. - - This method is private and only used by the StoreBackendMixin object. - - Parameters - ---------- - location: string - The location of an item. On a filesystem, this corresponds to the - absolute path, including the filename, of a file. 
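For orientation, the multiprocessing and loky backends deleted above are the process-based executors that joblib.Parallel dispatches to; with the vendored copy gone, the same behaviour now comes from site joblib. A minimal sketch of that public API, assuming joblib >= 0.12 so that the 'loky' backend is available:

    from math import sqrt
    from joblib import Parallel, delayed, parallel_backend

    # n_jobs=-1 expands to the CPU count, mirroring effective_n_jobs above;
    # n_jobs=1 makes the backend fall back to sequential execution.
    with parallel_backend('loky', n_jobs=2):
        results = Parallel(verbose=0)(delayed(sqrt)(i ** 2) for i in range(10))

    print(results)  # [0.0, 1.0, 2.0, ..., 9.0]
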
- - Returns - ------- - True if the item exists, False otherwise - """ - - @abstractmethod - def _move_item(self, src, dst): - """Moves an item from src to dst in the store. - - This method is private and only used by the StoreBackendMixin object. - - Parameters - ---------- - src: string - The source location of an item - dst: string - The destination location of an item - """ - - @abstractmethod - def create_location(self, location): - """Creates a location on the store. - - Parameters - ---------- - location: string - The location in the store. On a filesystem, this corresponds to a - directory. - """ - - @abstractmethod - def clear_location(self, location): - """Clears a location on the store. - - Parameters - ---------- - location: string - The location in the store. On a filesystem, this corresponds to a - directory or a filename absolute path - """ - - @abstractmethod - def get_items(self): - """Returns the whole list of items available in the store. - - Returns - ------- - The list of items identified by their ids (e.g filename in a - filesystem). - """ - - @abstractmethod - def configure(self, location, verbose=0, backend_options=dict()): - """Configures the store. - - Parameters - ---------- - location: string - The base location used by the store. On a filesystem, this - corresponds to a directory. - verbose: int - The level of verbosity of the store - backend_options: dict - Contains a dictionnary of named paremeters used to configure the - store backend. - """ - - -class StoreBackendMixin(object): - """Class providing all logic for managing the store in a generic way. - - The StoreBackend subclass has to implement 3 methods: create_location, - clear_location and configure. The StoreBackend also has to provide - a private _open_item, _item_exists and _move_item methods. The _open_item - method has to have the same signature as the builtin open and return a - file-like object. 
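The store-backend interface above, together with the StoreBackendMixin methods that follow, is the machinery behind joblib.Memory's on-disk cache. A minimal sketch of the public caching API scikit-learn now gets from site joblib (assuming joblib >= 0.12, where the first argument is named location):

    import tempfile
    from joblib import Memory

    cachedir = tempfile.mkdtemp()
    memory = Memory(location=cachedir, verbose=0)

    @memory.cache
    def expensive(x):
        return x ** 2

    expensive(3)             # computed, then written through a filesystem store backend
    expensive(3)             # returned from the on-disk cache without recomputation
    memory.clear(warn=False)
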
- """ - - def load_item(self, path, verbose=1, msg=None): - """Load an item from the store given its path as a list of - strings.""" - full_path = os.path.join(self.location, *path) - - if verbose > 1: - if verbose < 10: - print('{0}...'.format(msg)) - else: - print('{0} from {1}'.format(msg, full_path)) - - mmap_mode = (None if not hasattr(self, 'mmap_mode') - else self.mmap_mode) - - filename = os.path.join(full_path, 'output.pkl') - if not self._item_exists(filename): - raise KeyError("Non-existing item (may have been " - "cleared).\nFile %s does not exist" % filename) - - # file-like object cannot be used when mmap_mode is set - if mmap_mode is None: - with self._open_item(filename, "rb") as f: - item = numpy_pickle.load(f) - else: - item = numpy_pickle.load(filename, mmap_mode=mmap_mode) - return item - - def dump_item(self, path, item, verbose=1): - """Dump an item in the store at the path given as a list of - strings.""" - try: - item_path = os.path.join(self.location, *path) - if not self._item_exists(item_path): - self.create_location(item_path) - filename = os.path.join(item_path, 'output.pkl') - if verbose > 10: - print('Persisting in %s' % item_path) - - def write_func(to_write, dest_filename): - with self._open_item(dest_filename, "wb") as f: - numpy_pickle.dump(to_write, f, - compress=self.compress) - - self._concurrency_safe_write(item, filename, write_func) - except: # noqa: E722 - " Race condition in the creation of the directory " - - def clear_item(self, path): - """Clear the item at the path, given as a list of strings.""" - item_path = os.path.join(self.location, *path) - if self._item_exists(item_path): - self.clear_location(item_path) - - def contains_item(self, path): - """Check if there is an item at the path, given as a list of - strings""" - item_path = os.path.join(self.location, *path) - filename = os.path.join(item_path, 'output.pkl') - - return self._item_exists(filename) - - def get_item_info(self, path): - """Return information about item.""" - return {'location': os.path.join(self.location, - *path)} - - def get_metadata(self, path): - """Return actual metadata of an item.""" - try: - item_path = os.path.join(self.location, *path) - filename = os.path.join(item_path, 'metadata.json') - with self._open_item(filename, 'rb') as f: - return json.loads(f.read().decode('utf-8')) - except: # noqa: E722 - return {} - - def store_metadata(self, path, metadata): - """Store metadata of a computation.""" - try: - item_path = os.path.join(self.location, *path) - self.create_location(item_path) - filename = os.path.join(item_path, 'metadata.json') - - def write_func(to_write, dest_filename): - with self._open_item(dest_filename, "wb") as f: - f.write(json.dumps(to_write).encode('utf-8')) - - self._concurrency_safe_write(metadata, filename, write_func) - except: # noqa: E722 - pass - - def contains_path(self, path): - """Check cached function is available in store.""" - func_path = os.path.join(self.location, *path) - return self.object_exists(func_path) - - def clear_path(self, path): - """Clear all items with a common path in the store.""" - func_path = os.path.join(self.location, *path) - if self._item_exists(func_path): - self.clear_location(func_path) - - def store_cached_func_code(self, path, func_code=None): - """Store the code of the cached function.""" - func_path = os.path.join(self.location, *path) - if not self._item_exists(func_path): - self.create_location(func_path) - - if func_code is not None: - filename = os.path.join(func_path, "func_code.py") - with 
self._open_item(filename, 'wb') as f: - f.write(func_code.encode('utf-8')) - - def get_cached_func_code(self, path): - """Store the code of the cached function.""" - path += ['func_code.py', ] - filename = os.path.join(self.location, *path) - try: - with self._open_item(filename, 'rb') as f: - return f.read().decode('utf-8') - except: # noqa: E722 - raise - - def get_cached_func_info(self, path): - """Return information related to the cached function if it exists.""" - return {'location': os.path.join(self.location, *path)} - - def clear(self): - """Clear the whole store content.""" - self.clear_location(self.location) - - def reduce_store_size(self, bytes_limit): - """Reduce store size to keep it under the given bytes limit.""" - items_to_delete = self._get_items_to_delete(bytes_limit) - - for item in items_to_delete: - if self.verbose > 10: - print('Deleting item {0}'.format(item)) - try: - self.clear_location(item.path) - except OSError: - # Even with ignore_errors=True can shutil.rmtree - # can raise OSErrror with [Errno 116] Stale file - # handle if another process has deleted the folder - # already. - pass - - def _get_items_to_delete(self, bytes_limit): - """Get items to delete to keep the store under a size limit.""" - if isinstance(bytes_limit, _basestring): - bytes_limit = memstr_to_bytes(bytes_limit) - - items = self.get_items() - size = sum(item.size for item in items) - - to_delete_size = size - bytes_limit - if to_delete_size < 0: - return [] - - # We want to delete first the cache items that were accessed a - # long time ago - items.sort(key=operator.attrgetter('last_access')) - - items_to_delete = [] - size_so_far = 0 - - for item in items: - if size_so_far > to_delete_size: - break - - items_to_delete.append(item) - size_so_far += item.size - - return items_to_delete - - def _concurrency_safe_write(self, to_write, filename, write_func): - """Writes an object into a file in a concurrency-safe way.""" - temporary_filename = concurrency_safe_write(to_write, - filename, write_func) - self._move_item(temporary_filename, filename) - - def __repr__(self): - """Printable representation of the store location.""" - return '{class_name}(location="{location}")'.format( - class_name=self.__class__.__name__, location=self.location) - - -class FileSystemStoreBackend(StoreBackendBase, StoreBackendMixin): - """A StoreBackend used with local or network file systems.""" - - _open_item = staticmethod(open) - _item_exists = staticmethod(os.path.exists) - _move_item = staticmethod(concurrency_safe_rename) - - def clear_location(self, location): - """Delete location on store.""" - if (location == self.location): - rm_subdirs(location) - else: - shutil.rmtree(location, ignore_errors=True) - - def create_location(self, location): - """Create object location on store""" - mkdirp(location) - - def get_items(self): - """Returns the whole list of items available in the store.""" - items = [] - - for dirpath, _, filenames in os.walk(self.location): - is_cache_hash_dir = re.match('[a-f0-9]{32}', - os.path.basename(dirpath)) - - if is_cache_hash_dir: - output_filename = os.path.join(dirpath, 'output.pkl') - try: - last_access = os.path.getatime(output_filename) - except OSError: - try: - last_access = os.path.getatime(dirpath) - except OSError: - # The directory has already been deleted - continue - - last_access = datetime.datetime.fromtimestamp(last_access) - try: - full_filenames = [os.path.join(dirpath, fn) - for fn in filenames] - dirsize = sum(os.path.getsize(fn) - for fn in full_filenames) - 
except OSError: - # Either output_filename or one of the files in - # dirpath does not exist any more. We assume this - # directory is being cleaned by another process already - continue - - items.append(CacheItemInfo(dirpath, dirsize, - last_access)) - - return items - - def configure(self, location, verbose=1, backend_options=None): - """Configure the store backend. - - For this backend, valid store options are 'compress' and 'mmap_mode' - """ - if backend_options is None: - backend_options = {} - - # setup location directory - self.location = location - if not os.path.exists(self.location): - mkdirp(self.location) - - # item can be stored compressed for faster I/O - self.compress = backend_options.get('compress', False) - - # FileSystemStoreBackend can be used with mmap_mode options under - # certain conditions. - mmap_mode = backend_options.get('mmap_mode') - if self.compress and mmap_mode is not None: - warnings.warn('Compressed items cannot be memmapped in a ' - 'filesystem store. Option will be ignored.', - stacklevel=2) - - self.mmap_mode = mmap_mode - self.verbose = verbose diff --git a/sklearn/externals/joblib/backports.py b/sklearn/externals/joblib/backports.py deleted file mode 100644 index be6c9c506e895..0000000000000 --- a/sklearn/externals/joblib/backports.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Backports of fixes for joblib dependencies -""" -import os -import time -import ctypes -import sys - -from distutils.version import LooseVersion - -try: - import numpy as np - - def make_memmap(filename, dtype='uint8', mode='r+', offset=0, - shape=None, order='C'): - """Backport of numpy memmap offset fix. - - See https://github.com/numpy/numpy/pull/8443 for more details. - - The numpy fix will be available in numpy 1.13. - """ - mm = np.memmap(filename, dtype=dtype, mode=mode, offset=offset, - shape=shape, order=order) - if LooseVersion(np.__version__) < '1.13': - mm.offset = offset - return mm -except ImportError: - def make_memmap(filename, dtype='uint8', mode='r+', offset=0, - shape=None, order='C'): - raise NotImplementedError( - "'joblib.backports.make_memmap' should not be used " - 'if numpy is not installed.') - - -if os.name == 'nt': - # https://github.com/joblib/joblib/issues/540 - access_denied_errors = (5, 13) - try: - from os import replace - except ImportError: - # Python 2.7 - def replace(src, dst): - if not isinstance(src, unicode): # noqa - src = unicode(src, sys.getfilesystemencoding()) # noqa - if not isinstance(dst, unicode): # noqa - dst = unicode(dst, sys.getfilesystemencoding()) # noqa - - movefile_replace_existing = 0x1 - return_value = ctypes.windll.kernel32.MoveFileExW( - src, dst, movefile_replace_existing) - if return_value == 0: - raise ctypes.WinError() - - def concurrency_safe_rename(src, dst): - """Renames ``src`` into ``dst`` overwriting ``dst`` if it exists. - - On Windows os.replace (or for Python 2.7 its implementation - through MoveFileExW) can yield permission errors if executed by - two different processes. 
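The concurrency_safe_write helper removed earlier and the concurrency_safe_rename backport above combine into a write-to-temp-then-atomic-move pattern. A small sketch of that pattern using only the standard library; concurrency_safe_json_write is a hypothetical name introduced here for illustration:

    import json
    import os
    import threading

    def concurrency_safe_json_write(obj, filename):
        # Write to a name unique to this thread and process, then move it
        # into place so concurrent readers never see a half-written file.
        tmp = '{}.thread-{}-pid-{}'.format(
            filename, id(threading.current_thread()), os.getpid())
        with open(tmp, 'w') as f:
            json.dump(obj, f)
        os.replace(tmp, filename)  # atomic rename; the backport retries this on Windows
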
- """ - max_sleep_time = 1 - total_sleep_time = 0 - sleep_time = 0.001 - while total_sleep_time < max_sleep_time: - try: - replace(src, dst) - break - except Exception as exc: - if getattr(exc, 'winerror', None) in access_denied_errors: - time.sleep(sleep_time) - total_sleep_time += sleep_time - sleep_time *= 2 - else: - raise - else: - raise -else: - try: - from os import replace as concurrency_safe_rename - except ImportError: - from os import rename as concurrency_safe_rename # noqa diff --git a/sklearn/externals/joblib/compressor.py b/sklearn/externals/joblib/compressor.py deleted file mode 100644 index 7692fd9f2888c..0000000000000 --- a/sklearn/externals/joblib/compressor.py +++ /dev/null @@ -1,594 +0,0 @@ -"""Classes and functions for managing compressors.""" - -import sys -import io -import zlib -from distutils.version import LooseVersion - -from ._compat import _basestring, PY3_OR_LATER - -try: - from threading import RLock -except ImportError: - from dummy_threading import RLock - -try: - import bz2 -except ImportError: - bz2 = None - -try: - import lzma -except ImportError: - lzma = None - -try: - import lz4 - if PY3_OR_LATER: - from lz4.frame import LZ4FrameFile -except ImportError: - lz4 = None - -LZ4_NOT_INSTALLED_ERROR = ('LZ4 is not installed. Install it with pip: ' - 'http://python-lz4.readthedocs.io/') - -# Registered compressors -_COMPRESSORS = {} - -# Magic numbers of supported compression file formats. -_ZFILE_PREFIX = b'ZF' # used with pickle files created before 0.9.3. -_ZLIB_PREFIX = b'\x78' -_GZIP_PREFIX = b'\x1f\x8b' -_BZ2_PREFIX = b'BZ' -_XZ_PREFIX = b'\xfd\x37\x7a\x58\x5a' -_LZMA_PREFIX = b'\x5d\x00' -_LZ4_PREFIX = b'\x04\x22\x4D\x18' - - -def register_compressor(compressor_name, compressor, - force=False): - """Register a new compressor. - - Parameters - ----------- - compressor_name: str. - The name of the compressor. - compressor: CompressorWrapper - An instance of a 'CompressorWrapper'. - """ - global _COMPRESSORS - if not isinstance(compressor_name, _basestring): - raise ValueError("Compressor name should be a string, " - "'{}' given.".format(compressor_name)) - - if not isinstance(compressor, CompressorWrapper): - raise ValueError("Compressor should implement the CompressorWrapper " - "interface, '{}' given.".format(compressor)) - - if (compressor.fileobj_factory is not None and - (not hasattr(compressor.fileobj_factory, 'read') or - not hasattr(compressor.fileobj_factory, 'write') or - not hasattr(compressor.fileobj_factory, 'seek') or - not hasattr(compressor.fileobj_factory, 'tell'))): - raise ValueError("Compressor 'fileobj_factory' attribute should " - "implement the file object interface, '{}' given." - .format(compressor.fileobj_factory)) - - if compressor_name in _COMPRESSORS and not force: - raise ValueError("Compressor '{}' already registered." - .format(compressor_name)) - - _COMPRESSORS[compressor_name] = compressor - - -class CompressorWrapper(): - """A wrapper around a compressor file object. - - Attributes - ---------- - obj: a file-like object - The object must implement the buffer interface and will be used - internally to compress/decompress the data. - prefix: bytestring - A bytestring corresponding to the magic number that identifies the - file format associated to the compressor. - extention: str - The file extension used to automatically select this compressor during - a dump to a file. 
- """ - - def __init__(self, obj, prefix=b'', extension=''): - self.fileobj_factory = obj - self.prefix = prefix - self.extension = extension - - def compressor_file(self, fileobj, compresslevel=None): - """Returns an instance of a compressor file object.""" - if compresslevel is None: - return self.fileobj_factory(fileobj, 'wb') - else: - return self.fileobj_factory(fileobj, 'wb', - compresslevel=compresslevel) - - def decompressor_file(self, fileobj): - """Returns an instance of a decompressor file object.""" - return self.fileobj_factory(fileobj, 'rb') - - -class BZ2CompressorWrapper(CompressorWrapper): - - prefix = _BZ2_PREFIX - extension = '.bz2' - - def __init__(self): - if bz2 is not None: - self.fileobj_factory = bz2.BZ2File - else: - self.fileobj_factory = None - - def _check_versions(self): - if bz2 is None: - raise ValueError('bz2 module is not compiled on your python ' - 'standard library.') - - def compressor_file(self, fileobj, compresslevel=None): - """Returns an instance of a compressor file object.""" - self._check_versions() - if compresslevel is None: - return self.fileobj_factory(fileobj, 'wb') - else: - return self.fileobj_factory(fileobj, 'wb', - compresslevel=compresslevel) - - def decompressor_file(self, fileobj): - """Returns an instance of a decompressor file object.""" - self._check_versions() - if PY3_OR_LATER: - fileobj = self.fileobj_factory(fileobj, 'rb') - else: - # In python 2, BZ2File doesn't support a fileobj opened in - # binary mode. In this case, we pass the filename. - fileobj = self.fileobj_factory(fileobj.name, 'rb') - return fileobj - - -class LZMACompressorWrapper(CompressorWrapper): - - prefix = _LZMA_PREFIX - extension = '.lzma' - - def __init__(self): - if lzma is not None: - self.fileobj_factory = lzma.LZMAFile - else: - self.fileobj_factory = None - - def compressor_file(self, fileobj, compresslevel=None): - """Returns an instance of a compressor file object.""" - if compresslevel is None: - return self.fileobj_factory(fileobj, 'wb', - format=lzma.FORMAT_ALONE) - else: - return self.fileobj_factory(fileobj, 'wb', - format=lzma.FORMAT_ALONE, - preset=compresslevel) - - def decompressor_file(self, fileobj): - """Returns an instance of a decompressor file object.""" - if PY3_OR_LATER and lzma is not None: - # We support lzma only in python 3 because in python 2 users - # may have installed the pyliblzma package, which also provides - # the lzma module, but that unfortunately doesn't fully support - # the buffer interface required by joblib. - # See https://github.com/joblib/joblib/issues/403 for details. 
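The compressor wrappers being removed here are selected by file extension on joblib.dump and by magic number on joblib.load. A minimal sketch of that public API (gzip is always available; lzma and lz4 only when the corresponding modules are importable, as the checks above show):

    import os
    import tempfile
    from joblib import dump, load

    path = os.path.join(tempfile.mkdtemp(), 'payload.pkl.gz')
    data = {'coef': [0.5, 1.5], 'n_iter': 100}

    dump(data, path, compress=('gzip', 3))  # explicit codec and compression level
    restored = load(path)                   # codec re-detected from the file's magic number
    assert restored == data
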
- return lzma.LZMAFile(fileobj, 'rb') - else: - raise NotImplementedError("Lzma decompression is not " - "supported for this version of " - "python ({}.{})" - .format(sys.version_info[0], - sys.version_info[1])) - - -class XZCompressorWrapper(LZMACompressorWrapper): - - prefix = _XZ_PREFIX - extension = '.xz' - - def __init__(self): - if lzma is not None: - self.fileobj_factory = lzma.LZMAFile - else: - self.fileobj_factory = None - - def compressor_file(self, fileobj, compresslevel=None): - """Returns an instance of a compressor file object.""" - if compresslevel is None: - return self.fileobj_factory(fileobj, 'wb', check=lzma.CHECK_NONE) - else: - return self.fileobj_factory(fileobj, 'wb', check=lzma.CHECK_NONE, - preset=compresslevel) - - -class LZ4CompressorWrapper(CompressorWrapper): - - prefix = _LZ4_PREFIX - extension = '.lz4' - - def __init__(self): - if PY3_OR_LATER and lz4 is not None: - self.fileobj_factory = LZ4FrameFile - else: - self.fileobj_factory = None - - def _check_versions(self): - if not PY3_OR_LATER: - raise ValueError('lz4 compression is only available with ' - 'python3+.') - - if lz4 is None or LooseVersion(lz4.__version__) < LooseVersion('0.19'): - raise ValueError(LZ4_NOT_INSTALLED_ERROR) - - def compressor_file(self, fileobj, compresslevel=None): - """Returns an instance of a compressor file object.""" - self._check_versions() - if compresslevel is None: - return self.fileobj_factory(fileobj, 'wb') - else: - return self.fileobj_factory(fileobj, 'wb', - compression_level=compresslevel) - - def decompressor_file(self, fileobj): - """Returns an instance of a decompressor file object.""" - self._check_versions() - return self.fileobj_factory(fileobj, 'rb') - - -############################################################################### -# base file compression/decompression object definition -_MODE_CLOSED = 0 -_MODE_READ = 1 -_MODE_READ_EOF = 2 -_MODE_WRITE = 3 -_BUFFER_SIZE = 8192 - - -class BinaryZlibFile(io.BufferedIOBase): - """A file object providing transparent zlib (de)compression. - - A BinaryZlibFile can act as a wrapper for an existing file object, or refer - directly to a named file on disk. - - Note that BinaryZlibFile provides only a *binary* file interface: data read - is returned as bytes, and data to be written should be given as bytes. - - This object is an adaptation of the BZ2File object and is compatible with - versions of python >= 2.7. - - If filename is a str or bytes object, it gives the name - of the file to be opened. Otherwise, it should be a file object, - which will be used to read or write the compressed data. - - mode can be 'rb' for reading (default) or 'wb' for (over)writing - - If mode is 'wb', compresslevel can be a number between 1 - and 9 specifying the level of compression: 1 produces the least - compression, and 9 produces the most compression. 3 is the default. - """ - - wbits = zlib.MAX_WBITS - - def __init__(self, filename, mode="rb", compresslevel=3): - # This lock must be recursive, so that BufferedIOBase's - # readline(), readlines() and writelines() don't deadlock. - self._lock = RLock() - self._fp = None - self._closefp = False - self._mode = _MODE_CLOSED - self._pos = 0 - self._size = -1 - self.compresslevel = compresslevel - - if not isinstance(compresslevel, int) or not (1 <= compresslevel <= 9): - raise ValueError("'compresslevel' must be an integer " - "between 1 and 9. 
You provided 'compresslevel={}'" - .format(compresslevel)) - - if mode == "rb": - self._mode = _MODE_READ - self._decompressor = zlib.decompressobj(self.wbits) - self._buffer = b"" - self._buffer_offset = 0 - elif mode == "wb": - self._mode = _MODE_WRITE - self._compressor = zlib.compressobj(self.compresslevel, - zlib.DEFLATED, self.wbits, - zlib.DEF_MEM_LEVEL, 0) - else: - raise ValueError("Invalid mode: %r" % (mode,)) - - if isinstance(filename, _basestring): - self._fp = io.open(filename, mode) - self._closefp = True - elif hasattr(filename, "read") or hasattr(filename, "write"): - self._fp = filename - else: - raise TypeError("filename must be a str or bytes object, " - "or a file") - - def close(self): - """Flush and close the file. - - May be called more than once without error. Once the file is - closed, any other operation on it will raise a ValueError. - """ - with self._lock: - if self._mode == _MODE_CLOSED: - return - try: - if self._mode in (_MODE_READ, _MODE_READ_EOF): - self._decompressor = None - elif self._mode == _MODE_WRITE: - self._fp.write(self._compressor.flush()) - self._compressor = None - finally: - try: - if self._closefp: - self._fp.close() - finally: - self._fp = None - self._closefp = False - self._mode = _MODE_CLOSED - self._buffer = b"" - self._buffer_offset = 0 - - @property - def closed(self): - """True if this file is closed.""" - return self._mode == _MODE_CLOSED - - def fileno(self): - """Return the file descriptor for the underlying file.""" - self._check_not_closed() - return self._fp.fileno() - - def seekable(self): - """Return whether the file supports seeking.""" - return self.readable() and self._fp.seekable() - - def readable(self): - """Return whether the file was opened for reading.""" - self._check_not_closed() - return self._mode in (_MODE_READ, _MODE_READ_EOF) - - def writable(self): - """Return whether the file was opened for writing.""" - self._check_not_closed() - return self._mode == _MODE_WRITE - - # Mode-checking helper functions. - - def _check_not_closed(self): - if self.closed: - fname = getattr(self._fp, 'name', None) - msg = "I/O operation on closed file" - if fname is not None: - msg += " {}".format(fname) - msg += "." - raise ValueError(msg) - - def _check_can_read(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("File not open for reading") - - def _check_can_write(self): - if self._mode != _MODE_WRITE: - self._check_not_closed() - raise io.UnsupportedOperation("File not open for writing") - - def _check_can_seek(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("Seeking is only supported " - "on files open for reading") - if not self._fp.seekable(): - raise io.UnsupportedOperation("The underlying file object " - "does not support seeking") - - # Fill the readahead buffer if it is empty. Returns False on EOF. - def _fill_buffer(self): - if self._mode == _MODE_READ_EOF: - return False - # Depending on the input data, our call to the decompressor may not - # return any data. In this case, try again after reading another block. - while self._buffer_offset == len(self._buffer): - try: - rawblock = (self._decompressor.unused_data or - self._fp.read(_BUFFER_SIZE)) - if not rawblock: - raise EOFError - except EOFError: - # End-of-stream marker and end of file. We're good. 
- self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - self._buffer = self._decompressor.decompress(rawblock) - self._buffer_offset = 0 - return True - - # Read data until EOF. - # If return_data is false, consume the data without returning it. - def _read_all(self, return_data=True): - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while self._fill_buffer(): - if return_data: - blocks.append(self._buffer) - self._pos += len(self._buffer) - self._buffer = b"" - if return_data: - return b"".join(blocks) - - # Read a block of up to n bytes. - # If return_data is false, consume the data without returning it. - def _read_block(self, n_bytes, return_data=True): - # If we have enough data buffered, return immediately. - end = self._buffer_offset + n_bytes - if end <= len(self._buffer): - data = self._buffer[self._buffer_offset: end] - self._buffer_offset = end - self._pos += len(data) - return data if return_data else None - - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while n_bytes > 0 and self._fill_buffer(): - if n_bytes < len(self._buffer): - data = self._buffer[:n_bytes] - self._buffer_offset = n_bytes - else: - data = self._buffer - self._buffer = b"" - if return_data: - blocks.append(data) - self._pos += len(data) - n_bytes -= len(data) - if return_data: - return b"".join(blocks) - - def read(self, size=-1): - """Read up to size uncompressed bytes from the file. - - If size is negative or omitted, read until EOF is reached. - Returns b'' if the file is already at EOF. - """ - with self._lock: - self._check_can_read() - if size == 0: - return b"" - elif size < 0: - return self._read_all() - else: - return self._read_block(size) - - def readinto(self, b): - """Read up to len(b) bytes into b. - - Returns the number of bytes read (0 for EOF). - """ - with self._lock: - return io.BufferedIOBase.readinto(self, b) - - def write(self, data): - """Write a byte string to the file. - - Returns the number of uncompressed bytes written, which is - always len(data). Note that due to buffering, the file on disk - may not reflect the data written until close() is called. - """ - with self._lock: - self._check_can_write() - # Convert data type if called by io.BufferedWriter. - if isinstance(data, memoryview): - data = data.tobytes() - - compressed = self._compressor.compress(data) - self._fp.write(compressed) - self._pos += len(data) - return len(data) - - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0, 0) - self._mode = _MODE_READ - self._pos = 0 - self._decompressor = zlib.decompressobj(self.wbits) - self._buffer = b"" - self._buffer_offset = 0 - - def seek(self, offset, whence=0): - """Change the file position. - - The new position is specified by offset, relative to the - position indicated by whence. Values for whence are: - - 0: start of stream (default); offset must not be negative - 1: current stream position - 2: end of stream; offset must not be positive - - Returns the new file position. - - Note that seeking is emulated, so depending on the parameters, - this operation may be extremely slow. - """ - with self._lock: - self._check_can_seek() - - # Recalculate offset as an absolute file position. 
- if whence == 0: - pass - elif whence == 1: - offset = self._pos + offset - elif whence == 2: - # Seeking relative to EOF - we need to know the file's size. - if self._size < 0: - self._read_all(return_data=False) - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: %s" % (whence,)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. - self._read_block(offset, return_data=False) - - return self._pos - - def tell(self): - """Return the current file position.""" - with self._lock: - self._check_not_closed() - return self._pos - - -class ZlibCompressorWrapper(CompressorWrapper): - - def __init__(self): - CompressorWrapper.__init__(self, obj=BinaryZlibFile, - prefix=_ZLIB_PREFIX, extension='.z') - - -class BinaryGzipFile(BinaryZlibFile): - """A file object providing transparent gzip (de)compression. - - If filename is a str or bytes object, it gives the name - of the file to be opened. Otherwise, it should be a file object, - which will be used to read or write the compressed data. - - mode can be 'rb' for reading (default) or 'wb' for (over)writing - - If mode is 'wb', compresslevel can be a number between 1 - and 9 specifying the level of compression: 1 produces the least - compression, and 9 produces the most compression. 3 is the default. - """ - - wbits = 31 # zlib compressor/decompressor wbits value for gzip format. - - -class GzipCompressorWrapper(CompressorWrapper): - - def __init__(self): - CompressorWrapper.__init__(self, obj=BinaryGzipFile, - prefix=_GZIP_PREFIX, extension='.gz') diff --git a/sklearn/externals/joblib/disk.py b/sklearn/externals/joblib/disk.py deleted file mode 100644 index c90c3df3609cd..0000000000000 --- a/sklearn/externals/joblib/disk.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -Disk management utilities. -""" - -# Authors: Gael Varoquaux -# Lars Buitinck -# Copyright (c) 2010 Gael Varoquaux -# License: BSD Style, 3 clauses. - - -import os -import sys -import time -import errno -import shutil -import warnings - - -try: - WindowsError -except NameError: - WindowsError = OSError - - -def disk_used(path): - """ Return the disk usage in a directory.""" - size = 0 - for file in os.listdir(path) + ['.']: - stat = os.stat(os.path.join(path, file)) - if hasattr(stat, 'st_blocks'): - size += stat.st_blocks * 512 - else: - # on some platform st_blocks is not available (e.g., Windows) - # approximate by rounding to next multiple of 512 - size += (stat.st_size // 512 + 1) * 512 - # We need to convert to int to avoid having longs on some systems (we - # don't want longs to avoid problems we SQLite) - return int(size / 1024.) - - -def memstr_to_bytes(text): - """ Convert a memory text to its value in bytes. - """ - kilo = 1024 - units = dict(K=kilo, M=kilo ** 2, G=kilo ** 3) - try: - size = int(units[text[-1]] * float(text[:-1])) - except (KeyError, ValueError): - raise ValueError( - "Invalid literal for size give: %s (type %s) should be " - "alike '10G', '500M', '50K'." % (text, type(text))) - return size - - -def mkdirp(d): - """Ensure directory d exists (like mkdir -p on Unix) - No guarantee that the directory is writable. - """ - try: - os.makedirs(d) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - -# if a rmtree operation fails in rm_subdirs, wait for this much time (in secs), -# then retry up to RM_SUBDIRS_N_RETRY times. If it still fails, raise the -# exception. 
this mecanism ensures that the sub-process gc have the time to -# collect and close the memmaps before we fail. -RM_SUBDIRS_RETRY_TIME = 0.1 -RM_SUBDIRS_N_RETRY = 5 - - -def rm_subdirs(path, onerror=None): - """Remove all subdirectories in this path. - - The directory indicated by `path` is left in place, and its subdirectories - are erased. - - If onerror is set, it is called to handle the error with arguments (func, - path, exc_info) where func is os.listdir, os.remove, or os.rmdir; - path is the argument to that function that caused it to fail; and - exc_info is a tuple returned by sys.exc_info(). If onerror is None, - an exception is raised. - """ - - # NOTE this code is adapted from the one in shutil.rmtree, and is - # just as fast - - names = [] - try: - names = os.listdir(path) - except os.error: - if onerror is not None: - onerror(os.listdir, path, sys.exc_info()) - else: - raise - - for name in names: - fullname = os.path.join(path, name) - delete_folder(fullname, onerror=onerror) - - -def delete_folder(folder_path, onerror=None): - """Utility function to cleanup a temporary folder if it still exists.""" - if os.path.isdir(folder_path): - if onerror is not None: - shutil.rmtree(folder_path, False, onerror) - else: - # allow the rmtree to fail once, wait and re-try. - # if the error is raised again, fail - err_count = 0 - while True: - try: - shutil.rmtree(folder_path, False, None) - break - except (OSError, WindowsError): - err_count += 1 - if err_count > RM_SUBDIRS_N_RETRY: - warnings.warn( - "Unable to delete folder {} after {} tentatives." - .format(folder_path, RM_SUBDIRS_N_RETRY)) - raise - time.sleep(RM_SUBDIRS_RETRY_TIME) diff --git a/sklearn/externals/joblib/executor.py b/sklearn/externals/joblib/executor.py deleted file mode 100644 index c63472d608944..0000000000000 --- a/sklearn/externals/joblib/executor.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Utility function to construct a loky.ReusableExecutor with custom pickler. - -This module provides efficient ways of working with data stored in -shared memory with numpy.memmap arrays without inducing any memory -copy between the parent and child processes. -""" -# Author: Thomas Moreau -# Copyright: 2017, Thomas Moreau -# License: BSD 3 clause - -import random -from .disk import delete_folder -from ._memmapping_reducer import get_memmapping_reducers -from .externals.loky.reusable_executor import get_reusable_executor - - -_backend_args = None - - -def get_memmapping_executor(n_jobs, timeout=300, initializer=None, initargs=(), - **backend_args): - """Factory for ReusableExecutor with automatic memmapping for large numpy - arrays. - """ - global _backend_args - reuse = _backend_args is None or _backend_args == backend_args - _backend_args = backend_args - - id_executor = random.randint(0, int(1e10)) - job_reducers, result_reducers, temp_folder = get_memmapping_reducers( - id_executor, **backend_args) - _executor = get_reusable_executor(n_jobs, job_reducers=job_reducers, - result_reducers=result_reducers, - reuse=reuse, timeout=timeout, - initializer=initializer, - initargs=initargs) - # If executor doesn't have a _temp_folder, it means it is a new executor - # and the reducers have been used. Else, the previous reducers are used - # and we should not change this attibute. 
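The memmapping executor being deleted here is what lets loky workers share large numpy arrays through memory maps rather than pickled copies. A rough sketch of the user-facing behaviour, assuming joblib >= 0.12 and numpy installed:

    import os
    import tempfile
    import numpy as np
    from joblib import Parallel, delayed, dump, load

    path = os.path.join(tempfile.mkdtemp(), 'big.joblib')
    dump(np.arange(1000000, dtype=np.float64), path)

    # Each worker opens a read-only memory map instead of receiving a copy.
    big = load(path, mmap_mode='r')
    partial_sums = Parallel(n_jobs=2)(
        delayed(np.sum)(big[i::4]) for i in range(4))
    assert np.isclose(sum(partial_sums), big.sum())
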
- if not hasattr(_executor, "_temp_folder"): - _executor._temp_folder = temp_folder - else: - delete_folder(temp_folder) - return _executor - - -class _TestingMemmappingExecutor(): - """Wrapper around ReusableExecutor to ease memmapping testing with Pool - and Executor. This is only for testing purposes. - """ - def __init__(self, n_jobs, **backend_args): - self._executor = get_memmapping_executor(n_jobs, **backend_args) - self._temp_folder = self._executor._temp_folder - - def apply_async(self, func, args): - """Schedule a func to be run""" - future = self._executor.submit(func, *args) - future.get = future.result - return future - - def terminate(self): - self._executor.shutdown() - delete_folder(self._temp_folder) - - def map(self, f, *args): - res = self._executor.map(f, *args) - return list(res) diff --git a/sklearn/externals/joblib/externals/__init__.py b/sklearn/externals/joblib/externals/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/externals/joblib/externals/cloudpickle/__init__.py b/sklearn/externals/joblib/externals/cloudpickle/__init__.py deleted file mode 100644 index 579876a24310c..0000000000000 --- a/sklearn/externals/joblib/externals/cloudpickle/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import absolute_import - -from .cloudpickle import * - -__version__ = '0.6.1' diff --git a/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py b/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py deleted file mode 100644 index bf92569c1e8c0..0000000000000 --- a/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py +++ /dev/null @@ -1,1195 +0,0 @@ -""" -This class is defined to override standard pickle functionality - -The goals of it follow: --Serialize lambdas and nested functions to compiled byte code --Deal with main module correctly --Deal with other non-serializable objects - -It does not include an unpickler, as standard python unpickling suffices. - -This module was extracted from the `cloud` package, developed by `PiCloud, Inc. -`_. - -Copyright (c) 2012, Regents of the University of California. -Copyright (c) 2009 `PiCloud, Inc. `_. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the University of California, Berkeley nor the - names of its contributors may be used to endorse or promote - products derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" -from __future__ import print_function - -import io -import dis -import sys -import types -import opcode -import pickle -import struct -import logging -import weakref -import operator -import importlib -import itertools -import traceback -from functools import partial - - -# cloudpickle is meant for inter process communication: we expect all -# communicating processes to run the same Python version hence we favor -# communication speed over compatibility: -DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL - - -if sys.version < '3': - from pickle import Pickler - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - PY3 = False -else: - types.ClassType = type - from pickle import _Pickler as Pickler - from io import BytesIO as StringIO - PY3 = True - - -# Container for the global namespace to ensure consistent unpickling of -# functions defined in dynamic modules (modules not registed in sys.modules). -_dynamic_modules_globals = weakref.WeakValueDictionary() - - -class _DynamicModuleFuncGlobals(dict): - """Global variables referenced by a function defined in a dynamic module - - To avoid leaking references we store such context in a WeakValueDictionary - instance. However instances of python builtin types such as dict cannot - be used directly as values in such a construct, hence the need for a - derived class. - """ - pass - - -def _make_cell_set_template_code(): - """Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF - - Notes - ----- - In Python 3, we could use an easier function: - - .. code-block:: python - - def f(): - cell = None - - def _stub(value): - nonlocal cell - cell = value - - return _stub - - _cell_set_template_code = f() - - This function is _only_ a LOAD_FAST(arg); STORE_DEREF, but that is - invalid syntax on Python 2. If we use this function we also don't need - to do the weird freevars/cellvars swap below - """ - def inner(value): - lambda: cell # make ``cell`` a closure so that we get a STORE_DEREF - cell = value - - co = inner.__code__ - - # NOTE: we are marking the cell variable as a free variable intentionally - # so that we simulate an inner function instead of the outer function. This - # is what gives us the ``nonlocal`` behavior in a Python 2 compatible way. - if not PY3: - return types.CodeType( - co.co_argcount, - co.co_nlocals, - co.co_stacksize, - co.co_flags, - co.co_code, - co.co_consts, - co.co_names, - co.co_varnames, - co.co_filename, - co.co_name, - co.co_firstlineno, - co.co_lnotab, - co.co_cellvars, # this is the trickery - (), - ) - else: - return types.CodeType( - co.co_argcount, - co.co_kwonlyargcount, - co.co_nlocals, - co.co_stacksize, - co.co_flags, - co.co_code, - co.co_consts, - co.co_names, - co.co_varnames, - co.co_filename, - co.co_name, - co.co_firstlineno, - co.co_lnotab, - co.co_cellvars, # this is the trickery - (), - ) - - -_cell_set_template_code = _make_cell_set_template_code() - - -def cell_set(cell, value): - """Set the value of a closure cell. 
- """ - return types.FunctionType( - _cell_set_template_code, - {}, - '_cell_set_inner', - (), - (cell,), - )(value) - - -# relevant opcodes -STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] -DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] -LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] -GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) -HAVE_ARGUMENT = dis.HAVE_ARGUMENT -EXTENDED_ARG = dis.EXTENDED_ARG - - -def islambda(func): - return getattr(func, '__name__') == '' - - -_BUILTIN_TYPE_NAMES = {} -for k, v in types.__dict__.items(): - if type(v) is type: - _BUILTIN_TYPE_NAMES[v] = k - - -def _builtin_type(name): - return getattr(types, name) - - -def _make__new__factory(type_): - def _factory(): - return type_.__new__ - return _factory - - -# NOTE: These need to be module globals so that they're pickleable as globals. -_get_dict_new = _make__new__factory(dict) -_get_frozenset_new = _make__new__factory(frozenset) -_get_list_new = _make__new__factory(list) -_get_set_new = _make__new__factory(set) -_get_tuple_new = _make__new__factory(tuple) -_get_object_new = _make__new__factory(object) - -# Pre-defined set of builtin_function_or_method instances that can be -# serialized. -_BUILTIN_TYPE_CONSTRUCTORS = { - dict.__new__: _get_dict_new, - frozenset.__new__: _get_frozenset_new, - set.__new__: _get_set_new, - list.__new__: _get_list_new, - tuple.__new__: _get_tuple_new, - object.__new__: _get_object_new, -} - - -if sys.version_info < (3, 4): - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - code = getattr(code, 'co_code', b'') - if not PY3: - code = map(ord, code) - - n = len(code) - i = 0 - extended_arg = 0 - while i < n: - op = code[i] - i += 1 - if op >= HAVE_ARGUMENT: - oparg = code[i] + code[i + 1] * 256 + extended_arg - extended_arg = 0 - i += 2 - if op == EXTENDED_ARG: - extended_arg = oparg * 65536 - if op in GLOBAL_OPS: - yield op, oparg - -else: - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - for instr in dis.get_instructions(code): - op = instr.opcode - if op in GLOBAL_OPS: - yield op, instr.arg - - -class CloudPickler(Pickler): - - dispatch = Pickler.dispatch.copy() - - def __init__(self, file, protocol=None): - if protocol is None: - protocol = DEFAULT_PROTOCOL - Pickler.__init__(self, file, protocol=protocol) - # set of modules to unpickle - self.modules = set() - # map ids to dictionary. 
used to ensure that functions can share global env - self.globals_ref = {} - - def dump(self, obj): - self.inject_addons() - try: - return Pickler.dump(self, obj) - except RuntimeError as e: - if 'recursion' in e.args[0]: - msg = """Could not pickle object as excessively deep recursion required.""" - raise pickle.PicklingError(msg) - else: - raise - - def save_memoryview(self, obj): - self.save(obj.tobytes()) - - dispatch[memoryview] = save_memoryview - - if not PY3: - def save_buffer(self, obj): - self.save(str(obj)) - - dispatch[buffer] = save_buffer # noqa: F821 'buffer' was removed in Python 3 - - def save_module(self, obj): - """ - Save a module as an import - """ - self.modules.add(obj) - if _is_dynamic(obj): - self.save_reduce(dynamic_subimport, (obj.__name__, vars(obj)), - obj=obj) - else: - self.save_reduce(subimport, (obj.__name__,), obj=obj) - - dispatch[types.ModuleType] = save_module - - def save_codeobject(self, obj): - """ - Save a code object - """ - if PY3: - args = ( - obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, - obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames, - obj.co_filename, obj.co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, - obj.co_cellvars - ) - else: - args = ( - obj.co_argcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, - obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars - ) - self.save_reduce(types.CodeType, args, obj=obj) - - dispatch[types.CodeType] = save_codeobject - - def save_function(self, obj, name=None): - """ Registered with the dispatch to handle all function types. - - Determines what kind of function obj is (e.g. lambda, defined at - interactive prompt, etc) and handles the pickling appropriately. - """ - try: - should_special_case = obj in _BUILTIN_TYPE_CONSTRUCTORS - except TypeError: - # Methods of builtin types aren't hashable in python 2. - should_special_case = False - - if should_special_case: - # We keep a special-cased cache of built-in type constructors at - # global scope, because these functions are structured very - # differently in different python versions and implementations (for - # example, they're instances of types.BuiltinFunctionType in - # CPython, but they're ordinary types.FunctionType instances in - # PyPy). - # - # If the function we've received is in that cache, we just - # serialize it as a lookup into the cache. - return self.save_reduce(_BUILTIN_TYPE_CONSTRUCTORS[obj], (), obj=obj) - - write = self.write - - if name is None: - name = obj.__name__ - try: - # whichmodule() could fail, see - # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling - modname = pickle.whichmodule(obj, name) - except Exception: - modname = None - # print('which gives %s %s %s' % (modname, obj, name)) - try: - themodule = sys.modules[modname] - except KeyError: - # eval'd items such as namedtuple give invalid items for their function __module__ - modname = '__main__' - - if modname == '__main__': - themodule = None - - try: - lookedup_by_name = getattr(themodule, name, None) - except Exception: - lookedup_by_name = None - - if themodule: - self.modules.add(themodule) - if lookedup_by_name is obj: - return self.save_global(obj, name) - - # a builtin_function_or_method which comes in as an attribute of some - # object (e.g., itertools.chain.from_iterable) will end - # up with modname "__main__" and so end up here. 
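The save_function dispatch being deleted here is what lets joblib ship lambdas, interactively defined functions and nested functions to worker processes. A tiny sketch using the standalone cloudpickle package (assumed installed; joblib bundles an equivalent copy under joblib.externals):

    import pickle
    import cloudpickle

    square = lambda x: x * x             # the plain pickle module rejects lambdas
    payload = cloudpickle.dumps(square)  # serialized by value, byte code included
    restored = pickle.loads(payload)     # a standard unpickler can rebuild it
    assert restored(4) == 16
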
But these functions - # have no __code__ attribute in CPython, so the handling for - # user-defined functions below will fail. - # So we pickle them here using save_reduce; have to do it differently - # for different python versions. - if not hasattr(obj, '__code__'): - if PY3: - rv = obj.__reduce_ex__(self.proto) - else: - if hasattr(obj, '__self__'): - rv = (getattr, (obj.__self__, name)) - else: - raise pickle.PicklingError("Can't pickle %r" % obj) - return self.save_reduce(obj=obj, *rv) - - # if func is lambda, def'ed at prompt, is in main, or is nested, then - # we'll pickle the actual function object rather than simply saving a - # reference (as is done in default pickler), via save_function_tuple. - if (islambda(obj) - or getattr(obj.__code__, 'co_filename', None) == '' - or themodule is None): - self.save_function_tuple(obj) - return - else: - # func is nested - if lookedup_by_name is None or lookedup_by_name is not obj: - self.save_function_tuple(obj) - return - - if obj.__dict__: - # essentially save_reduce, but workaround needed to avoid recursion - self.save(_restore_attr) - write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n') - self.memoize(obj) - self.save(obj.__dict__) - write(pickle.TUPLE + pickle.REDUCE) - else: - write(pickle.GLOBAL + modname + '\n' + name + '\n') - self.memoize(obj) - - dispatch[types.FunctionType] = save_function - - def _save_subimports(self, code, top_level_dependencies): - """ - Ensure de-pickler imports any package child-modules that - are needed by the function - """ - - # check if any known dependency is an imported package - for x in top_level_dependencies: - if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__: - # check if the package has any currently loaded sub-imports - prefix = x.__name__ + '.' - # A concurrent thread could mutate sys.modules, - # make sure we iterate over a copy to avoid exceptions - for name in list(sys.modules): - # Older versions of pytest will add a "None" module to sys.modules. - if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) - if not tokens - set(code.co_names): - # ensure unpickler executes this import - self.save(sys.modules[name]) - # then discards the reference to it - self.write(pickle.POP) - - def save_dynamic_class(self, obj): - """ - Save a class that can't be stored as module global. - - This method is used to serialize classes that are defined inside - functions, or that otherwise can't be serialized as attribute lookups - from global modules. - """ - clsdict = dict(obj.__dict__) # copy dict proxy to a dict - clsdict.pop('__weakref__', None) - - # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable. - # This is a fix which breaks the cache but this only makes the first - # calls to issubclass slower. - if "_abc_impl" in clsdict: - import abc - (registry, _, _, _) = abc._get_dump(obj) - clsdict["_abc_impl"] = [subclass_weakref() - for subclass_weakref in registry] - - # On PyPy, __doc__ is a readonly attribute, so we need to include it in - # the initial skeleton class. This is safe because we know that the - # doc can't participate in a cycle with the original class. - type_kwargs = {'__doc__': clsdict.pop('__doc__', None)} - - # If type overrides __dict__ as a property, include it in the type kwargs. - # In Python 2, we can't set this attribute after construction. 
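save_dynamic_class above handles classes that cannot be recovered by an attribute lookup on a module, for example classes created inside a function. A sketch of that behaviour, again with the standalone cloudpickle package (assumed installed):

    import pickle
    import cloudpickle

    def make_counter_class():
        class Counter:               # defined inside a function, so not importable
            def __init__(self):
                self.n = 0
            def tick(self):
                self.n += 1
                return self.n
        return Counter

    Counter = make_counter_class()
    RebuiltCounter = pickle.loads(cloudpickle.dumps(Counter))
    assert RebuiltCounter().tick() == 1
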
- __dict__ = clsdict.pop('__dict__', None) - if isinstance(__dict__, property): - type_kwargs['__dict__'] = __dict__ - - save = self.save - write = self.write - - # We write pickle instructions explicitly here to handle the - # possibility that the type object participates in a cycle with its own - # __dict__. We first write an empty "skeleton" version of the class and - # memoize it before writing the class' __dict__ itself. We then write - # instructions to "rehydrate" the skeleton class by restoring the - # attributes from the __dict__. - # - # A type can appear in a cycle with its __dict__ if an instance of the - # type appears in the type's __dict__ (which happens for the stdlib - # Enum class), or if the type defines methods that close over the name - # of the type, (which is common for Python 2-style super() calls). - - # Push the rehydration function. - save(_rehydrate_skeleton_class) - - # Mark the start of the args tuple for the rehydration function. - write(pickle.MARK) - - # Create and memoize an skeleton class with obj's name and bases. - tp = type(obj) - self.save_reduce(tp, (obj.__name__, obj.__bases__, type_kwargs), obj=obj) - - # Now save the rest of obj's __dict__. Any references to obj - # encountered while saving will point to the skeleton class. - save(clsdict) - - # Write a tuple of (skeleton_class, clsdict). - write(pickle.TUPLE) - - # Call _rehydrate_skeleton_class(skeleton_class, clsdict) - write(pickle.REDUCE) - - def save_function_tuple(self, func): - """ Pickles an actual func object. - - A func comprises: code, globals, defaults, closure, and dict. We - extract and save these, injecting reducing functions at certain points - to recreate the func object. Keep in mind that some of these pieces - can contain a ref to the func itself. Thus, a naive save on these - pieces could trigger an infinite loop of save's. To get around that, - we first create a skeleton func object using just the code (this is - safe, since this won't contain a ref to the func), and memoize it as - soon as it's created. The other stuff can then be filled in later. 
- """ - if is_tornado_coroutine(func): - self.save_reduce(_rebuild_tornado_coroutine, (func.__wrapped__,), - obj=func) - return - - save = self.save - write = self.write - - code, f_globals, defaults, closure_values, dct, base_globals = self.extract_func_data(func) - - save(_fill_function) # skeleton function updater - write(pickle.MARK) # beginning of tuple that _fill_function expects - - self._save_subimports( - code, - itertools.chain(f_globals.values(), closure_values or ()), - ) - - # create a skeleton function object and memoize it - save(_make_skel_func) - save(( - code, - len(closure_values) if closure_values is not None else -1, - base_globals, - )) - write(pickle.REDUCE) - self.memoize(func) - - # save the rest of the func data needed by _fill_function - state = { - 'globals': f_globals, - 'defaults': defaults, - 'dict': dct, - 'closure_values': closure_values, - 'module': func.__module__, - 'name': func.__name__, - 'doc': func.__doc__, - } - if hasattr(func, '__annotations__') and sys.version_info >= (3, 7): - state['annotations'] = func.__annotations__ - if hasattr(func, '__qualname__'): - state['qualname'] = func.__qualname__ - save(state) - write(pickle.TUPLE) - write(pickle.REDUCE) # applies _fill_function on the tuple - - _extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {}) - - @classmethod - def extract_code_globals(cls, co): - """ - Find all globals names read or written to by codeblock co - """ - out_names = cls._extract_code_globals_cache.get(co) - if out_names is None: - try: - names = co.co_names - except AttributeError: - # PyPy "builtin-code" object - out_names = set() - else: - out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} - - # see if nested function have any global refs - if co.co_consts: - for const in co.co_consts: - if type(const) is types.CodeType: - out_names |= cls.extract_code_globals(const) - - cls._extract_code_globals_cache[co] = out_names - - return out_names - - def extract_func_data(self, func): - """ - Turn the function into a tuple of data necessary to recreate it: - code, globals, defaults, closure_values, dict - """ - code = func.__code__ - - # extract all global ref's - func_global_refs = self.extract_code_globals(code) - - # process all variables referenced by global environment - f_globals = {} - for var in func_global_refs: - if var in func.__globals__: - f_globals[var] = func.__globals__[var] - - # defaults requires no processing - defaults = func.__defaults__ - - # process closure - closure = ( - list(map(_get_cell_contents, func.__closure__)) - if func.__closure__ is not None - else None - ) - - # save the dict - dct = func.__dict__ - - base_globals = self.globals_ref.get(id(func.__globals__), None) - if base_globals is None: - # For functions defined in a well behaved module use - # vars(func.__module__) for base_globals. This is necessary to - # share the global variables across multiple pickled functions from - # this module. 
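extract_code_globals and extract_func_data above walk a function's byte code to find which module-level names it uses, so those globals travel with the pickled function. A sketch with the standalone cloudpickle package, run as a script so the function lives in __main__ and is pickled by value:

    import pickle
    import cloudpickle

    FACTOR = 10                      # module-level global referenced by scale()

    def scale(x):
        return FACTOR * x

    # FACTOR is found in scale.__code__ and shipped along with the byte code.
    restored = pickle.loads(cloudpickle.dumps(scale))
    assert restored(3) == 30
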
- if hasattr(func, '__module__') and func.__module__ is not None: - base_globals = func.__module__ - else: - base_globals = {} - self.globals_ref[id(func.__globals__)] = base_globals - - return (code, f_globals, defaults, closure, dct, base_globals) - - def save_builtin_function(self, obj): - if obj.__module__ == "__builtin__": - return self.save_global(obj) - return self.save_function(obj) - - dispatch[types.BuiltinFunctionType] = save_builtin_function - - def save_global(self, obj, name=None, pack=struct.pack): - """ - Save a "global". - - The name of this method is somewhat misleading: all types get - dispatched here. - """ - if obj is type(None): - return self.save_reduce(type, (None,), obj=obj) - elif obj is type(Ellipsis): - return self.save_reduce(type, (Ellipsis,), obj=obj) - elif obj is type(NotImplemented): - return self.save_reduce(type, (NotImplemented,), obj=obj) - - if obj.__module__ == "__main__": - return self.save_dynamic_class(obj) - - try: - return Pickler.save_global(self, obj, name=name) - except Exception: - if obj.__module__ == "__builtin__" or obj.__module__ == "builtins": - if obj in _BUILTIN_TYPE_NAMES: - return self.save_reduce( - _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) - - typ = type(obj) - if typ is not obj and isinstance(obj, (type, types.ClassType)): - return self.save_dynamic_class(obj) - - raise - - dispatch[type] = save_global - dispatch[types.ClassType] = save_global - - def save_instancemethod(self, obj): - # Memoization rarely is ever useful due to python bounding - if obj.__self__ is None: - self.save_reduce(getattr, (obj.im_class, obj.__name__)) - else: - if PY3: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) - else: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) - - dispatch[types.MethodType] = save_instancemethod - - def save_inst(self, obj): - """Inner logic to save instance. 
Based off pickle.save_inst""" - cls = obj.__class__ - - # Try the dispatch table (pickle module doesn't do it) - f = self.dispatch.get(cls) - if f: - f(self, obj) # Call unbound method with explicit self - return - - memo = self.memo - write = self.write - save = self.save - - if hasattr(obj, '__getinitargs__'): - args = obj.__getinitargs__() - len(args) # XXX Assert it's a sequence - pickle._keep_alive(args, memo) - else: - args = () - - write(pickle.MARK) - - if self.bin: - save(cls) - for arg in args: - save(arg) - write(pickle.OBJ) - else: - for arg in args: - save(arg) - write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n') - - self.memoize(obj) - - try: - getstate = obj.__getstate__ - except AttributeError: - stuff = obj.__dict__ - else: - stuff = getstate() - pickle._keep_alive(stuff, memo) - save(stuff) - write(pickle.BUILD) - - if not PY3: - dispatch[types.InstanceType] = save_inst - - def save_property(self, obj): - # properties not correctly saved in python - self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj) - - dispatch[property] = save_property - - def save_classmethod(self, obj): - orig_func = obj.__func__ - self.save_reduce(type(obj), (orig_func,), obj=obj) - - dispatch[classmethod] = save_classmethod - dispatch[staticmethod] = save_classmethod - - def save_itemgetter(self, obj): - """itemgetter serializer (needed for namedtuple support)""" - class Dummy: - def __getitem__(self, item): - return item - items = obj(Dummy()) - if not isinstance(items, tuple): - items = (items,) - return self.save_reduce(operator.itemgetter, items) - - if type(operator.itemgetter) is type: - dispatch[operator.itemgetter] = save_itemgetter - - def save_attrgetter(self, obj): - """attrgetter serializer""" - class Dummy(object): - def __init__(self, attrs, index=None): - self.attrs = attrs - self.index = index - def __getattribute__(self, item): - attrs = object.__getattribute__(self, "attrs") - index = object.__getattribute__(self, "index") - if index is None: - index = len(attrs) - attrs.append(item) - else: - attrs[index] = ".".join([attrs[index], item]) - return type(self)(attrs, index) - attrs = [] - obj(Dummy(attrs)) - return self.save_reduce(operator.attrgetter, tuple(attrs)) - - if type(operator.attrgetter) is type: - dispatch[operator.attrgetter] = save_attrgetter - - def save_file(self, obj): - """Save a file""" - try: - import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute - except ImportError: - import io as pystringIO - - if not hasattr(obj, 'name') or not hasattr(obj, 'mode'): - raise pickle.PicklingError("Cannot pickle files that do not map to an actual file") - if obj is sys.stdout: - return self.save_reduce(getattr, (sys, 'stdout'), obj=obj) - if obj is sys.stderr: - return self.save_reduce(getattr, (sys, 'stderr'), obj=obj) - if obj is sys.stdin: - raise pickle.PicklingError("Cannot pickle standard input") - if obj.closed: - raise pickle.PicklingError("Cannot pickle closed files") - if hasattr(obj, 'isatty') and obj.isatty(): - raise pickle.PicklingError("Cannot pickle files that map to tty objects") - if 'r' not in obj.mode and '+' not in obj.mode: - raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode) - - name = obj.name - - retval = pystringIO.StringIO() - - try: - # Read the whole file - curloc = obj.tell() - obj.seek(0) - contents = obj.read() - obj.seek(curloc) - except IOError: - raise pickle.PicklingError("Cannot pickle file %s as it cannot be read" % name) 
- retval.write(contents) - retval.seek(curloc) - - retval.name = name - self.save(retval) - self.memoize(obj) - - def save_ellipsis(self, obj): - self.save_reduce(_gen_ellipsis, ()) - - def save_not_implemented(self, obj): - self.save_reduce(_gen_not_implemented, ()) - - try: # Python 2 - dispatch[file] = save_file - except NameError: # Python 3 - dispatch[io.TextIOWrapper] = save_file - - dispatch[type(Ellipsis)] = save_ellipsis - dispatch[type(NotImplemented)] = save_not_implemented - - def save_weakset(self, obj): - self.save_reduce(weakref.WeakSet, (list(obj),)) - - dispatch[weakref.WeakSet] = save_weakset - - def save_logger(self, obj): - self.save_reduce(logging.getLogger, (obj.name,), obj=obj) - - dispatch[logging.Logger] = save_logger - - def save_root_logger(self, obj): - self.save_reduce(logging.getLogger, (), obj=obj) - - dispatch[logging.RootLogger] = save_root_logger - - """Special functions for Add-on libraries""" - def inject_addons(self): - """Plug in system. Register additional pickling functions if modules already loaded""" - pass - - -# Tornado support - -def is_tornado_coroutine(func): - """ - Return whether *func* is a Tornado coroutine function. - Running coroutines are not supported. - """ - if 'tornado.gen' not in sys.modules: - return False - gen = sys.modules['tornado.gen'] - if not hasattr(gen, "is_coroutine_function"): - # Tornado version is too old - return False - return gen.is_coroutine_function(func) - - -def _rebuild_tornado_coroutine(func): - from tornado import gen - return gen.coroutine(func) - - -# Shorthands for legacy support - -def dump(obj, file, protocol=None): - """Serialize obj as bytes streamed into file - - protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to - pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed - between processes running the same Python version. - - Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure - compatibility with older versions of Python. - """ - CloudPickler(file, protocol=protocol).dump(obj) - - -def dumps(obj, protocol=None): - """Serialize obj as a string of bytes allocated in memory - - protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to - pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed - between processes running the same Python version. - - Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure - compatibility with older versions of Python. 
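(Editor's note, not part of the diff: a hedged usage sketch of the ``dump``/``dumps`` shorthands documented above, assuming the standalone ``cloudpickle`` package is installed. The default protocol favors speed between same-version processes; ``pickle.DEFAULT_PROTOCOL`` is the safer choice across Python versions.)

import io
import pickle
import cloudpickle  # assumption: the standalone package is installed

square = lambda x: x * x   # lambdas are exactly the kind of object stdlib pickle rejects

fast = cloudpickle.dumps(square)        # default protocol == pickle.HIGHEST_PROTOCOL
portable = cloudpickle.dumps(square, protocol=pickle.DEFAULT_PROTOCOL)

buf = io.BytesIO()
cloudpickle.dump(square, buf)           # same serialization, streamed into a file-like object

assert pickle.loads(fast)(3) == 9
assert pickle.loads(portable)(3) == 9
assert pickle.loads(buf.getvalue())(3) == 9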
- """ - file = StringIO() - try: - cp = CloudPickler(file, protocol=protocol) - cp.dump(obj) - return file.getvalue() - finally: - file.close() - - -# including pickles unloading functions in this namespace -load = pickle.load -loads = pickle.loads - - -# hack for __import__ not working as desired -def subimport(name): - __import__(name) - return sys.modules[name] - - -def dynamic_subimport(name, vars): - mod = types.ModuleType(name) - mod.__dict__.update(vars) - return mod - - -# restores function attributes -def _restore_attr(obj, attr): - for key, val in attr.items(): - setattr(obj, key, val) - return obj - - -def _get_module_builtins(): - return pickle.__builtins__ - - -def print_exec(stream): - ei = sys.exc_info() - traceback.print_exception(ei[0], ei[1], ei[2], None, stream) - - -def _modules_to_main(modList): - """Force every module in modList to be placed into main""" - if not modList: - return - - main = sys.modules['__main__'] - for modname in modList: - if type(modname) is str: - try: - mod = __import__(modname) - except Exception: - sys.stderr.write('warning: could not import %s\n. ' - 'Your function may unexpectedly error due to this import failing;' - 'A version mismatch is likely. Specific error was:\n' % modname) - print_exec(sys.stderr) - else: - setattr(main, mod.__name__, mod) - - -# object generators: -def _genpartial(func, args, kwds): - if not args: - args = () - if not kwds: - kwds = {} - return partial(func, *args, **kwds) - - -def _gen_ellipsis(): - return Ellipsis - - -def _gen_not_implemented(): - return NotImplemented - - -def _get_cell_contents(cell): - try: - return cell.cell_contents - except ValueError: - # sentinel used by ``_fill_function`` which will leave the cell empty - return _empty_cell_value - - -def instance(cls): - """Create a new instance of a class. - - Parameters - ---------- - cls : type - The class to create an instance of. - - Returns - ------- - instance : cls - A new instance of ``cls``. - """ - return cls() - - -@instance -class _empty_cell_value(object): - """sentinel for empty closures - """ - @classmethod - def __reduce__(cls): - return cls.__name__ - - -def _fill_function(*args): - """Fills in the rest of function data into the skeleton function object - - The skeleton itself is create by _make_skel_func(). - """ - if len(args) == 2: - func = args[0] - state = args[1] - elif len(args) == 5: - # Backwards compat for cloudpickle v0.4.0, after which the `module` - # argument was introduced - func = args[0] - keys = ['globals', 'defaults', 'dict', 'closure_values'] - state = dict(zip(keys, args[1:])) - elif len(args) == 6: - # Backwards compat for cloudpickle v0.4.1, after which the function - # state was passed as a dict to the _fill_function it-self. - func = args[0] - keys = ['globals', 'defaults', 'dict', 'module', 'closure_values'] - state = dict(zip(keys, args[1:])) - else: - raise ValueError('Unexpected _fill_value arguments: %r' % (args,)) - - # Only set global variables that do not exist. 
- for k, v in state['globals'].items(): - if k not in func.__globals__: - func.__globals__[k] = v - - func.__defaults__ = state['defaults'] - func.__dict__ = state['dict'] - if 'annotations' in state: - func.__annotations__ = state['annotations'] - if 'doc' in state: - func.__doc__ = state['doc'] - if 'name' in state: - func.__name__ = state['name'] - if 'module' in state: - func.__module__ = state['module'] - if 'qualname' in state: - func.__qualname__ = state['qualname'] - - cells = func.__closure__ - if cells is not None: - for cell, value in zip(cells, state['closure_values']): - if value is not _empty_cell_value: - cell_set(cell, value) - - return func - - -def _make_empty_cell(): - if False: - # trick the compiler into creating an empty cell in our lambda - cell = None - raise AssertionError('this route should not be executed') - - return (lambda: cell).__closure__[0] - - -def _make_skel_func(code, cell_count, base_globals=None): - """ Creates a skeleton function object that contains just the provided - code and the correct number of cells in func_closure. All other - func attributes (e.g. func_globals) are empty. - """ - if base_globals is None: - base_globals = {} - elif isinstance(base_globals, str): - base_globals_name = base_globals - try: - # First try to reuse the globals from the module containing the - # function. If it is not possible to retrieve it, fallback to an - # empty dictionary. - base_globals = vars(importlib.import_module(base_globals)) - except ImportError: - base_globals = _dynamic_modules_globals.get( - base_globals_name, None) - if base_globals is None: - base_globals = _DynamicModuleFuncGlobals() - _dynamic_modules_globals[base_globals_name] = base_globals - - base_globals['__builtins__'] = __builtins__ - - closure = ( - tuple(_make_empty_cell() for _ in range(cell_count)) - if cell_count >= 0 else - None - ) - return types.FunctionType(code, base_globals, None, None, closure) - - -def _rehydrate_skeleton_class(skeleton_class, class_dict): - """Put attributes from `class_dict` back on `skeleton_class`. - - See CloudPickler.save_dynamic_class for more info. - """ - registry = None - for attrname, attr in class_dict.items(): - if attrname == "_abc_impl": - registry = attr - else: - setattr(skeleton_class, attrname, attr) - if registry is not None: - for subclass in registry: - skeleton_class.register(subclass) - - return skeleton_class - - -def _is_dynamic(module): - """ - Return True if the module is special module that cannot be imported by its - name. - """ - # Quick check: module that have __file__ attribute are not dynamic modules. 
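(Editor's note, not part of the diff: a small illustration of the check that follows, under the assumption of Python 3. A module synthesized at runtime has no ``__file__`` and no usable ``__spec__``, so it must be pickled by value rather than re-imported by name; the module name used here is hypothetical.)

import os
import sys
import types

dyn = types.ModuleType("made_up_module")   # hypothetical, built at runtime
dyn.answer = 42

print(hasattr(os, "__file__"))                                  # True  -> normal file-backed module
print(hasattr(sys, "__file__"),
      getattr(sys, "__spec__", None) is not None)               # False True -> importable builtin
print(hasattr(dyn, "__file__"),
      getattr(dyn, "__spec__", None) is None)                   # False True -> dynamic, pickled by value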
- if hasattr(module, '__file__'): - return False - - if hasattr(module, '__spec__'): - return module.__spec__ is None - else: - # Backward compat for Python 2 - import imp - try: - path = None - for part in module.__name__.split('.'): - if path is not None: - path = [path] - f, path, description = imp.find_module(part, path) - if f is not None: - f.close() - except ImportError: - return True - return False - - -"""Constructors for 3rd party libraries -Note: These can never be renamed due to client compatibility issues""" - - -def _getobject(modname, attribute): - mod = __import__(modname, fromlist=[attribute]) - return mod.__dict__[attribute] - - -""" Use copy_reg to extend global pickle definitions """ - -if sys.version_info < (3, 4): - method_descriptor = type(str.upper) - - def _reduce_method_descriptor(obj): - return (getattr, (obj.__objclass__, obj.__name__)) - - try: - import copy_reg as copyreg - except ImportError: - import copyreg - copyreg.pickle(method_descriptor, _reduce_method_descriptor) diff --git a/sklearn/externals/joblib/externals/loky/__init__.py b/sklearn/externals/joblib/externals/loky/__init__.py deleted file mode 100644 index 3d7864fc5379a..0000000000000 --- a/sklearn/externals/joblib/externals/loky/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -r"""The :mod:`loky` module manages a pool of worker that can be re-used across time. -It provides a robust and dynamic implementation os the -:class:`ProcessPoolExecutor` and a function :func:`get_reusable_executor` which -hide the pool management under the hood. -""" -from ._base import Executor, Future -from ._base import wait, as_completed -from ._base import TimeoutError, CancelledError -from ._base import ALL_COMPLETED, FIRST_COMPLETED, FIRST_EXCEPTION - -from .backend.context import cpu_count -from .backend.reduction import set_loky_pickler -from .reusable_executor import get_reusable_executor -from .cloudpickle_wrapper import wrap_non_picklable_objects -from .process_executor import BrokenProcessPool, ProcessPoolExecutor - - -__all__ = ["get_reusable_executor", "cpu_count", "wait", "as_completed", - "Future", "Executor", "ProcessPoolExecutor", - "BrokenProcessPool", "CancelledError", "TimeoutError", - "FIRST_COMPLETED", "FIRST_EXCEPTION", "ALL_COMPLETED", - "wrap_non_picklable_objects", "set_loky_pickler"] - - -__version__ = '2.4.2' diff --git a/sklearn/externals/joblib/externals/loky/_base.py b/sklearn/externals/joblib/externals/loky/_base.py deleted file mode 100644 index 92422bbf3f2a4..0000000000000 --- a/sklearn/externals/joblib/externals/loky/_base.py +++ /dev/null @@ -1,627 +0,0 @@ -############################################################################### -# Backport concurrent.futures for python2.7/3.3 -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from concurrent/futures/_base.py (17/02/2017) -# * Do not use yield from -# * Use old super syntax -# -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. 
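(Editor's note, not part of the diff: the file removed below backports the ``concurrent.futures`` primitives for Python 2.7; on Python >= 3.3 it simply re-exports the standard library, so the equivalent modern usage looks like the stdlib-only sketch below.)

import concurrent.futures as cf


def slow_square(x):
    return x * x


with cf.ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(slow_square, i) for i in range(4)]
    # Block until everything finished, mirroring the backported wait().
    done, not_done = cf.wait(futures, return_when=cf.ALL_COMPLETED)

assert sorted(f.result() for f in done) == [0, 1, 4, 9]
assert not not_done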
- -import sys -import time -import logging -import threading -import collections - - -if sys.version_info[:2] >= (3, 3): - - from concurrent.futures import wait, as_completed - from concurrent.futures import TimeoutError, CancelledError - from concurrent.futures import Executor, Future as _BaseFuture - - from concurrent.futures import FIRST_EXCEPTION - from concurrent.futures import ALL_COMPLETED, FIRST_COMPLETED - - from concurrent.futures._base import LOGGER - from concurrent.futures._base import PENDING, RUNNING, CANCELLED - from concurrent.futures._base import CANCELLED_AND_NOTIFIED, FINISHED -else: - - FIRST_COMPLETED = 'FIRST_COMPLETED' - FIRST_EXCEPTION = 'FIRST_EXCEPTION' - ALL_COMPLETED = 'ALL_COMPLETED' - _AS_COMPLETED = '_AS_COMPLETED' - - # Possible future states (for internal use by the futures package). - PENDING = 'PENDING' - RUNNING = 'RUNNING' - # The future was cancelled by the user... - CANCELLED = 'CANCELLED' - # ...and _Waiter.add_cancelled() was called by a worker. - CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' - FINISHED = 'FINISHED' - - _FUTURE_STATES = [ - PENDING, - RUNNING, - CANCELLED, - CANCELLED_AND_NOTIFIED, - FINISHED - ] - - _STATE_TO_DESCRIPTION_MAP = { - PENDING: "pending", - RUNNING: "running", - CANCELLED: "cancelled", - CANCELLED_AND_NOTIFIED: "cancelled", - FINISHED: "finished" - } - - # Logger for internal use by the futures package. - LOGGER = logging.getLogger("concurrent.futures") - - class Error(Exception): - """Base class for all future-related exceptions.""" - pass - - class CancelledError(Error): - """The Future was cancelled.""" - pass - - class TimeoutError(Error): - """The operation exceeded the given deadline.""" - pass - - class _Waiter(object): - """Provides the event that wait() and as_completed() block on.""" - def __init__(self): - self.event = threading.Event() - self.finished_futures = [] - - def add_result(self, future): - self.finished_futures.append(future) - - def add_exception(self, future): - self.finished_futures.append(future) - - def add_cancelled(self, future): - self.finished_futures.append(future) - - class _AsCompletedWaiter(_Waiter): - """Used by as_completed().""" - - def __init__(self): - super(_AsCompletedWaiter, self).__init__() - self.lock = threading.Lock() - - def add_result(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - with self.lock: - super(_AsCompletedWaiter, self).add_cancelled(future) - self.event.set() - - class _FirstCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_COMPLETED).""" - - def add_result(self, future): - super(_FirstCompletedWaiter, self).add_result(future) - self.event.set() - - def add_exception(self, future): - super(_FirstCompletedWaiter, self).add_exception(future) - self.event.set() - - def add_cancelled(self, future): - super(_FirstCompletedWaiter, self).add_cancelled(future) - self.event.set() - - class _AllCompletedWaiter(_Waiter): - """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" - - def __init__(self, num_pending_calls, stop_on_exception): - self.num_pending_calls = num_pending_calls - self.stop_on_exception = stop_on_exception - self.lock = threading.Lock() - super(_AllCompletedWaiter, self).__init__() - - def _decrement_pending_calls(self): - with self.lock: - self.num_pending_calls -= 1 - if not self.num_pending_calls: 
- self.event.set() - - def add_result(self, future): - super(_AllCompletedWaiter, self).add_result(future) - self._decrement_pending_calls() - - def add_exception(self, future): - super(_AllCompletedWaiter, self).add_exception(future) - if self.stop_on_exception: - self.event.set() - else: - self._decrement_pending_calls() - - def add_cancelled(self, future): - super(_AllCompletedWaiter, self).add_cancelled(future) - self._decrement_pending_calls() - - class _AcquireFutures(object): - """A context manager that does an ordered acquire of Future conditions. - """ - - def __init__(self, futures): - self.futures = sorted(futures, key=id) - - def __enter__(self): - for future in self.futures: - future._condition.acquire() - - def __exit__(self, *args): - for future in self.futures: - future._condition.release() - - def _create_and_install_waiters(fs, return_when): - if return_when == _AS_COMPLETED: - waiter = _AsCompletedWaiter() - elif return_when == FIRST_COMPLETED: - waiter = _FirstCompletedWaiter() - else: - pending_count = sum( - f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] - for f in fs) - - if return_when == FIRST_EXCEPTION: - waiter = _AllCompletedWaiter(pending_count, - stop_on_exception=True) - elif return_when == ALL_COMPLETED: - waiter = _AllCompletedWaiter(pending_count, - stop_on_exception=False) - else: - raise ValueError("Invalid return condition: %r" % return_when) - - for f in fs: - f._waiters.append(waiter) - - return waiter - - def as_completed(fs, timeout=None): - """An iterator over the given futures that yields each as it completes. - - Args: - fs: The sequence of Futures (possibly created by different - Executors) to iterate over. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - - Returns: - An iterator that yields the given Futures as they complete - (finished or cancelled). If any given Futures are duplicated, they - will be returned once. - - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - """ - if timeout is not None: - end_time = timeout + time.time() - - fs = set(fs) - with _AcquireFutures(fs): - finished = set( - f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - pending = fs - finished - waiter = _create_and_install_waiters(fs, _AS_COMPLETED) - - try: - for future in finished: - yield future - - while pending: - if timeout is None: - wait_timeout = None - else: - wait_timeout = end_time - time.time() - if wait_timeout < 0: - raise TimeoutError('%d (of %d) futures unfinished' % ( - len(pending), len(fs))) - - waiter.event.wait(wait_timeout) - - with waiter.lock: - finished = waiter.finished_futures - waiter.finished_futures = [] - waiter.event.clear() - - for future in finished: - yield future - pending.remove(future) - - finally: - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - DoneAndNotDoneFutures = collections.namedtuple( - 'DoneAndNotDoneFutures', 'done not_done') - - def wait(fs, timeout=None, return_when=ALL_COMPLETED): - """Wait for the futures in the given sequence to complete. - - Args: - fs: The sequence of Futures (possibly created by different - Executors) to wait upon. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - return_when: Indicates when this function should return. The - options are: - - FIRST_COMPLETED - Return when any future finishes or is - cancelled. - FIRST_EXCEPTION - Return when any future finishes by raising an - exception. 
If no future raises an exception - then it is equivalent to ALL_COMPLETED. - ALL_COMPLETED - Return when all futures finish or are - cancelled. - - Returns: - A named 2-tuple of sets. The first set, named 'done', contains the - futures that completed (is finished or cancelled) before the wait - completed. The second set, named 'not_done', contains uncompleted - futures. - """ - with _AcquireFutures(fs): - done = set(f for f in fs - if f._state in [CANCELLED_AND_NOTIFIED, FINISHED]) - not_done = set(fs) - done - - if (return_when == FIRST_COMPLETED) and done: - return DoneAndNotDoneFutures(done, not_done) - elif (return_when == FIRST_EXCEPTION) and done: - if any(f for f in done - if not f.cancelled() and f.exception() is not None): - return DoneAndNotDoneFutures(done, not_done) - - if len(done) == len(fs): - return DoneAndNotDoneFutures(done, not_done) - - waiter = _create_and_install_waiters(fs, return_when) - - waiter.event.wait(timeout) - for f in fs: - with f._condition: - f._waiters.remove(waiter) - - done.update(waiter.finished_futures) - return DoneAndNotDoneFutures(done, set(fs) - done) - - class _BaseFuture(object): - """Represents the result of an asynchronous computation.""" - - def __init__(self): - """Initializes the future. Should not be called by clients.""" - self._condition = threading.Condition() - self._state = PENDING - self._result = None - self._exception = None - self._waiters = [] - self._done_callbacks = [] - - def __repr__(self): - with self._condition: - if self._state == FINISHED: - if self._exception: - return '<%s at %#x state=%s raised %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._exception.__class__.__name__) - else: - return '<%s at %#x state=%s returned %s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state], - self._result.__class__.__name__) - return '<%s at %#x state=%s>' % ( - self.__class__.__name__, - id(self), - _STATE_TO_DESCRIPTION_MAP[self._state]) - - def cancel(self): - """Cancel the future if possible. - - Returns True if the future was cancelled, False otherwise. A future - cannot be cancelled if it is running or has already completed. - """ - with self._condition: - if self._state in [RUNNING, FINISHED]: - return False - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - return True - - self._state = CANCELLED - self._condition.notify_all() - - self._invoke_callbacks() - return True - - def cancelled(self): - """Return True if the future was cancelled.""" - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED] - - def running(self): - """Return True if the future is currently executing.""" - with self._condition: - return self._state == RUNNING - - def done(self): - """Return True of the future was cancelled or finished executing. - """ - with self._condition: - return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, - FINISHED] - - def __get_result(self): - if self._exception: - raise self._exception - else: - return self._result - - def add_done_callback(self, fn): - """Attaches a callable that will be called when the future finishes. - - Args: - fn: A callable that will be called with this future as its only - argument when the future completes or is cancelled. The - callable will always be called by a thread in the same - process in which it was added. If the future has already - completed or been cancelled then the callable will be - called immediately. These callables are called in the order - that they were added. 
- """ - with self._condition: - if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, - FINISHED]: - self._done_callbacks.append(fn) - return - fn(self) - - def result(self, timeout=None): - """Return the result of the call that the future represents. - - Args: - timeout: The number of seconds to wait for the result if the - future isn't done. If None, then there is no limit on the - wait time. - - Returns: - The result of the call that the future represents. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. - Exception: If the call raised then that exception will be - raised. - """ - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self.__get_result() - else: - raise TimeoutError() - - def exception(self, timeout=None): - """Return the exception raised by the call that the future - represents. - - Args: - timeout: The number of seconds to wait for the exception if the - future isn't done. If None, then there is no limit on the - wait time. - - Returns: - The exception raised by the call that the future represents or - None if the call completed without raising. - - Raises: - CancelledError: If the future was cancelled. - TimeoutError: If the future didn't finish executing before the - given timeout. - """ - - with self._condition: - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - - self._condition.wait(timeout) - - if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: - raise CancelledError() - elif self._state == FINISHED: - return self._exception - else: - raise TimeoutError() - - # The following methods should only be used by Executors and in tests. - def set_running_or_notify_cancel(self): - """Mark the future as running or process any cancel notifications. - - Should only be used by Executor implementations and unit tests. - - If the future has been cancelled (cancel() was called and returned - True) then any threads waiting on the future completing (though - calls to as_completed() or wait()) are notified and False is - returned. - - If the future was not cancelled then it is put in the running state - (future calls to running() will return True) and True is returned. - - This method should be called by Executor implementations before - executing the work associated with this future. If this method - returns False then the work should not be executed. - - Returns: - False if the Future was cancelled, True otherwise. - - Raises: - RuntimeError: if this method was already called or if - set_result() or set_exception() was called. - """ - with self._condition: - if self._state == CANCELLED: - self._state = CANCELLED_AND_NOTIFIED - for waiter in self._waiters: - waiter.add_cancelled(self) - # self._condition.notify_all() is not necessary because - # self.cancel() triggers a notification. - return False - elif self._state == PENDING: - self._state = RUNNING - return True - else: - LOGGER.critical('Future %s in unexpected state: %s', - id(self), - self._state) - raise RuntimeError('Future in unexpected state') - - def set_result(self, result): - """Sets the return value of work associated with the future. 
- - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._result = result - self._state = FINISHED - for waiter in self._waiters: - waiter.add_result(self) - self._condition.notify_all() - self._invoke_callbacks() - - def set_exception(self, exception): - """Sets the result of the future as being the given exception. - - Should only be used by Executor implementations and unit tests. - """ - with self._condition: - self._exception = exception - self._state = FINISHED - for waiter in self._waiters: - waiter.add_exception(self) - self._condition.notify_all() - self._invoke_callbacks() - - class Executor(object): - """This is an abstract base class for concrete asynchronous executors. - """ - - def submit(self, fn, *args, **kwargs): - """Submits a callable to be executed with the given arguments. - - Schedules the callable to be executed as fn(*args, **kwargs) and - returns a Future instance representing the execution of the - callable. - - Returns: - A Future representing the given call. - """ - raise NotImplementedError() - - def map(self, fn, *iterables, **kwargs): - """Returns an iterator equivalent to map(fn, iter). - - Args: - fn: A callable that will take as many arguments as there are - passed iterables. - timeout: The maximum number of seconds to wait. If None, then - there is no limit on the wait time. - chunksize: The size of the chunks the iterable will be broken - into before being passed to a child process. This argument - is only used by ProcessPoolExecutor; it is ignored by - ThreadPoolExecutor. - - Returns: - An iterator equivalent to: map(func, *iterables) but the calls - may be evaluated out-of-order. - - Raises: - TimeoutError: If the entire result iterator could not be - generated before the given timeout. - Exception: If fn(*args) raises for any values. - """ - timeout = kwargs.get('timeout') - if timeout is not None: - end_time = timeout + time.time() - - fs = [self.submit(fn, *args) for args in zip(*iterables)] - - # Yield must be hidden in closure so that the futures are submitted - # before the first iterator value is required. - def result_iterator(): - try: - for future in fs: - if timeout is None: - yield future.result() - else: - yield future.result(end_time - time.time()) - finally: - for future in fs: - future.cancel() - return result_iterator() - - def shutdown(self, wait=True): - """Clean-up the resources associated with the Executor. - - It is safe to call this method several times. Otherwise, no other - methods can be called after this one. - - Args: - wait: If True then shutdown will not return until all running - futures have finished executing and the resources used by - the executor have been reclaimed. - """ - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.shutdown(wait=True) - return False - - -# To make loky._base.Future instances awaitable by concurrent.futures.wait, -# derive our custom Future class from _BaseFuture. _invoke_callback is the only -# modification made to this class in loky. 
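(Editor's note, not part of the diff: a stdlib-only sketch of the done-callback behavior described in the docstring above; per the comment below, loky's ``Future`` only customizes how exceptions raised inside callbacks are logged.)

import concurrent.futures as cf


def work(x):
    return x * x


with cf.ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(work, 6)
    # The callback fires once the future finishes (or immediately if it already has).
    fut.add_done_callback(lambda f: print("callback saw result:", f.result()))

assert fut.result() == 36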
-class Future(_BaseFuture): - def _invoke_callbacks(self): - for callback in self._done_callbacks: - try: - callback(self) - except BaseException: - LOGGER.exception('exception calling callback for %r', self) diff --git a/sklearn/externals/joblib/externals/loky/backend/__init__.py b/sklearn/externals/joblib/externals/loky/backend/__init__.py deleted file mode 100644 index a65ce0e8b0bb1..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import sys - -from .context import get_context - -if sys.version_info > (3, 4): - - def _make_name(): - name = '/loky-%i-%s' % (os.getpid(), next(synchronize.SemLock._rand)) - return name - - # monkey patch the name creation for multiprocessing - from multiprocessing import synchronize - synchronize.SemLock._make_name = staticmethod(_make_name) - -__all__ = ["get_context"] diff --git a/sklearn/externals/joblib/externals/loky/backend/_posix_reduction.py b/sklearn/externals/joblib/externals/loky/backend/_posix_reduction.py deleted file mode 100644 index e0e394d3cdf27..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/_posix_reduction.py +++ /dev/null @@ -1,76 +0,0 @@ -############################################################################### -# Extra reducers for Unix based system and connections objects -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/reduction.py (17/02/2017) -# * Add adapted reduction for LokyProcesses and socket/Connection -# -import os -import sys -import socket -import _socket - -from .reduction import register -from .context import get_spawning_popen - -if sys.version_info >= (3, 3): - from multiprocessing.connection import Connection -else: - from _multiprocessing import Connection - - -HAVE_SEND_HANDLE = (hasattr(socket, 'CMSG_LEN') and - hasattr(socket, 'SCM_RIGHTS') and - hasattr(socket.socket, 'sendmsg')) - - -def _mk_inheritable(fd): - if sys.version_info[:2] > (3, 3): - os.set_inheritable(fd, True) - return fd - - -def DupFd(fd): - '''Return a wrapper for an fd.''' - popen_obj = get_spawning_popen() - if popen_obj is not None: - return popen_obj.DupFd(popen_obj.duplicate_for_child(fd)) - elif HAVE_SEND_HANDLE and sys.version_info[:2] > (3, 3): - from multiprocessing import resource_sharer - return resource_sharer.DupFd(fd) - else: - raise TypeError( - 'Cannot pickle connection object. 
This object can only be ' - 'passed when spawning a new process' - ) - - -if sys.version_info[:2] != (3, 3): - def _reduce_socket(s): - df = DupFd(s.fileno()) - return _rebuild_socket, (df, s.family, s.type, s.proto) - - def _rebuild_socket(df, family, type, proto): - fd = df.detach() - return socket.fromfd(fd, family, type, proto) -else: - from multiprocessing.reduction import reduce_socket as _reduce_socket - - -register(socket.socket, _reduce_socket) -register(_socket.socket, _reduce_socket) - - -if sys.version_info[:2] != (3, 3): - def reduce_connection(conn): - df = DupFd(conn.fileno()) - return rebuild_connection, (df, conn.readable, conn.writable) - - def rebuild_connection(df, readable, writable): - fd = df.detach() - return Connection(fd, readable, writable) -else: - from multiprocessing.reduction import reduce_connection - -register(Connection, reduce_connection) diff --git a/sklearn/externals/joblib/externals/loky/backend/_posix_wait.py b/sklearn/externals/joblib/externals/loky/backend/_posix_wait.py deleted file mode 100644 index d935882dca5d5..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/_posix_wait.py +++ /dev/null @@ -1,105 +0,0 @@ -############################################################################### -# Compat for wait function on UNIX based system -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/connection.py (17/02/2017) -# * Backport wait function to python2.7 -# - -import platform -import select -import socket -import errno -SYSTEM = platform.system() - -try: - import ctypes -except ImportError: # pragma: no cover - ctypes = None # noqa - -if SYSTEM == 'Darwin' and ctypes is not None: - from ctypes.util import find_library - libSystem = ctypes.CDLL(find_library('libSystem.dylib')) - CoreServices = ctypes.CDLL(find_library('CoreServices'), - use_errno=True) - mach_absolute_time = libSystem.mach_absolute_time - mach_absolute_time.restype = ctypes.c_uint64 - absolute_to_nanoseconds = CoreServices.AbsoluteToNanoseconds - absolute_to_nanoseconds.restype = ctypes.c_uint64 - absolute_to_nanoseconds.argtypes = [ctypes.c_uint64] - - def monotonic(): - return absolute_to_nanoseconds(mach_absolute_time()) * 1e-9 - -elif SYSTEM == 'Linux' and ctypes is not None: - # from stackoverflow: - # questions/1205722/how-do-i-get-monotonic-time-durations-in-python - import ctypes - import os - - CLOCK_MONOTONIC = 1 # see - - class timespec(ctypes.Structure): - _fields_ = [ - ('tv_sec', ctypes.c_long), - ('tv_nsec', ctypes.c_long), - ] - - librt = ctypes.CDLL('librt.so.1', use_errno=True) - clock_gettime = librt.clock_gettime - clock_gettime.argtypes = [ - ctypes.c_int, ctypes.POINTER(timespec), - ] - - def monotonic(): # noqa - t = timespec() - if clock_gettime(CLOCK_MONOTONIC, ctypes.pointer(t)) != 0: - errno_ = ctypes.get_errno() - raise OSError(errno_, os.strerror(errno_)) - return t.tv_sec + t.tv_nsec * 1e-9 -else: # pragma: no cover - from time import time as monotonic - - -if hasattr(select, 'poll'): - def _poll(fds, timeout): - if timeout is not None: - timeout = int(timeout * 1000) # timeout is in milliseconds - fd_map = {} - pollster = select.poll() - for fd in fds: - pollster.register(fd, select.POLLIN) - if hasattr(fd, 'fileno'): - fd_map[fd.fileno()] = fd - else: - fd_map[fd] = fd - ls = [] - for fd, event in pollster.poll(timeout): - if event & select.POLLNVAL: # pragma: no cover - raise ValueError('invalid file descriptor %i' % fd) - ls.append(fd_map[fd]) - return ls -else: - def _poll(fds, timeout): - return 
select.select(fds, [], [], timeout)[0] - - -def wait(object_list, timeout=None): - ''' - Wait till an object in object_list is ready/readable. - Returns list of those objects which are ready/readable. - ''' - if timeout is not None: - if timeout <= 0: - return _poll(object_list, 0) - else: - deadline = monotonic() + timeout - while True: - try: - return _poll(object_list, timeout) - except (OSError, IOError, socket.error) as e: # pragma: no cover - if e.errno != errno.EINTR: - raise - if timeout is not None: - timeout = deadline - monotonic() diff --git a/sklearn/externals/joblib/externals/loky/backend/_win_reduction.py b/sklearn/externals/joblib/externals/loky/backend/_win_reduction.py deleted file mode 100644 index 142e6e7c80ddc..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/_win_reduction.py +++ /dev/null @@ -1,99 +0,0 @@ -############################################################################### -# Extra reducers for Windows system and connections objects -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/reduction.py (17/02/2017) -# * Add adapted reduction for LokyProcesses and socket/PipeConnection -# -import os -import sys -import socket -from .reduction import register - - -if sys.platform == 'win32': - if sys.version_info[:2] < (3, 3): - from _multiprocessing import PipeConnection - else: - import _winapi - from multiprocessing.connection import PipeConnection - - -if sys.version_info[:2] >= (3, 4) and sys.platform == 'win32': - class DupHandle(object): - def __init__(self, handle, access, pid=None): - # duplicate handle for process with given pid - if pid is None: - pid = os.getpid() - proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, pid) - try: - self._handle = _winapi.DuplicateHandle( - _winapi.GetCurrentProcess(), - handle, proc, access, False, 0) - finally: - _winapi.CloseHandle(proc) - self._access = access - self._pid = pid - - def detach(self): - # retrieve handle from process which currently owns it - if self._pid == os.getpid(): - return self._handle - proc = _winapi.OpenProcess(_winapi.PROCESS_DUP_HANDLE, False, - self._pid) - try: - return _winapi.DuplicateHandle( - proc, self._handle, _winapi.GetCurrentProcess(), - self._access, False, _winapi.DUPLICATE_CLOSE_SOURCE) - finally: - _winapi.CloseHandle(proc) - - def reduce_pipe_connection(conn): - access = ((_winapi.FILE_GENERIC_READ if conn.readable else 0) | - (_winapi.FILE_GENERIC_WRITE if conn.writable else 0)) - dh = DupHandle(conn.fileno(), access) - return rebuild_pipe_connection, (dh, conn.readable, conn.writable) - - def rebuild_pipe_connection(dh, readable, writable): - from multiprocessing.connection import PipeConnection - handle = dh.detach() - return PipeConnection(handle, readable, writable) - register(PipeConnection, reduce_pipe_connection) - -elif sys.platform == 'win32': - # Older Python versions - from multiprocessing.reduction import reduce_pipe_connection - register(PipeConnection, reduce_pipe_connection) - - -if sys.version_info[:2] < (3, 3) and sys.platform == 'win32': - from _multiprocessing import win32 - from multiprocessing.reduction import reduce_handle, rebuild_handle - close = win32.CloseHandle - - def fromfd(handle, family, type_, proto=0): - s = socket.socket(family, type_, proto, fileno=handle) - if s.__class__ is not socket.socket: - s = socket.socket(_sock=s) - return s - - def reduce_socket(s): - if not hasattr(socket, "fromfd"): - raise TypeError("sockets cannot be pickled on this system.") - reduced_handle = 
reduce_handle(s.fileno()) - return _rebuild_socket, (reduced_handle, s.family, s.type, s.proto) - - def _rebuild_socket(reduced_handle, family, type_, proto): - handle = rebuild_handle(reduced_handle) - s = fromfd(handle, family, type_, proto) - close(handle) - return s - - register(socket.socket, reduce_socket) -elif sys.version_info[:2] < (3, 4): - from multiprocessing.reduction import reduce_socket - register(socket.socket, reduce_socket) -else: - from multiprocessing.reduction import _reduce_socket - register(socket.socket, _reduce_socket) diff --git a/sklearn/externals/joblib/externals/loky/backend/_win_wait.py b/sklearn/externals/joblib/externals/loky/backend/_win_wait.py deleted file mode 100644 index 73271316d05af..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/_win_wait.py +++ /dev/null @@ -1,58 +0,0 @@ -############################################################################### -# Compat for wait function on Windows system -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/connection.py (17/02/2017) -# * Backport wait function to python2.7 -# - -import ctypes -import sys -from time import sleep - - -if sys.platform == 'win32' and sys.version_info[:2] < (3, 3): - from _subprocess import WaitForSingleObject, WAIT_OBJECT_0 - - try: - from time import monotonic - except ImportError: - # Backward old for crappy old Python that did not have cross-platform - # monotonic clock by default. - - # TODO: do we want to add support for cygwin at some point? See: - # https://github.com/atdt/monotonic/blob/master/monotonic.py - GetTickCount64 = ctypes.windll.kernel32.GetTickCount64 - GetTickCount64.restype = ctypes.c_ulonglong - - def monotonic(): - """Monotonic clock, cannot go backward.""" - return GetTickCount64() / 1000.0 - - def wait(handles, timeout=None): - """Backward compat for python2.7 - - This function wait for either: - * one connection is ready for read, - * one process handle has exited or got killed, - * timeout is reached. Note that this function has a precision of 2 - msec. - """ - if timeout is not None: - deadline = monotonic() + timeout - - while True: - # We cannot use select as in windows it only support sockets - ready = [] - for h in handles: - if type(h) in [int, long]: - if WaitForSingleObject(h, 0) == WAIT_OBJECT_0: - ready += [h] - elif h.poll(0): - ready.append(h) - if len(ready) > 0: - return ready - sleep(.001) - if timeout is not None and deadline - monotonic() <= 0: - return [] diff --git a/sklearn/externals/joblib/externals/loky/backend/compat.py b/sklearn/externals/joblib/externals/loky/backend/compat.py deleted file mode 100644 index aa406c6cfdf92..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/compat.py +++ /dev/null @@ -1,41 +0,0 @@ -############################################################################### -# Compat file to import the correct modules for each platform and python -# version. -# -# author: Thomas Moreau and Olivier grisel -# -import sys - -PY3 = sys.version_info[:2] >= (3, 3) - -if PY3: - import queue -else: - import Queue as queue - -if sys.version_info >= (3, 4): - from multiprocessing.process import BaseProcess -else: - from multiprocessing.process import Process as BaseProcess - -# Platform specific compat -if sys.platform == "win32": - from .compat_win32 import wait -else: - from .compat_posix import wait - - -def set_cause(exc, cause): - exc.__cause__ = cause - - if not PY3: - # Preformat message here. 
- if exc.__cause__ is not None: - exc.args = ("{}\n\nThis was caused directly by {}".format( - exc.args if len(exc.args) != 1 else exc.args[0], - str(exc.__cause__)),) - - return exc - - -__all__ = ["queue", "BaseProcess", "set_cause", "wait"] diff --git a/sklearn/externals/joblib/externals/loky/backend/compat_posix.py b/sklearn/externals/joblib/externals/loky/backend/compat_posix.py deleted file mode 100644 index c8e4e4a43cec6..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/compat_posix.py +++ /dev/null @@ -1,13 +0,0 @@ -# flake8: noqa -############################################################################### -# Compat file to load the correct wait function -# -# author: Thomas Moreau and Olivier grisel -# -import sys - -# Compat wait -if sys.version_info < (3, 3): - from ._posix_wait import wait -else: - from multiprocessing.connection import wait diff --git a/sklearn/externals/joblib/externals/loky/backend/compat_win32.py b/sklearn/externals/joblib/externals/loky/backend/compat_win32.py deleted file mode 100644 index aa0a1fa919a6b..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/compat_win32.py +++ /dev/null @@ -1,59 +0,0 @@ -# flake8: noqa: F401 -import sys -import numbers - -if sys.platform == "win32": - # Avoid import error by code introspection tools such as test runners - # trying to import this module while running on non-Windows systems. - - # Compat Popen - if sys.version_info[:2] >= (3, 4): - from multiprocessing.popen_spawn_win32 import Popen - else: - from multiprocessing.forking import Popen - - # wait compat - if sys.version_info[:2] < (3, 3): - from ._win_wait import wait - else: - from multiprocessing.connection import wait - - # Compat _winapi - if sys.version_info[:2] >= (3, 4): - import _winapi - else: - import os - import msvcrt - if sys.version_info[:2] < (3, 3): - import _subprocess as win_api - from _multiprocessing import win32 - else: - import _winapi as win_api - - class _winapi: - CreateProcess = win_api.CreateProcess - - @staticmethod - def CreatePipe(*args): - rfd, wfd = os.pipe() - _current_process = win_api.GetCurrentProcess() - rhandle = win_api.DuplicateHandle( - _current_process, msvcrt.get_osfhandle(rfd), - _current_process, 0, True, - win_api.DUPLICATE_SAME_ACCESS) - if sys.version_info[:2] < (3, 3): - rhandle = rhandle.Detach() - os.close(rfd) - return rhandle, wfd - - @staticmethod - def CloseHandle(h): - if isinstance(h, numbers.Integral): - # Cast long to int for 64-bit Python 2.7 under Windows - h = int(h) - if sys.version_info[:2] < (3, 3): - if not isinstance(h, int): - h = h.Detach() - win32.CloseHandle(h) - else: - win_api.CloseHandle(h) diff --git a/sklearn/externals/joblib/externals/loky/backend/context.py b/sklearn/externals/joblib/externals/loky/backend/context.py deleted file mode 100644 index 0f744c5918b5c..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/context.py +++ /dev/null @@ -1,265 +0,0 @@ -############################################################################### -# Basic context management with LokyContext and provides -# compat for UNIX 2.7 and 3.3 -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/context.py -# * Create a context ensuring loky uses only objects that are compatible -# * Add LokyContext to the list of context of multiprocessing so loky can be -# used with multiprocessing.set_start_method -# * Add some compat function for python2.7 and 3.3. 
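(Editor's note, not part of the diff: the context module removed below is part of loky, the process-management layer that backs joblib's default "loky" backend in recent joblib releases. With joblib now a regular dependency rather than a vendored copy, the same machinery is reached through the public API, as in this sketch.)

from joblib import Parallel, delayed  # joblib installed as a regular dependency


def square(x):
    return x * x


if __name__ == "__main__":
    # n_jobs=2 runs the calls in two reusable loky worker processes.
    results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(5))
    assert results == [0, 1, 4, 9, 16]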
-# -from __future__ import division - -import os -import sys -import warnings -import multiprocessing as mp - - -from .process import LokyProcess, LokyInitMainProcess - -START_METHODS = ['loky', 'loky_init_main'] -_DEFAULT_START_METHOD = None - -if sys.version_info[:2] >= (3, 4): - from multiprocessing import get_context as mp_get_context - from multiprocessing.context import assert_spawning, set_spawning_popen - from multiprocessing.context import get_spawning_popen, BaseContext - - START_METHODS += ['spawn'] - if sys.platform != 'win32': - START_METHODS += ['fork', 'forkserver'] - - def get_context(method=None): - # Try to overload the default context - method = method or _DEFAULT_START_METHOD or "loky" - if method == "fork": - # If 'fork' is explicitly requested, warn user about potential - # issues. - warnings.warn("`fork` start method should not be used with " - "`loky` as it does not respect POSIX. Try using " - "`spawn` or `loky` instead.", UserWarning) - try: - context = mp_get_context(method) - except ValueError: - raise ValueError("Unknown context '{}'. Value should be in {}." - .format(method, START_METHODS)) - - return context - -else: - if sys.platform != 'win32': - import threading - # Mechanism to check that the current thread is spawning a process - _tls = threading.local() - popen_attr = 'spawning_popen' - else: - from multiprocessing.forking import Popen - _tls = Popen._tls - popen_attr = 'process_handle' - - BaseContext = object - - def get_spawning_popen(): - return getattr(_tls, popen_attr, None) - - def set_spawning_popen(popen): - setattr(_tls, popen_attr, popen) - - def assert_spawning(obj): - if get_spawning_popen() is None: - raise RuntimeError( - '%s objects should only be shared between processes' - ' through inheritance' % type(obj).__name__ - ) - - def get_context(method=None): - method = method or _DEFAULT_START_METHOD or 'loky' - if method == "loky": - return LokyContext() - elif method == "loky_init_main": - return LokyInitMainContext() - else: - raise ValueError("Unknown context '{}'. Value should be in {}." - .format(method, START_METHODS)) - - -def set_start_method(method, force=False): - global _DEFAULT_START_METHOD - if _DEFAULT_START_METHOD is not None and not force: - raise RuntimeError('context has already been set') - assert method is None or method in START_METHODS, ( - "'{}' is not a valid start_method. It should be in {}" - .format(method, START_METHODS)) - - _DEFAULT_START_METHOD = method - - -def get_start_method(): - return _DEFAULT_START_METHOD - - -def cpu_count(): - """Return the number of CPUs the current process can use. - - The returned number of CPUs accounts for: - * the number of CPUs in the system, as given by - ``multiprocessing.cpu_count``; - * the CPU affinity settings of the current process - (available with Python 3.4+ on some Unix systems); - * CFS scheduler CPU bandwidth limit (available on Linux only, typically - set by docker and similar container orchestration systems); - * the value of the LOKY_MAX_CPU_COUNT environment variable if defined. - and is given as the minimum of these constraints. - It is also always larger or equal to 1. 
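(Editor's note, not part of the diff: a simplified re-derivation of the CPU-count rule documented above, not the vendored implementation. It takes the minimum of the hardware count, the process affinity mask, and the LOKY_MAX_CPU_COUNT soft limit; the CFS-quota handling from the removed code is omitted for brevity.)

import os


def effective_cpu_count():
    # Hardware/OS view of the machine.
    total = os.cpu_count() or 1
    # CPUs this process may actually run on (Linux and some other Unixes).
    if hasattr(os, "sched_getaffinity"):
        total = min(total, len(os.sched_getaffinity(0)))
    # User-defined soft limit, mirroring the LOKY_MAX_CPU_COUNT variable above.
    soft_limit = int(os.environ.get("LOKY_MAX_CPU_COUNT", total))
    return max(min(total, soft_limit), 1)


print(effective_cpu_count())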
- """ - import math - - try: - cpu_count_mp = mp.cpu_count() - except NotImplementedError: - cpu_count_mp = 1 - - # Number of available CPUs given affinity settings - cpu_count_affinity = cpu_count_mp - if hasattr(os, 'sched_getaffinity'): - try: - cpu_count_affinity = len(os.sched_getaffinity(0)) - except NotImplementedError: - pass - - # CFS scheduler CPU bandwidth limit - # available in Linux since 2.6 kernel - cpu_count_cfs = cpu_count_mp - cfs_quota_fname = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" - cfs_period_fname = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" - if os.path.exists(cfs_quota_fname) and os.path.exists(cfs_period_fname): - with open(cfs_quota_fname, 'r') as fh: - cfs_quota_us = int(fh.read()) - with open(cfs_period_fname, 'r') as fh: - cfs_period_us = int(fh.read()) - - if cfs_quota_us > 0 and cfs_period_us > 0: - # Make sure this quantity is an int as math.ceil returns a - # float in python2.7. (See issue #165) - cpu_count_cfs = int(math.ceil(cfs_quota_us / cfs_period_us)) - - # User defined soft-limit passed as an loky specific environment variable. - cpu_count_loky = int(os.environ.get('LOKY_MAX_CPU_COUNT', cpu_count_mp)) - aggregate_cpu_count = min(cpu_count_mp, cpu_count_affinity, cpu_count_cfs, - cpu_count_loky) - return max(aggregate_cpu_count, 1) - - -class LokyContext(BaseContext): - """Context relying on the LokyProcess.""" - _name = 'loky' - Process = LokyProcess - cpu_count = staticmethod(cpu_count) - - def Queue(self, maxsize=0, reducers=None): - '''Returns a queue object''' - from .queues import Queue - return Queue(maxsize, reducers=reducers, - ctx=self.get_context()) - - def SimpleQueue(self, reducers=None): - '''Returns a queue object''' - from .queues import SimpleQueue - return SimpleQueue(reducers=reducers, ctx=self.get_context()) - - if sys.version_info[:2] < (3, 4): - """Compat for python2.7/3.3 for necessary methods in Context""" - def get_context(self): - return self - - def get_start_method(self): - return self._name - - def Pipe(self, duplex=True): - '''Returns two connection object connected by a pipe''' - return mp.Pipe(duplex) - - if sys.platform != "win32": - """Use the compat Manager for python2.7/3.3 on UNIX to avoid - relying on fork processes - """ - def Manager(self): - """Returns a manager object""" - from .managers import LokyManager - m = LokyManager() - m.start() - return m - else: - """Compat for context on Windows and python2.7/3.3. Using regular - multiprocessing objects as it does not rely on fork. - """ - from multiprocessing import synchronize - Semaphore = staticmethod(synchronize.Semaphore) - BoundedSemaphore = staticmethod(synchronize.BoundedSemaphore) - Lock = staticmethod(synchronize.Lock) - RLock = staticmethod(synchronize.RLock) - Condition = staticmethod(synchronize.Condition) - Event = staticmethod(synchronize.Event) - Manager = staticmethod(mp.Manager) - - if sys.platform != "win32": - """For Unix platform, use our custom implementation of synchronize - relying on ctypes to interface with pthread semaphores. - """ - def Semaphore(self, value=1): - """Returns a semaphore object""" - from . 
import synchronize - return synchronize.Semaphore(value=value) - - def BoundedSemaphore(self, value): - """Returns a bounded semaphore object""" - from .synchronize import BoundedSemaphore - return BoundedSemaphore(value) - - def Lock(self): - """Returns a lock object""" - from .synchronize import Lock - return Lock() - - def RLock(self): - """Returns a recurrent lock object""" - from .synchronize import RLock - return RLock() - - def Condition(self, lock=None): - """Returns a condition object""" - from .synchronize import Condition - return Condition(lock) - - def Event(self): - """Returns an event object""" - from .synchronize import Event - return Event() - - -class LokyInitMainContext(LokyContext): - """Extra context with LokyProcess, which does load the main module - - This context is used for compatibility in the case ``cloudpickle`` is not - present on the running system. This permits to load functions defined in - the ``main`` module, using proper safeguards. The declaration of the - ``executor`` should be protected by ``if __name__ == "__main__":`` and the - functions and variable used from main should be out of this block. - - This mimics the default behavior of multiprocessing under Windows and the - behavior of the ``spawn`` start method on a posix system for python3.4+. - For more details, see the end of the following section of python doc - https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming - """ - _name = 'loky_init_main' - Process = LokyInitMainProcess - - -if sys.version_info > (3, 4): - """Register loky context so it works with multiprocessing.get_context""" - ctx_loky = LokyContext() - mp.context._concrete_contexts['loky'] = ctx_loky - mp.context._concrete_contexts['loky_init_main'] = LokyInitMainContext() diff --git a/sklearn/externals/joblib/externals/loky/backend/fork_exec.py b/sklearn/externals/joblib/externals/loky/backend/fork_exec.py deleted file mode 100644 index eee2a1c80a231..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/fork_exec.py +++ /dev/null @@ -1,43 +0,0 @@ -############################################################################### -# Launch a subprocess using forkexec and make sure only the needed fd are -# shared in the two process. 
-# -# author: Thomas Moreau and Olivier Grisel -# -import os -import sys - -if sys.platform == "darwin" and sys.version_info < (3, 3): - FileNotFoundError = OSError - - -def close_fds(keep_fds): # pragma: no cover - """Close all the file descriptors except those in keep_fds.""" - - # Make sure to keep stdout and stderr open for logging purpose - keep_fds = set(keep_fds).union([1, 2]) - - # We try to retrieve all the open fds - try: - open_fds = set(int(fd) for fd in os.listdir('/proc/self/fd')) - except FileNotFoundError: - import resource - max_nfds = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - open_fds = set(fd for fd in range(3, max_nfds)) - open_fds.add(0) - - for i in open_fds - keep_fds: - try: - os.close(i) - except OSError: - pass - - -def fork_exec(cmd, keep_fds): - - pid = os.fork() - if pid == 0: # pragma: no cover - close_fds(keep_fds) - os.execv(sys.executable, cmd) - else: - return pid diff --git a/sklearn/externals/joblib/externals/loky/backend/managers.py b/sklearn/externals/joblib/externals/loky/backend/managers.py deleted file mode 100644 index 081f8976e4e70..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/managers.py +++ /dev/null @@ -1,51 +0,0 @@ -############################################################################### -# compat for UNIX 2.7 and 3.3 -# Manager with LokyContext server. -# This avoids having a Manager using fork and breaks the fd. -# -# author: Thomas Moreau and Olivier Grisel -# -# based on multiprocessing/managers.py (17/02/2017) -# * Overload the start method to use LokyContext and launch a loky subprocess -# - -import multiprocessing as mp -from multiprocessing.managers import SyncManager, State -from .process import LokyProcess as Process - - -class LokyManager(SyncManager): - def start(self, initializer=None, initargs=()): - '''Spawn a server process for this manager object''' - assert self._state.value == State.INITIAL - - if (initializer is not None - and not hasattr(initializer, '__call__')): - raise TypeError('initializer must be a callable') - - # pipe over which we will retrieve address of server - reader, writer = mp.Pipe(duplex=False) - - # spawn process which runs a server - self._process = Process( - target=type(self)._run_server, - args=(self._registry, self._address, bytes(self._authkey), - self._serializer, writer, initializer, initargs), - ) - ident = ':'.join(str(i) for i in self._process._identity) - self._process.name = type(self).__name__ + '-' + ident - self._process.start() - - # get address of server - writer.close() - self._address = reader.recv() - reader.close() - - # register a finalizer - self._state.value = State.STARTED - self.shutdown = mp.util.Finalize( - self, type(self)._finalize_manager, - args=(self._process, self._address, self._authkey, - self._state, self._Client), - exitpriority=0 - ) diff --git a/sklearn/externals/joblib/externals/loky/backend/popen_loky_posix.py b/sklearn/externals/joblib/externals/loky/backend/popen_loky_posix.py deleted file mode 100644 index 35a5907d21559..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/popen_loky_posix.py +++ /dev/null @@ -1,216 +0,0 @@ -############################################################################### -# Popen for LokyProcess. -# -# author: Thomas Moreau and Olivier Grisel -# -import os -import sys -import signal -import pickle -from io import BytesIO - -from . 
import reduction, spawn -from .context import get_spawning_popen, set_spawning_popen -from multiprocessing import util, process - -if sys.version_info[:2] < (3, 3): - ProcessLookupError = OSError - -if sys.platform != "win32": - from . import semaphore_tracker - - -__all__ = [] - -if sys.platform != "win32": - # - # Wrapper for an fd used while launching a process - # - - class _DupFd(object): - def __init__(self, fd): - self.fd = reduction._mk_inheritable(fd) - - def detach(self): - return self.fd - - # - # Start child process using subprocess.Popen - # - - __all__.append('Popen') - - class Popen(object): - method = 'loky' - DupFd = _DupFd - - def __init__(self, process_obj): - sys.stdout.flush() - sys.stderr.flush() - self.returncode = None - self._fds = [] - self._launch(process_obj) - - if sys.version_info < (3, 4): - @classmethod - def duplicate_for_child(cls, fd): - popen = get_spawning_popen() - popen._fds.append(fd) - return reduction._mk_inheritable(fd) - - else: - def duplicate_for_child(self, fd): - self._fds.append(fd) - return reduction._mk_inheritable(fd) - - def poll(self, flag=os.WNOHANG): - if self.returncode is None: - while True: - try: - pid, sts = os.waitpid(self.pid, flag) - except OSError as e: - # Child process not yet created. See #1731717 - # e.errno == errno.ECHILD == 10 - return None - else: - break - if pid == self.pid: - if os.WIFSIGNALED(sts): - self.returncode = -os.WTERMSIG(sts) - else: - assert os.WIFEXITED(sts) - self.returncode = os.WEXITSTATUS(sts) - return self.returncode - - def wait(self, timeout=None): - if sys.version_info < (3, 3): - import time - if timeout is None: - return self.poll(0) - deadline = time.time() + timeout - delay = 0.0005 - while 1: - res = self.poll() - if res is not None: - break - remaining = deadline - time.time() - if remaining <= 0: - break - delay = min(delay * 2, remaining, 0.05) - time.sleep(delay) - return res - - if self.returncode is None: - if timeout is not None: - from multiprocessing.connection import wait - if not wait([self.sentinel], timeout): - return None - # This shouldn't block if wait() returned successfully. 
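The `poll` method above reaps the child with `os.waitpid` and `os.WNOHANG` so that it never blocks, then decodes the status with the `os.WIF*` helpers. A POSIX-only sketch of that pattern with a throwaway forked child (not loky code):

    import os
    import time

    pid = os.fork()
    if pid == 0:            # child: exit with code 3 after a short nap
        time.sleep(0.5)
        os._exit(3)

    # parent: non-blocking poll until the child has terminated
    while True:
        wpid, status = os.waitpid(pid, os.WNOHANG)
        if wpid == pid:     # child has been reaped
            break
        time.sleep(0.05)

    if os.WIFSIGNALED(status):
        print("killed by signal", os.WTERMSIG(status))
    else:
        print("exit code", os.WEXITSTATUS(status))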
- return self.poll(os.WNOHANG if timeout == 0.0 else 0) - return self.returncode - - def terminate(self): - if self.returncode is None: - try: - os.kill(self.pid, signal.SIGTERM) - except ProcessLookupError: - pass - except OSError: - if self.wait(timeout=0.1) is None: - raise - - def _launch(self, process_obj): - - tracker_fd = semaphore_tracker._semaphore_tracker.getfd() - - fp = BytesIO() - set_spawning_popen(self) - try: - prep_data = spawn.get_preparation_data( - process_obj._name, - getattr(process_obj, "init_main_module", True)) - reduction.dump(prep_data, fp) - reduction.dump(process_obj, fp) - - finally: - set_spawning_popen(None) - - try: - parent_r, child_w = os.pipe() - child_r, parent_w = os.pipe() - # for fd in self._fds: - # _mk_inheritable(fd) - - cmd_python = [sys.executable] - cmd_python += ['-m', self.__module__] - cmd_python += ['--process-name', str(process_obj.name)] - cmd_python += ['--pipe', - str(reduction._mk_inheritable(child_r))] - reduction._mk_inheritable(child_w) - if tracker_fd is not None: - cmd_python += ['--semaphore', - str(reduction._mk_inheritable(tracker_fd))] - self._fds.extend([child_r, child_w, tracker_fd]) - from .fork_exec import fork_exec - pid = fork_exec(cmd_python, self._fds) - util.debug("launched python with pid {} and cmd:\n{}" - .format(pid, cmd_python)) - self.sentinel = parent_r - - method = 'getbuffer' - if not hasattr(fp, method): - method = 'getvalue' - with os.fdopen(parent_w, 'wb') as f: - f.write(getattr(fp, method)()) - self.pid = pid - finally: - if parent_r is not None: - util.Finalize(self, os.close, (parent_r,)) - for fd in (child_r, child_w): - if fd is not None: - os.close(fd) - - @staticmethod - def thread_is_spawning(): - return True - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser('Command line parser') - parser.add_argument('--pipe', type=int, required=True, - help='File handle for the pipe') - parser.add_argument('--semaphore', type=int, required=True, - help='File handle name for the semaphore tracker') - parser.add_argument('--process-name', type=str, default=None, - help='Identifier for debugging purpose') - - args = parser.parse_args() - - info = dict() - semaphore_tracker._semaphore_tracker._fd = args.semaphore - - exitcode = 1 - try: - with os.fdopen(args.pipe, 'rb') as from_parent: - process.current_process()._inheriting = True - try: - prep_data = pickle.load(from_parent) - spawn.prepare(prep_data) - process_obj = pickle.load(from_parent) - finally: - del process.current_process()._inheriting - - exitcode = process_obj._bootstrap() - except Exception as e: - print('\n\n' + '-' * 80) - print('{} failed with traceback: '.format(args.process_name)) - print('-' * 80) - import traceback - print(traceback.format_exc()) - print('\n' + '-' * 80) - finally: - if from_parent is not None: - from_parent.close() - - sys.exit(exitcode) diff --git a/sklearn/externals/joblib/externals/loky/backend/popen_loky_win32.py b/sklearn/externals/joblib/externals/loky/backend/popen_loky_win32.py deleted file mode 100644 index dccf04bf6534c..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/popen_loky_win32.py +++ /dev/null @@ -1,150 +0,0 @@ -import os -import sys -from pickle import load -from multiprocessing import process, util - -from . import spawn -from . 
import reduction -from .context import get_spawning_popen, set_spawning_popen - -if sys.platform == "win32": - # Avoid import error by code introspection tools such as test runners - # trying to import this module while running on non-Windows systems. - import msvcrt - from .compat_win32 import _winapi - from .compat_win32 import Popen as _Popen -else: - _Popen = object - -if sys.version_info[:2] < (3, 3): - from os import fdopen as open - -__all__ = ['Popen'] - -# -# -# - -TERMINATE = 0x10000 -WINEXE = (sys.platform == 'win32' and getattr(sys, 'frozen', False)) -WINSERVICE = sys.executable.lower().endswith("pythonservice.exe") - - -# -# We define a Popen class similar to the one from subprocess, but -# whose constructor takes a process object as its argument. -# - -class Popen(_Popen): - ''' - Start a subprocess to run the code of a process object - ''' - method = 'loky' - - def __init__(self, process_obj): - prep_data = spawn.get_preparation_data( - process_obj._name, getattr(process_obj, "init_main_module", True)) - - # read end of pipe will be "stolen" by the child process - # -- see spawn_main() in spawn.py. - rhandle, wfd = _winapi.CreatePipe(None, 0) - if sys.version_info[:2] > (3, 3): - wfd = msvcrt.open_osfhandle(wfd, 0) - - cmd = get_command_line(parent_pid=os.getpid(), pipe_handle=rhandle) - cmd = ' '.join('"%s"' % x for x in cmd) - - try: - with open(wfd, 'wb') as to_child: - # start process - try: - inherit = sys.version_info[:2] < (3, 4) - hp, ht, pid, tid = _winapi.CreateProcess( - spawn.get_executable(), cmd, - None, None, inherit, 0, - None, None, None) - _winapi.CloseHandle(ht) - except BaseException as e: - _winapi.CloseHandle(rhandle) - raise - - # set attributes of self - self.pid = pid - self.returncode = None - self._handle = hp - self.sentinel = int(hp) - util.Finalize(self, _winapi.CloseHandle, (self.sentinel,)) - - # send information to child - set_spawning_popen(self) - if sys.version_info[:2] < (3, 4): - Popen._tls.process_handle = int(hp) - try: - reduction.dump(prep_data, to_child) - reduction.dump(process_obj, to_child) - finally: - set_spawning_popen(None) - if sys.version_info[:2] < (3, 4): - del Popen._tls.process_handle - except IOError as exc: - # IOError 22 happens when the launched subprocess terminated before - # wfd.close is called. Thus we can safely ignore it. - if exc.errno != 22: - raise - util.debug("While starting {}, ignored a IOError 22" - .format(process_obj._name)) - - def duplicate_for_child(self, handle): - assert self is get_spawning_popen() - return reduction.duplicate(handle, self.sentinel) - - -if sys.version_info[:2] >= (3, 4): - from multiprocessing.spawn import get_command_line -else: - # compatibility for python2.7. Duplicate here the code from - # multiprocessing.forking.main to call our prepare function and correctly - # set the default start_methods in loky. 
- - def get_command_line(pipe_handle, **kwds): - ''' - Returns prefix of command line used for spawning a child process - ''' - if getattr(sys, 'frozen', False): - return ([sys.executable, '--multiprocessing-fork', pipe_handle]) - else: - prog = 'from sklearn.externals.joblib.externals.loky.backend.popen_loky_win32 import main; main()' - opts = util._args_from_interpreter_flags() - return [spawn.get_executable()] + opts + [ - '-c', prog, '--multiprocessing-fork', pipe_handle] - - def is_forking(argv): - ''' - Return whether commandline indicates we are forking - ''' - if len(argv) >= 2 and argv[1] == '--multiprocessing-fork': - assert len(argv) == 3 - return True - else: - return False - - def main(): - ''' - Run code specified by data received over pipe - ''' - assert is_forking(sys.argv) - - handle = int(sys.argv[-1]) - fd = msvcrt.open_osfhandle(handle, os.O_RDONLY) - from_parent = os.fdopen(fd, 'rb') - - process.current_process()._inheriting = True - preparation_data = load(from_parent) - spawn.prepare(preparation_data) - self = load(from_parent) - process.current_process()._inheriting = False - - from_parent.close() - - exitcode = self._bootstrap() - exit(exitcode) diff --git a/sklearn/externals/joblib/externals/loky/backend/process.py b/sklearn/externals/joblib/externals/loky/backend/process.py deleted file mode 100644 index f6a00c90e363c..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/process.py +++ /dev/null @@ -1,106 +0,0 @@ -############################################################################### -# LokyProcess implementation -# -# authors: Thomas Moreau and Olivier Grisel -# -# based on multiprocessing/process.py (17/02/2017) -# * Add some compatibility function for python2.7 and 3.3 -# -import os -import sys -from .compat import BaseProcess - - -class LokyProcess(BaseProcess): - _start_method = 'loky' - - def __init__(self, group=None, target=None, name=None, args=(), - kwargs={}, daemon=None, init_main_module=False): - if sys.version_info < (3, 3): - super(LokyProcess, self).__init__( - group=group, target=target, name=name, args=args, - kwargs=kwargs) - self.daemon = daemon - else: - super(LokyProcess, self).__init__( - group=group, target=target, name=name, args=args, - kwargs=kwargs, daemon=daemon) - self.authkey = self.authkey - self.init_main_module = init_main_module - - @staticmethod - def _Popen(process_obj): - if sys.platform == "win32": - from .popen_loky_win32 import Popen - else: - from .popen_loky_posix import Popen - return Popen(process_obj) - - if sys.version_info < (3, 3): - def start(self): - ''' - Start child process - ''' - from multiprocessing.process import _current_process, _cleanup - assert self._popen is None, 'cannot start a process twice' - assert self._parent_pid == os.getpid(), \ - 'can only start a process object created by current process' - _cleanup() - self._popen = self._Popen(self) - self._sentinel = self._popen.sentinel - _current_process._children.add(self) - - @property - def sentinel(self): - ''' - Return a file descriptor (Unix) or handle (Windows) suitable for - waiting for process termination. 
- ''' - try: - return self._sentinel - except AttributeError: - raise ValueError("process not started") - - if sys.version_info < (3, 4): - @property - def authkey(self): - return self._authkey - - @authkey.setter - def authkey(self, authkey): - ''' - Set authorization key of process - ''' - self._authkey = AuthenticationKey(authkey) - - def _bootstrap(self): - from .context import set_start_method - set_start_method(self._start_method) - super(LokyProcess, self)._bootstrap() - - -class LokyInitMainProcess(LokyProcess): - _start_method = 'loky_init_main' - - def __init__(self, group=None, target=None, name=None, args=(), - kwargs={}, daemon=None): - super(LokyInitMainProcess, self).__init__( - group=group, target=target, name=name, args=args, kwargs=kwargs, - daemon=daemon, init_main_module=True) - - -# -# We subclass bytes to avoid accidental transmission of auth keys over network -# - -class AuthenticationKey(bytes): - def __reduce__(self): - from .context import assert_spawning - try: - assert_spawning(self) - except RuntimeError: - raise TypeError( - 'Pickling an AuthenticationKey object is ' - 'disallowed for security reasons' - ) - return AuthenticationKey, (bytes(self),) diff --git a/sklearn/externals/joblib/externals/loky/backend/queues.py b/sklearn/externals/joblib/externals/loky/backend/queues.py deleted file mode 100644 index 0f9dfeae63877..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/queues.py +++ /dev/null @@ -1,240 +0,0 @@ -############################################################################### -# Queue and SimpleQueue implementation for loky -# -# authors: Thomas Moreau, Olivier Grisel -# -# based on multiprocessing/queues.py (16/02/2017) -# * Add some compatibility function for python2.7 and 3.3 and makes sure -# it uses the right synchronization primitive. -# * Add some custom reducers for the Queues/SimpleQueue to tweak the -# pickling process. 
(overload Queue._feed/SimpleQueue.put) -# -import os -import sys -import errno -import weakref -import threading - -from multiprocessing import util -from multiprocessing import connection -from multiprocessing.synchronize import SEM_VALUE_MAX -from multiprocessing.queues import Full -from multiprocessing.queues import _sentinel, Queue as mp_Queue -from multiprocessing.queues import SimpleQueue as mp_SimpleQueue - -from .reduction import loads, dumps -from .context import assert_spawning, get_context - - -__all__ = ['Queue', 'SimpleQueue', 'Full'] - - -class Queue(mp_Queue): - - def __init__(self, maxsize=0, reducers=None, ctx=None): - - if sys.version_info[:2] >= (3, 4): - super().__init__(maxsize=maxsize, ctx=ctx) - else: - if maxsize <= 0: - # Can raise ImportError (see issues #3770 and #23400) - maxsize = SEM_VALUE_MAX - if ctx is None: - ctx = get_context() - self._maxsize = maxsize - self._reader, self._writer = connection.Pipe(duplex=False) - self._rlock = ctx.Lock() - self._opid = os.getpid() - if sys.platform == 'win32': - self._wlock = None - else: - self._wlock = ctx.Lock() - self._sem = ctx.BoundedSemaphore(maxsize) - - # For use by concurrent.futures - self._ignore_epipe = False - - self._after_fork() - - if sys.platform != 'win32': - util.register_after_fork(self, Queue._after_fork) - - self._reducers = reducers - - # Use custom queue set/get state to be able to reduce the custom reducers - def __getstate__(self): - assert_spawning(self) - return (self._ignore_epipe, self._maxsize, self._reader, self._writer, - self._reducers, self._rlock, self._wlock, self._sem, - self._opid) - - def __setstate__(self, state): - (self._ignore_epipe, self._maxsize, self._reader, self._writer, - self._reducers, self._rlock, self._wlock, self._sem, - self._opid) = state - self._after_fork() - - # Overload _start_thread to correctly call our custom _feed - def _start_thread(self): - util.debug('Queue._start_thread()') - - # Start thread which transfers data from buffer to pipe - self._buffer.clear() - self._thread = threading.Thread( - target=Queue._feed, - args=(self._buffer, self._notempty, self._send_bytes, - self._wlock, self._writer.close, self._reducers, - self._ignore_epipe, self._on_queue_feeder_error, self._sem), - name='QueueFeederThread' - ) - self._thread.daemon = True - - util.debug('doing self._thread.start()') - self._thread.start() - util.debug('... done self._thread.start()') - - # On process exit we will wait for data to be flushed to pipe. - # - # However, if this process created the queue then all - # processes which use the queue will be descendants of this - # process. Therefore waiting for the queue to be flushed - # is pointless once all the child processes have been joined. - created_by_this_process = (self._opid == os.getpid()) - if not self._joincancelled and not created_by_this_process: - self._jointhread = util.Finalize( - self._thread, Queue._finalize_join, - [weakref.ref(self._thread)], - exitpriority=-5 - ) - - # Send sentinel to the thread queue object when garbage collected - self._close = util.Finalize( - self, Queue._finalize_close, - [self._buffer, self._notempty], - exitpriority=10 - ) - - # Overload the _feed methods to use our custom pickling strategy. 
- @staticmethod - def _feed(buffer, notempty, send_bytes, writelock, close, reducers, - ignore_epipe, onerror, queue_sem): - util.debug('starting thread to feed data to pipe') - nacquire = notempty.acquire - nrelease = notempty.release - nwait = notempty.wait - bpopleft = buffer.popleft - sentinel = _sentinel - if sys.platform != 'win32': - wacquire = writelock.acquire - wrelease = writelock.release - else: - wacquire = None - - while 1: - try: - nacquire() - try: - if not buffer: - nwait() - finally: - nrelease() - try: - while 1: - obj = bpopleft() - if obj is sentinel: - util.debug('feeder thread got sentinel -- exiting') - close() - return - - # serialize the data before acquiring the lock - obj_ = dumps(obj, reducers=reducers) - if wacquire is None: - send_bytes(obj_) - else: - wacquire() - try: - send_bytes(obj_) - finally: - wrelease() - # Remove references early to avoid leaking memory - del obj, obj_ - except IndexError: - pass - except BaseException as e: - if ignore_epipe and getattr(e, 'errno', 0) == errno.EPIPE: - return - # Since this runs in a daemon thread the resources it uses - # may be become unusable while the process is cleaning up. - # We ignore errors which happen after the process has - # started to cleanup. - if util.is_exiting(): - util.info('error in queue thread: %s', e) - return - else: - queue_sem.release() - onerror(e, obj) - - def _on_queue_feeder_error(self, e, obj): - """ - Private API hook called when feeding data in the background thread - raises an exception. For overriding by concurrent.futures. - """ - import traceback - traceback.print_exc() - - if sys.version_info[:2] < (3, 4): - # Compat for python2.7/3.3 that use _send instead of _send_bytes - def _after_fork(self): - super(Queue, self)._after_fork() - self._send_bytes = self._writer.send_bytes - - -class SimpleQueue(mp_SimpleQueue): - - def __init__(self, reducers=None, ctx=None): - if sys.version_info[:2] >= (3, 4): - super().__init__(ctx=ctx) - else: - # Use the context to create the sync objects for python2.7/3.3 - if ctx is None: - ctx = get_context() - self._reader, self._writer = connection.Pipe(duplex=False) - self._rlock = ctx.Lock() - self._poll = self._reader.poll - if sys.platform == 'win32': - self._wlock = None - else: - self._wlock = ctx.Lock() - - # Add possiblity to use custom reducers - self._reducers = reducers - - # Use custom queue set/get state to be able to reduce the custom reducers - def __getstate__(self): - assert_spawning(self) - return (self._reader, self._writer, self._reducers, self._rlock, - self._wlock) - - def __setstate__(self, state): - (self._reader, self._writer, self._reducers, self._rlock, - self._wlock) = state - - if sys.version_info[:2] < (3, 4): - # For python2.7/3.3, overload get to avoid creating deadlocks with - # unpickling errors. 
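The feeder thread above serialises each object with the configured reducers *before* acquiring the write lock, and only then pushes the raw bytes through the pipe, keeping the critical section as short as possible. The same pattern with plain `pickle` and a stdlib `multiprocessing` pipe, as a rough sketch:

    import pickle
    from multiprocessing import Lock, Pipe

    reader, writer = Pipe(duplex=False)
    wlock = Lock()

    obj = {"answer": 42}

    # Serialize outside the critical section, send the bytes inside it.
    payload = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    with wlock:
        writer.send_bytes(payload)

    # The receiving side deserializes after the raw bytes have been read.
    print(pickle.loads(reader.recv_bytes()))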
- def get(self): - with self._rlock: - res = self._reader.recv_bytes() - # unserialize the data after having released the lock - return loads(res) - - # Overload put to use our customizable reducer - def put(self, obj): - # serialize the data before acquiring the lock - obj = dumps(obj, reducers=self._reducers) - if self._wlock is None: - # writes to a message oriented win32 pipe are atomic - self._writer.send_bytes(obj) - else: - with self._wlock: - self._writer.send_bytes(obj) diff --git a/sklearn/externals/joblib/externals/loky/backend/reduction.py b/sklearn/externals/joblib/externals/loky/backend/reduction.py deleted file mode 100644 index 2a8347590a67e..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/reduction.py +++ /dev/null @@ -1,252 +0,0 @@ -############################################################################### -# Customizable Pickler with some basic reducers -# -# author: Thomas Moreau -# -# adapted from multiprocessing/reduction.py (17/02/2017) -# * Replace the ForkingPickler with a similar _LokyPickler, -# * Add CustomizableLokyPickler to allow customizing pickling process -# on the fly. -# -import io -import os -import sys -import functools -from multiprocessing import util -try: - # Python 2 compat - from cPickle import loads as pickle_loads -except ImportError: - from pickle import loads as pickle_loads - import copyreg - -from pickle import HIGHEST_PROTOCOL - - -if sys.platform == "win32": - if sys.version_info[:2] > (3, 3): - from multiprocessing.reduction import duplicate - else: - from multiprocessing.forking import duplicate - - -############################################################################### -# Enable custom pickling in Loky. -# To allow instance customization of the pickling process, we use 2 classes. -# _ReducerRegistry gives module level customization and CustomizablePickler -# permits to use instance base custom reducers. Only CustomizablePickler -# should be used. - -class _ReducerRegistry(object): - """Registry for custom reducers. - - HIGHEST_PROTOCOL is selected by default as this pickler is used - to pickle ephemeral datastructures for interprocess communication - hence no backward compatibility is required. - - """ - - # We override the pure Python pickler as its the only way to be able to - # customize the dispatch table without side effects in Python 2.6 - # to 3.2. For Python 3.3+ leverage the new dispatch_table - # feature from http://bugs.python.org/issue14166 that makes it possible - # to use the C implementation of the Pickler which is faster. - - dispatch_table = {} - - @classmethod - def register(cls, type, reduce_func): - """Attach a reducer function to a given type in the dispatch table.""" - if sys.version_info < (3,): - # Python 2 pickler dispatching is not explicitly customizable. - # Let us use a closure to workaround this limitation. 
- def dispatcher(cls, obj): - reduced = reduce_func(obj) - cls.save_reduce(obj=obj, *reduced) - cls.dispatch_table[type] = dispatcher - else: - cls.dispatch_table[type] = reduce_func - - -############################################################################### -# Registers extra pickling routines to improve picklization for loky - -register = _ReducerRegistry.register - - -# make methods picklable -def _reduce_method(m): - if m.__self__ is None: - return getattr, (m.__class__, m.__func__.__name__) - else: - return getattr, (m.__self__, m.__func__.__name__) - - -class _C: - def f(self): - pass - - @classmethod - def h(cls): - pass - - -register(type(_C().f), _reduce_method) -register(type(_C.h), _reduce_method) - - -if not hasattr(sys, "pypy_version_info"): - # PyPy uses functions instead of method_descriptors and wrapper_descriptors - def _reduce_method_descriptor(m): - return getattr, (m.__objclass__, m.__name__) - - register(type(list.append), _reduce_method_descriptor) - register(type(int.__add__), _reduce_method_descriptor) - - -# Make partial func pickable -def _reduce_partial(p): - return _rebuild_partial, (p.func, p.args, p.keywords or {}) - - -def _rebuild_partial(func, args, keywords): - return functools.partial(func, *args, **keywords) - - -register(functools.partial, _reduce_partial) - -if sys.platform != "win32": - from ._posix_reduction import _mk_inheritable # noqa: F401 -else: - from . import _win_reduction # noqa: F401 - -# global variable to change the pickler behavior -try: - from sklearn.externals.joblib.externals import cloudpickle # noqa: F401 - DEFAULT_ENV = "cloudpickle" -except ImportError: - # If cloudpickle is not present, fallback to pickle - DEFAULT_ENV = "pickle" - -ENV_LOKY_PICKLER = os.environ.get("LOKY_PICKLER", DEFAULT_ENV) -_LokyPickler = None -_loky_pickler_name = None - - -def set_loky_pickler(loky_pickler=None): - global _LokyPickler, _loky_pickler_name - - if loky_pickler is None: - loky_pickler = ENV_LOKY_PICKLER - - loky_pickler_cls = None - - # The default loky_pickler is cloudpickle - if loky_pickler in ["", None]: - loky_pickler = "cloudpickle" - - if loky_pickler == _loky_pickler_name: - return - - if loky_pickler == "cloudpickle": - from sklearn.externals.joblib.externals.cloudpickle import CloudPickler as loky_pickler_cls - else: - try: - from importlib import import_module - module_pickle = import_module(loky_pickler) - loky_pickler_cls = module_pickle.Pickler - except (ImportError, AttributeError) as e: - extra_info = ("\nThis error occurred while setting loky_pickler to" - " '{}', as required by the env variable LOKY_PICKLER" - " or the function set_loky_pickler." - .format(loky_pickler)) - e.args = (e.args[0] + extra_info,) + e.args[1:] - e.msg = e.args[0] - raise e - - util.debug("Using '{}' for serialization." 
- .format(loky_pickler if loky_pickler else "cloudpickle")) - - class CustomizablePickler(loky_pickler_cls): - _loky_pickler_cls = loky_pickler_cls - - if sys.version_info < (3,): - # Make the dispatch registry an instance level attribute instead of - # a reference to the class dictionary under Python 2 - _dispatch = loky_pickler_cls.dispatch.copy() - _dispatch.update(_ReducerRegistry.dispatch_table) - else: - # Under Python 3 initialize the dispatch table with a copy of the - # default registry - _dispatch_table = copyreg.dispatch_table.copy() - _dispatch_table.update(_ReducerRegistry.dispatch_table) - - def __init__(self, writer, reducers=None, protocol=HIGHEST_PROTOCOL): - loky_pickler_cls.__init__(self, writer, protocol=protocol) - if reducers is None: - reducers = {} - if sys.version_info < (3,): - self.dispatch = self._dispatch.copy() - else: - self.dispatch_table = self._dispatch_table.copy() - for type, reduce_func in reducers.items(): - self.register(type, reduce_func) - - def register(self, type, reduce_func): - """Attach a reducer function to a given type in the dispatch table. - """ - if sys.version_info < (3,): - # Python 2 pickler dispatching is not explicitly customizable. - # Let us use a closure to workaround this limitation. - def dispatcher(self, obj): - reduced = reduce_func(obj) - self.save_reduce(obj=obj, *reduced) - self.dispatch[type] = dispatcher - else: - self.dispatch_table[type] = reduce_func - - _LokyPickler = CustomizablePickler - _loky_pickler_name = loky_pickler - - -def get_loky_pickler_name(): - global _loky_pickler_name - return _loky_pickler_name - - -def get_loky_pickler(): - global _LokyPickler - return _LokyPickler - - -# Set it to its default value -set_loky_pickler() - - -def loads(buf): - # Compat for python2.7 version - if sys.version_info < (3, 3) and isinstance(buf, io.BytesIO): - buf = buf.getvalue() - return pickle_loads(buf) - - -def dump(obj, file, reducers=None, protocol=None): - '''Replacement for pickle.dump() using _LokyPickler.''' - global _LokyPickler - _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj) - - -def dumps(obj, reducers=None, protocol=None): - global _LokyPickler - - buf = io.BytesIO() - dump(obj, buf, reducers=reducers, protocol=protocol) - if sys.version_info < (3, 3): - return buf.getvalue() - return buf.getbuffer() - - -__all__ = ["dump", "dumps", "loads", "register", "set_loky_pickler"] - -if sys.platform == "win32": - __all__ += ["duplicate"] diff --git a/sklearn/externals/joblib/externals/loky/backend/semaphore_tracker.py b/sklearn/externals/joblib/externals/loky/backend/semaphore_tracker.py deleted file mode 100644 index 7d3f23e5f8e4f..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/semaphore_tracker.py +++ /dev/null @@ -1,238 +0,0 @@ -############################################################################### -# Server process to keep track of unlinked semaphores and clean them. -# -# author: Thomas Moreau -# -# adapted from multiprocessing/semaphore_tracker.py (17/02/2017) -# * include custom spawnv_passfds to start the process -# * use custom unlink from our own SemLock implementation -# * add some VERBOSE logging -# - -# -# On Unix we run a server process which keeps track of unlinked -# semaphores. The server ignores SIGINT and SIGTERM and reads from a -# pipe. Every other process of the program has a copy of the writable -# end of the pipe, so we get EOF when all other processes have exited. -# Then the server process unlinks any remaining semaphore names. 
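The removed `reduction.py` builds its customizable pickler on the standard `dispatch_table` mechanism referenced in its comments (bpo-14166): a per-pickler mapping from type to reduce function that the `Pickler` consults before its default behaviour. A minimal, loky-independent sketch of that mechanism; `Handle` and `reduce_handle` are made-up names for illustration:

    import copyreg
    import io
    import pickle

    class Handle:
        """Toy object standing in for something that needs a custom reducer."""
        def __init__(self, name):
            self.name = name

    def reduce_handle(h):
        # Recreate the handle from its name on the receiving side.
        return Handle, (h.name,)

    class CustomPickler(pickle.Pickler):
        # Start from the global copyreg table and register our own reducer.
        dispatch_table = copyreg.dispatch_table.copy()
        dispatch_table[Handle] = reduce_handle

    buf = io.BytesIO()
    CustomPickler(buf, protocol=pickle.HIGHEST_PROTOCOL).dump(Handle("h1"))
    print(pickle.loads(buf.getvalue()).name)   # -> h1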
-# -# This is important because the system only supports a limited number -# of named semaphores, and they will not be automatically removed till -# the next reboot. Without this semaphore tracker process, "killall -# python" would probably leave unlinked semaphores. -# - -import os -import sys -import signal -import warnings -import threading - -from . import spawn -from multiprocessing import util - -try: - from _multiprocessing import sem_unlink -except ImportError: - from .semlock import sem_unlink - -if sys.version_info < (3,): - BrokenPipeError = IOError - -__all__ = ['ensure_running', 'register', 'unregister'] - -VERBOSE = False - - -class SemaphoreTracker(object): - - def __init__(self): - self._lock = threading.Lock() - self._fd = None - self._pid = None - - def getfd(self): - self.ensure_running() - return self._fd - - def ensure_running(self): - '''Make sure that semaphore tracker process is running. - - This can be run from any process. Usually a child process will use - the semaphore created by its parent.''' - with self._lock: - if self._fd is not None: - # semaphore tracker was launched before, is it still running? - if self._check_alive(): - # => still alive - return - # => dead, launch it again - os.close(self._fd) - self._fd = None - self._pid = None - - warnings.warn('semaphore_tracker: process died unexpectedly, ' - 'relaunching. Some semaphores might leak.') - - fds_to_pass = [] - try: - fds_to_pass.append(sys.stderr.fileno()) - except Exception: - pass - - cmd = 'from {} import main; main(%d)'.format(main.__module__) - r, w = os.pipe() - try: - fds_to_pass.append(r) - # process will out live us, so no need to wait on pid - exe = spawn.get_executable() - args = [exe] + util._args_from_interpreter_flags() - # In python 3.3, there is a bug which put `-RRRRR..` instead of - # `-R` in args. Replace it to get the correct flags. 
- # See https://github.com/python/cpython/blob/3.3/Lib/subprocess.py#L488 - if sys.version_info[:2] <= (3, 3): - import re - for i in range(1, len(args)): - args[i] = re.sub("-R+", "-R", args[i]) - args += ['-c', cmd % r] - util.debug("launching Semaphore tracker: {}".format(args)) - pid = spawnv_passfds(exe, args, fds_to_pass) - except BaseException: - os.close(w) - raise - else: - self._fd = w - self._pid = pid - finally: - os.close(r) - - def _check_alive(self): - '''Check for the existence of the semaphore tracker process.''' - try: - self._send('PROBE', '') - except BrokenPipeError: - return False - else: - return True - - def register(self, name): - '''Register name of semaphore with semaphore tracker.''' - self.ensure_running() - self._send('REGISTER', name) - - def unregister(self, name): - '''Unregister name of semaphore with semaphore tracker.''' - self.ensure_running() - self._send('UNREGISTER', name) - - def _send(self, cmd, name): - msg = '{0}:{1}\n'.format(cmd, name).encode('ascii') - if len(name) > 512: - # posix guarantees that writes to a pipe of less than PIPE_BUF - # bytes are atomic, and that PIPE_BUF >= 512 - raise ValueError('name too long') - nbytes = os.write(self._fd, msg) - assert nbytes == len(msg) - - -_semaphore_tracker = SemaphoreTracker() -ensure_running = _semaphore_tracker.ensure_running -register = _semaphore_tracker.register -unregister = _semaphore_tracker.unregister -getfd = _semaphore_tracker.getfd - - -def main(fd): - '''Run semaphore tracker.''' - # protect the process from ^C and "killall python" etc - signal.signal(signal.SIGINT, signal.SIG_IGN) - signal.signal(signal.SIGTERM, signal.SIG_IGN) - - for f in (sys.stdin, sys.stdout): - try: - f.close() - except Exception: - pass - - if VERBOSE: # pragma: no cover - sys.stderr.write("Main semaphore tracker is running\n") - sys.stderr.flush() - - cache = set() - try: - # keep track of registered/unregistered semaphores - with os.fdopen(fd, 'rb') as f: - for line in f: - try: - cmd, name = line.strip().split(b':') - if cmd == b'REGISTER': - name = name.decode('ascii') - cache.add(name) - if VERBOSE: # pragma: no cover - sys.stderr.write("[SemaphoreTracker] register {}\n" - .format(name)) - sys.stderr.flush() - elif cmd == b'UNREGISTER': - name = name.decode('ascii') - cache.remove(name) - if VERBOSE: # pragma: no cover - sys.stderr.write("[SemaphoreTracker] unregister {}" - ": cache({})\n" - .format(name, len(cache))) - sys.stderr.flush() - elif cmd == b'PROBE': - pass - else: - raise RuntimeError('unrecognized command %r' % cmd) - except BaseException: - try: - sys.excepthook(*sys.exc_info()) - except BaseException: - pass - finally: - # all processes have terminated; cleanup any remaining semaphores - if cache: - try: - warnings.warn('semaphore_tracker: There appear to be %d ' - 'leaked semaphores to clean up at shutdown' % - len(cache)) - except Exception: - pass - for name in cache: - # For some reason the process which created and registered this - # semaphore has failed to unregister it. Presumably it has died. - # We therefore unlink it. 
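The tracker above speaks a tiny line-oriented protocol over a pipe: each message is `COMMAND:name\n`, kept under `PIPE_BUF` so the `os.write` is atomic, and `main()` parses the lines back into register/unregister events. A self-contained sketch of that framing, with a hypothetical semaphore name and no tracker process actually spawned:

    import os

    r, w = os.pipe()

    def send(fd, cmd, name):
        msg = '{0}:{1}\n'.format(cmd, name).encode('ascii')
        if len(name) > 512:
            # POSIX guarantees that writes of less than PIPE_BUF (>= 512)
            # bytes to a pipe are atomic.
            raise ValueError('name too long')
        assert os.write(fd, msg) == len(msg)

    send(w, 'REGISTER', '/loky-4242-abc0')
    send(w, 'UNREGISTER', '/loky-4242-abc0')
    os.close(w)

    # Parsing loop, mirroring the tracker's main():
    cache = set()
    with os.fdopen(r, 'rb') as f:
        for line in f:
            cmd, name = line.strip().split(b':')
            if cmd == b'REGISTER':
                cache.add(name.decode('ascii'))
            elif cmd == b'UNREGISTER':
                cache.discard(name.decode('ascii'))

    print(cache)   # empty set: everything registered was also unregistered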
- try: - try: - sem_unlink(name) - if VERBOSE: # pragma: no cover - sys.stderr.write("[SemaphoreTracker] unlink {}\n" - .format(name)) - sys.stderr.flush() - except Exception as e: - warnings.warn('semaphore_tracker: %r: %r' % (name, e)) - finally: - pass - - if VERBOSE: # pragma: no cover - sys.stderr.write("semaphore tracker shut down\n") - sys.stderr.flush() - - -# -# Start a program with only specified fds kept open -# - -def spawnv_passfds(path, args, passfds): - passfds = sorted(passfds) - errpipe_read, errpipe_write = os.pipe() - try: - from .reduction import _mk_inheritable - _pass = [] - for fd in passfds: - _pass += [_mk_inheritable(fd)] - from .fork_exec import fork_exec - return fork_exec(args, _pass) - finally: - os.close(errpipe_read) - os.close(errpipe_write) diff --git a/sklearn/externals/joblib/externals/loky/backend/semlock.py b/sklearn/externals/joblib/externals/loky/backend/semlock.py deleted file mode 100644 index 2d35f6a2715a5..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/semlock.py +++ /dev/null @@ -1,274 +0,0 @@ -############################################################################### -# Ctypes implementation for posix semaphore. -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from cpython/Modules/_multiprocessing/semaphore.c (17/02/2017) -# * use ctypes to access pthread semaphores and provide a full python -# semaphore management. -# * For OSX, as no sem_getvalue is not implemented, Semaphore with value > 1 -# are not guaranteed to work. -# * Only work with LokyProcess on posix -# -import os -import sys -import time -import errno -import ctypes -import tempfile -import threading -from ctypes.util import find_library - -# As we need to use ctypes return types for semlock object, failure value -# needs to be cast to proper python value. Unix failure convention is to -# return 0, whereas OSX returns -1 -SEM_FAILURE = ctypes.c_void_p(0).value -if sys.platform == 'darwin': - SEM_FAILURE = ctypes.c_void_p(-1).value - -# Semaphore types -RECURSIVE_MUTEX = 0 -SEMAPHORE = 1 - -# Semaphore constants -SEM_OFLAG = ctypes.c_int(os.O_CREAT | os.O_EXCL) -SEM_PERM = ctypes.c_int(384) - - -class timespec(ctypes.Structure): - _fields_ = [("tv_sec", ctypes.c_long), ("tv_nsec", ctypes.c_long)] - - -if sys.platform != 'win32': - pthread = ctypes.CDLL(find_library('pthread'), use_errno=True) - pthread.sem_open.restype = ctypes.c_void_p - pthread.sem_close.argtypes = [ctypes.c_void_p] - pthread.sem_wait.argtypes = [ctypes.c_void_p] - pthread.sem_trywait.argtypes = [ctypes.c_void_p] - pthread.sem_post.argtypes = [ctypes.c_void_p] - pthread.sem_getvalue.argtypes = [ctypes.c_void_p, ctypes.c_void_p] - pthread.sem_unlink.argtypes = [ctypes.c_char_p] - if sys.platform != "darwin": - pthread.sem_timedwait.argtypes = [ctypes.c_void_p, - ctypes.POINTER(timespec)] - -try: - from threading import get_ident -except ImportError: - def get_ident(): - return threading.current_thread().ident - - -if sys.version_info[:2] < (3, 3): - class FileExistsError(OSError): - pass - - class FileNotFoundError(OSError): - pass - - -def sem_unlink(name): - if pthread.sem_unlink(name.encode('ascii')) < 0: - raiseFromErrno() - - -def _sem_open(name, value=None): - """ Construct or retrieve a semaphore with the given name - - If value is None, try to retrieve an existing named semaphore. 
- Else create a new semaphore with the given value - """ - if value is None: - handle = pthread.sem_open(ctypes.c_char_p(name), 0) - else: - handle = pthread.sem_open(ctypes.c_char_p(name), SEM_OFLAG, SEM_PERM, - ctypes.c_int(value)) - - if handle == SEM_FAILURE: - e = ctypes.get_errno() - if e == errno.EEXIST: - raise FileExistsError("a semaphore named %s already exists" % name) - elif e == errno.ENOENT: - raise FileNotFoundError('cannot find semaphore named %s' % name) - elif e == errno.ENOSYS: - raise NotImplementedError('No semaphore implementation on this ' - 'system') - else: - raiseFromErrno() - - return handle - - -def _sem_timedwait(handle, timeout): - t_start = time.time() - if sys.platform != "darwin": - sec = int(timeout) - tv_sec = int(t_start) - nsec = int(1e9 * (timeout - sec) + .5) - tv_nsec = int(1e9 * (t_start - tv_sec) + .5) - deadline = timespec(sec+tv_sec, nsec+tv_nsec) - deadline.tv_sec += int(deadline.tv_nsec / 1000000000) - deadline.tv_nsec %= 1000000000 - return pthread.sem_timedwait(handle, ctypes.pointer(deadline)) - - # PERFORMANCE WARNING - # No sem_timedwait on OSX so we implement our own method. This method can - # degrade performances has the wait can have a latency up to 20 msecs - deadline = t_start + timeout - delay = 0 - now = time.time() - while True: - # Poll the sem file - res = pthread.sem_trywait(handle) - if res == 0: - return 0 - else: - e = ctypes.get_errno() - if e != errno.EAGAIN: - raiseFromErrno() - - # check for timeout - now = time.time() - if now > deadline: - ctypes.set_errno(errno.ETIMEDOUT) - return -1 - - # calculate how much time left and check the delay is not too long - # -- maximum is 20 msecs - difference = (deadline - now) - delay = min(delay, 20e-3, difference) - - # Sleep and increase delay - time.sleep(delay) - delay += 1e-3 - - -class SemLock(object): - """ctypes wrapper to the unix semaphore""" - - _rand = tempfile._RandomNameSequence() - - def __init__(self, kind, value, maxvalue, name=None, unlink_now=False): - self.count = 0 - self.ident = 0 - self.kind = kind - self.maxvalue = maxvalue - self.name = name - self.handle = _sem_open(self.name.encode('ascii'), value) - - def __del__(self): - try: - res = pthread.sem_close(self.handle) - assert res == 0, "Issue while closing semaphores" - except AttributeError: - pass - - def _is_mine(self): - return self.count > 0 and get_ident() == self.ident - - def acquire(self, block=True, timeout=None): - if self.kind == RECURSIVE_MUTEX and self._is_mine(): - self.count += 1 - return True - - if block and timeout is None: - res = pthread.sem_wait(self.handle) - elif not block or timeout <= 0: - res = pthread.sem_trywait(self.handle) - else: - res = _sem_timedwait(self.handle, timeout) - if res < 0: - e = ctypes.get_errno() - if e == errno.EINTR: - return None - elif e in [errno.EAGAIN, errno.ETIMEDOUT]: - return False - raiseFromErrno() - self.count += 1 - self.ident = get_ident() - return True - - def release(self): - if self.kind == RECURSIVE_MUTEX: - assert self._is_mine(), ( - "attempt to release recursive lock not owned by thread") - if self.count > 1: - self.count -= 1 - return - assert self.count == 1 - else: - if sys.platform == 'darwin': - # Handle broken get_value for mac ==> only Lock will work - # as sem_get_value do not work properly - if self.maxvalue == 1: - if pthread.sem_trywait(self.handle) < 0: - e = ctypes.get_errno() - if e != errno.EAGAIN: - raise OSError(e, errno.errorcode[e]) - else: - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - else: - raise 
ValueError( - "semaphore or lock released too many times") - else: - import warnings - warnings.warn("semaphore are broken on OSX, release might " - "increase its maximal value", RuntimeWarning) - else: - value = self._get_value() - if value >= self.maxvalue: - raise ValueError( - "semaphore or lock released too many times") - - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - - self.count -= 1 - - def _get_value(self): - value = ctypes.pointer(ctypes.c_int(-1)) - if pthread.sem_getvalue(self.handle, value) < 0: - raiseFromErrno() - return value.contents.value - - def _count(self): - return self.count - - def _is_zero(self): - if sys.platform == 'darwin': - # Handle broken get_value for mac ==> only Lock will work - # as sem_get_value do not work properly - if pthread.sem_trywait(self.handle) < 0: - e = ctypes.get_errno() - if e == errno.EAGAIN: - return True - raise OSError(e, errno.errorcode[e]) - else: - if pthread.sem_post(self.handle) < 0: - raiseFromErrno() - return False - else: - value = ctypes.pointer(ctypes.c_int(-1)) - if pthread.sem_getvalue(self.handle, value) < 0: - raiseFromErrno() - return value.contents.value == 0 - - def _after_fork(self): - self.count = 0 - - @staticmethod - def _rebuild(handle, kind, maxvalue, name): - self = SemLock.__new__(SemLock) - self.count = 0 - self.ident = 0 - self.kind = kind - self.maxvalue = maxvalue - self.name = name - self.handle = _sem_open(name.encode('ascii')) - return self - - -def raiseFromErrno(): - e = ctypes.get_errno() - raise OSError(e, errno.errorcode[e]) diff --git a/sklearn/externals/joblib/externals/loky/backend/spawn.py b/sklearn/externals/joblib/externals/loky/backend/spawn.py deleted file mode 100644 index d92d189ddc193..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/spawn.py +++ /dev/null @@ -1,223 +0,0 @@ -############################################################################### -# Prepares and processes the data to setup the new process environment -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/spawn.py (17/02/2017) -# * Improve logging data -# -import os -import sys -import runpy -import types -from multiprocessing import process, util - -from sklearn.externals.joblib.externals.loky.backend import context - - -if sys.platform != 'win32': - WINEXE = False - WINSERVICE = False -else: - WINEXE = (sys.platform == 'win32' and getattr(sys, 'frozen', False)) - WINSERVICE = sys.executable.lower().endswith("pythonservice.exe") - -if WINSERVICE: - _python_exe = os.path.join(sys.exec_prefix, 'python.exe') -else: - _python_exe = sys.executable - - -def get_executable(): - return _python_exe - - -def _check_not_importing_main(): - if getattr(process.current_process(), '_inheriting', False): - raise RuntimeError(''' - An attempt has been made to start a new process before the - current process has finished its bootstrapping phase. - - This probably means that you are not using fork to start your - child processes and you have forgotten to use the proper idiom - in the main module: - - if __name__ == '__main__': - freeze_support() - ... 
- - The "freeze_support()" line can be omitted if the program - is not going to be frozen to produce an executable.''') - - -def get_preparation_data(name, init_main_module=True): - ''' - Return info about parent needed by child to unpickle process object - ''' - _check_not_importing_main() - d = dict( - log_to_stderr=util._log_to_stderr, - authkey=bytes(process.current_process().authkey), - ) - - if util._logger is not None: - d['log_level'] = util._logger.getEffectiveLevel() - if len(util._logger.handlers) > 0: - h = util._logger.handlers[0] - d['log_fmt'] = h.formatter._fmt - - sys_path = [p for p in sys.path] - try: - i = sys_path.index('') - except ValueError: - pass - else: - sys_path[i] = process.ORIGINAL_DIR - - d.update( - name=name, - sys_path=sys_path, - sys_argv=sys.argv, - orig_dir=process.ORIGINAL_DIR, - dir=os.getcwd() - ) - - if sys.platform != "win32": - # Pass the semaphore_tracker pid to avoid re-spawning it in every child - from . import semaphore_tracker - semaphore_tracker.ensure_running() - d['tracker_pid'] = semaphore_tracker._semaphore_tracker._pid - - # Figure out whether to initialise main in the subprocess as a module - # or through direct execution (or to leave it alone entirely) - if init_main_module: - main_module = sys.modules['__main__'] - try: - main_mod_name = getattr(main_module.__spec__, "name", None) - except BaseException: - main_mod_name = None - if main_mod_name is not None: - d['init_main_from_name'] = main_mod_name - elif sys.platform != 'win32' or (not WINEXE and not WINSERVICE): - main_path = getattr(main_module, '__file__', None) - if main_path is not None: - if (not os.path.isabs(main_path) and - process.ORIGINAL_DIR is not None): - main_path = os.path.join(process.ORIGINAL_DIR, main_path) - d['init_main_from_path'] = os.path.normpath(main_path) - # Compat for python2.7 - d['main_path'] = d['init_main_from_path'] - - return d - - -# -# Prepare current process -# -old_main_modules = [] - - -def prepare(data): - ''' - Try to get current process ready to unpickle process object - ''' - if 'name' in data: - process.current_process().name = data['name'] - - if 'authkey' in data: - process.current_process().authkey = data['authkey'] - - if 'log_to_stderr' in data and data['log_to_stderr']: - util.log_to_stderr() - - if 'log_level' in data: - util.get_logger().setLevel(data['log_level']) - - if 'log_fmt' in data: - import logging - util.get_logger().handlers[0].setFormatter( - logging.Formatter(data['log_fmt']) - ) - - if 'sys_path' in data: - sys.path = data['sys_path'] - - if 'sys_argv' in data: - sys.argv = data['sys_argv'] - - if 'dir' in data: - os.chdir(data['dir']) - - if 'orig_dir' in data: - process.ORIGINAL_DIR = data['orig_dir'] - - if 'tacker_pid' in data: - from . 
import semaphore_tracker - semaphore_tracker._semaphore_tracker._pid = data["tracker_pid"] - - if 'init_main_from_name' in data: - _fixup_main_from_name(data['init_main_from_name']) - elif 'init_main_from_path' in data: - _fixup_main_from_path(data['init_main_from_path']) - - -# Multiprocessing module helpers to fix up the main module in -# spawned subprocesses -def _fixup_main_from_name(mod_name): - # __main__.py files for packages, directories, zip archives, etc, run - # their "main only" code unconditionally, so we don't even try to - # populate anything in __main__, nor do we make any changes to - # __main__ attributes - current_main = sys.modules['__main__'] - if mod_name == "__main__" or mod_name.endswith(".__main__"): - return - - # If this process was forked, __main__ may already be populated - if getattr(current_main.__spec__, "name", None) == mod_name: - return - - # Otherwise, __main__ may contain some non-main code where we need to - # support unpickling it properly. We rerun it as __mp_main__ and make - # the normal __main__ an alias to that - old_main_modules.append(current_main) - main_module = types.ModuleType("__mp_main__") - main_content = runpy.run_module(mod_name, - run_name="__mp_main__", - alter_sys=True) - main_module.__dict__.update(main_content) - sys.modules['__main__'] = sys.modules['__mp_main__'] = main_module - - -def _fixup_main_from_path(main_path): - # If this process was forked, __main__ may already be populated - current_main = sys.modules['__main__'] - - # Unfortunately, the main ipython launch script historically had no - # "if __name__ == '__main__'" guard, so we work around that - # by treating it like a __main__.py file - # See https://github.com/ipython/ipython/issues/4698 - main_name = os.path.splitext(os.path.basename(main_path))[0] - if main_name == 'ipython': - return - - # Otherwise, if __file__ already has the setting we expect, - # there's nothing more to do - if getattr(current_main, '__file__', None) == main_path: - return - - # If the parent process has sent a path through rather than a module - # name we assume it is an executable script that may contain - # non-main code that needs to be executed - old_main_modules.append(current_main) - main_module = types.ModuleType("__mp_main__") - main_content = runpy.run_path(main_path, - run_name="__mp_main__") - main_module.__dict__.update(main_content) - sys.modules['__main__'] = sys.modules['__mp_main__'] = main_module - - -def import_main_path(main_path): - ''' - Set sys.modules['__main__'] to module at main_path - ''' - _fixup_main_from_path(main_path) diff --git a/sklearn/externals/joblib/externals/loky/backend/synchronize.py b/sklearn/externals/joblib/externals/loky/backend/synchronize.py deleted file mode 100644 index 4773b9dc87c5e..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/synchronize.py +++ /dev/null @@ -1,381 +0,0 @@ -############################################################################### -# Synchronization primitives based on our SemLock implementation -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from multiprocessing/synchronize.py (17/02/2017) -# * Remove ctx argument for compatibility reason -# * Implementation of Condition/Event are necessary for compatibility -# with python2.7/3.3, Barrier should be reimplemented to for those -# version (but it is not used in loky). -# - -import os -import sys -import tempfile -import threading -import _multiprocessing -from time import time as _time - -from .context import assert_spawning -from . 
import semaphore_tracker -from multiprocessing import process -from multiprocessing import util - -__all__ = [ - 'Lock', 'RLock', 'Semaphore', 'BoundedSemaphore', 'Condition', 'Event' - ] -# Try to import the mp.synchronize module cleanly, if it fails -# raise ImportError for platforms lacking a working sem_open implementation. -# See issue 3770 -try: - if sys.version_info < (3, 4): - from .semlock import SemLock as _SemLock - from .semlock import sem_unlink - else: - from _multiprocessing import SemLock as _SemLock - from _multiprocessing import sem_unlink -except (ImportError): - raise ImportError("This platform lacks a functioning sem_open" + - " implementation, therefore, the required" + - " synchronization primitives needed will not" + - " function, see issue 3770.") - -if sys.version_info[:2] < (3, 3): - FileExistsError = OSError - -# -# Constants -# - -RECURSIVE_MUTEX, SEMAPHORE = list(range(2)) -SEM_VALUE_MAX = _multiprocessing.SemLock.SEM_VALUE_MAX - - -# -# Base class for semaphores and mutexes; wraps `_multiprocessing.SemLock` -# - -class SemLock(object): - - _rand = tempfile._RandomNameSequence() - - def __init__(self, kind, value, maxvalue): - # unlink_now is only used on win32 or when we are using fork. - unlink_now = False - for i in range(100): - try: - self._semlock = _SemLock( - kind, value, maxvalue, SemLock._make_name(), - unlink_now) - except FileExistsError: # pragma: no cover - pass - else: - break - else: # pragma: no cover - raise FileExistsError('cannot find name for semaphore') - - util.debug('created semlock with handle %s and name "%s"' - % (self._semlock.handle, self._semlock.name)) - - self._make_methods() - - def _after_fork(obj): - obj._semlock._after_fork() - - util.register_after_fork(self, _after_fork) - - # When the object is garbage collected or the - # process shuts down we unlink the semaphore name - semaphore_tracker.register(self._semlock.name) - util.Finalize(self, SemLock._cleanup, (self._semlock.name,), - exitpriority=0) - - @staticmethod - def _cleanup(name): - sem_unlink(name) - semaphore_tracker.unregister(name) - - def _make_methods(self): - self.acquire = self._semlock.acquire - self.release = self._semlock.release - - def __enter__(self): - return self._semlock.acquire() - - def __exit__(self, *args): - return self._semlock.release() - - def __getstate__(self): - assert_spawning(self) - sl = self._semlock - h = sl.handle - return (h, sl.kind, sl.maxvalue, sl.name) - - def __setstate__(self, state): - self._semlock = _SemLock._rebuild(*state) - util.debug('recreated blocker with handle %r and name "%s"' - % (state[0], state[3])) - self._make_methods() - - @staticmethod - def _make_name(): - # OSX does not support long names for semaphores - return '/loky-%i-%s' % (os.getpid(), next(SemLock._rand)) - - -# -# Semaphore -# - -class Semaphore(SemLock): - - def __init__(self, value=1): - SemLock.__init__(self, SEMAPHORE, value, SEM_VALUE_MAX) - - def get_value(self): - if sys.platform == 'darwin': - raise NotImplementedError("OSX does not implement sem_getvalue") - return self._semlock._get_value() - - def __repr__(self): - try: - value = self._semlock._get_value() - except Exception: - value = 'unknown' - return '<%s(value=%s)>' % (self.__class__.__name__, value) - - -# -# Bounded semaphore -# - -class BoundedSemaphore(Semaphore): - - def __init__(self, value=1): - SemLock.__init__(self, SEMAPHORE, value, value) - - def __repr__(self): - try: - value = self._semlock._get_value() - except Exception: - value = 'unknown' - return '<%s(value=%s, 
maxvalue=%s)>' % \ - (self.__class__.__name__, value, self._semlock.maxvalue) - - -# -# Non-recursive lock -# - -class Lock(SemLock): - - def __init__(self): - super(Lock, self).__init__(SEMAPHORE, 1, 1) - - def __repr__(self): - try: - if self._semlock._is_mine(): - name = process.current_process().name - if threading.current_thread().name != 'MainThread': - name += '|' + threading.current_thread().name - elif self._semlock._get_value() == 1: - name = 'None' - elif self._semlock._count() > 0: - name = 'SomeOtherThread' - else: - name = 'SomeOtherProcess' - except Exception: - name = 'unknown' - return '<%s(owner=%s)>' % (self.__class__.__name__, name) - - -# -# Recursive lock -# - -class RLock(SemLock): - - def __init__(self): - super(RLock, self).__init__(RECURSIVE_MUTEX, 1, 1) - - def __repr__(self): - try: - if self._semlock._is_mine(): - name = process.current_process().name - if threading.current_thread().name != 'MainThread': - name += '|' + threading.current_thread().name - count = self._semlock._count() - elif self._semlock._get_value() == 1: - name, count = 'None', 0 - elif self._semlock._count() > 0: - name, count = 'SomeOtherThread', 'nonzero' - else: - name, count = 'SomeOtherProcess', 'nonzero' - except Exception: - name, count = 'unknown', 'unknown' - return '<%s(%s, %s)>' % (self.__class__.__name__, name, count) - - -# -# Condition variable -# - -class Condition(object): - - def __init__(self, lock=None): - self._lock = lock or RLock() - self._sleeping_count = Semaphore(0) - self._woken_count = Semaphore(0) - self._wait_semaphore = Semaphore(0) - self._make_methods() - - def __getstate__(self): - assert_spawning(self) - return (self._lock, self._sleeping_count, - self._woken_count, self._wait_semaphore) - - def __setstate__(self, state): - (self._lock, self._sleeping_count, - self._woken_count, self._wait_semaphore) = state - self._make_methods() - - def __enter__(self): - return self._lock.__enter__() - - def __exit__(self, *args): - return self._lock.__exit__(*args) - - def _make_methods(self): - self.acquire = self._lock.acquire - self.release = self._lock.release - - def __repr__(self): - try: - num_waiters = (self._sleeping_count._semlock._get_value() - - self._woken_count._semlock._get_value()) - except Exception: - num_waiters = 'unknown' - return '<%s(%s, %s)>' % (self.__class__.__name__, - self._lock, num_waiters) - - def wait(self, timeout=None): - assert self._lock._semlock._is_mine(), \ - 'must acquire() condition before using wait()' - - # indicate that this thread is going to sleep - self._sleeping_count.release() - - # release lock - count = self._lock._semlock._count() - for i in range(count): - self._lock.release() - - try: - # wait for notification or timeout - return self._wait_semaphore.acquire(True, timeout) - finally: - # indicate that this thread has woken - self._woken_count.release() - - # reacquire lock - for i in range(count): - self._lock.acquire() - - def notify(self): - assert self._lock._semlock._is_mine(), 'lock is not owned' - assert not self._wait_semaphore.acquire(False) - - # to take account of timeouts since last notify() we subtract - # woken_count from sleeping_count and rezero woken_count - while self._woken_count.acquire(False): - res = self._sleeping_count.acquire(False) - assert res - - if self._sleeping_count.acquire(False): # try grabbing a sleeper - self._wait_semaphore.release() # wake up one sleeper - self._woken_count.acquire() # wait for the sleeper to wake - - # rezero _wait_semaphore in case a timeout just happened - 
self._wait_semaphore.acquire(False) - - def notify_all(self): - assert self._lock._semlock._is_mine(), 'lock is not owned' - assert not self._wait_semaphore.acquire(False) - - # to take account of timeouts since last notify*() we subtract - # woken_count from sleeping_count and rezero woken_count - while self._woken_count.acquire(False): - res = self._sleeping_count.acquire(False) - assert res - - sleepers = 0 - while self._sleeping_count.acquire(False): - self._wait_semaphore.release() # wake up one sleeper - sleepers += 1 - - if sleepers: - for i in range(sleepers): - self._woken_count.acquire() # wait for a sleeper to wake - - # rezero wait_semaphore in case some timeouts just happened - while self._wait_semaphore.acquire(False): - pass - - def wait_for(self, predicate, timeout=None): - result = predicate() - if result: - return result - if timeout is not None: - endtime = _time() + timeout - else: - endtime = None - waittime = None - while not result: - if endtime is not None: - waittime = endtime - _time() - if waittime <= 0: - break - self.wait(waittime) - result = predicate() - return result - - -# -# Event -# - -class Event(object): - - def __init__(self): - self._cond = Condition(Lock()) - self._flag = Semaphore(0) - - def is_set(self): - with self._cond: - if self._flag.acquire(False): - self._flag.release() - return True - return False - - def set(self): - with self._cond: - self._flag.acquire(False) - self._flag.release() - self._cond.notify_all() - - def clear(self): - with self._cond: - self._flag.acquire(False) - - def wait(self, timeout=None): - with self._cond: - if self._flag.acquire(False): - self._flag.release() - else: - self._cond.wait(timeout) - - if self._flag.acquire(False): - self._flag.release() - return True - return False diff --git a/sklearn/externals/joblib/externals/loky/backend/utils.py b/sklearn/externals/joblib/externals/loky/backend/utils.py deleted file mode 100644 index 4874947b7bb8f..0000000000000 --- a/sklearn/externals/joblib/externals/loky/backend/utils.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -import sys -import time -import errno -import signal -import warnings -import threading -import subprocess -try: - import psutil -except ImportError: - psutil = None - - -WIN32 = sys.platform == "win32" - - -def _flag_current_thread_clean_exit(): - """Put a ``_clean_exit`` flag on the current thread""" - thread = threading.current_thread() - thread._clean_exit = True - - -def recursive_terminate(process, use_psutil=True): - if use_psutil and psutil is not None: - _recursive_terminate_with_psutil(process) - else: - _recursive_terminate_without_psutil(process) - - -def _recursive_terminate_with_psutil(process, retries=5): - try: - children = psutil.Process(process.pid).children(recursive=True) - except psutil.NoSuchProcess: - return - - # Kill the children in reverse order to avoid killing the parents before - # the children in cases where there are more processes nested. - for child in children[::-1]: - try: - child.kill() - except psutil.NoSuchProcess: - pass - - process.terminate() - process.join() - - -def _recursive_terminate_without_psutil(process): - """Terminate a process and its descendants. - """ - try: - _recursive_terminate(process.pid) - except OSError as e: - warnings.warn("Failed to kill subprocesses on this platform. Please" - "install psutil: https://github.com/giampaolo/psutil") - # In case we cannot introspect the children, we fall back to the - # classic Process.terminate. 
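For illustration, a condensed sketch of the psutil-based branch shown just above, which terminates a process tree children-first; it assumes psutil is installed (the removed code falls back to taskkill/pgrep without it):

    # Hedged sketch of the psutil branch of recursive_terminate above.
    import subprocess
    import sys
    import psutil

    def terminate_tree(pid):
        try:
            children = psutil.Process(pid).children(recursive=True)
        except psutil.NoSuchProcess:
            return
        for child in reversed(children):    # deepest descendants first
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass
        try:
            psutil.Process(pid).terminate()
        except psutil.NoSuchProcess:
            pass

    if __name__ == "__main__":
        proc = subprocess.Popen([sys.executable, "-c",
                                 "import time; time.sleep(60)"])
        terminate_tree(proc.pid)
        proc.wait()
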
- process.terminate() - process.join() - - -def _recursive_terminate(pid): - """Recursively kill the descendants of a process before killing it. - """ - - if sys.platform == "win32": - # On windows, the taskkill function with option `/T` terminate a given - # process pid and its children. - try: - subprocess.check_output( - ["taskkill", "/F", "/T", "/PID", str(pid)], - stderr=None) - except subprocess.CalledProcessError as e: - # In windows, taskkill return 1 for permission denied and 128, 255 - # for no process found. - if e.returncode not in [1, 128, 255]: - raise - elif e.returncode == 1: - # Try to kill the process without its descendants if taskkill - # was denied permission. If this fails too, with an error - # different from process not found, let the top level function - # raise a warning and retry to kill the process. - try: - os.kill(pid, signal.SIGTERM) - except OSError as e: - if e.errno != errno.ESRCH: - raise - - else: - try: - children_pids = subprocess.check_output( - ["pgrep", "-P", str(pid)], - stderr=None - ) - except subprocess.CalledProcessError as e: - # `ps` returns 1 when no child process has been found - if e.returncode == 1: - children_pids = b'' - else: - raise - - # Decode the result, split the cpid and remove the trailing line - children_pids = children_pids.decode().split('\n')[:-1] - for cpid in children_pids: - cpid = int(cpid) - _recursive_terminate(cpid) - - try: - os.kill(pid, signal.SIGTERM) - except OSError as e: - # if OSError is raised with [Errno 3] no such process, the process - # is already terminated, else, raise the error and let the top - # level function raise a warning and retry to kill the process. - if e.errno != errno.ESRCH: - raise - - -def get_exitcodes_terminated_worker(processes): - """Return a formated string with the exitcodes of terminated workers. - - If necessary, wait (up to .25s) for the system to correctly set the - exitcode of one terminated worker. - """ - patience = 5 - - # Catch the exitcode of the terminated workers. There should at least be - # one. If not, wait a bit for the system to correctly set the exitcode of - # the terminated worker. - exitcodes = [p.exitcode for p in processes.values() - if p.exitcode is not None] - while len(exitcodes) == 0 and patience > 0: - patience -= 1 - exitcodes = [p.exitcode for p in processes.values() - if p.exitcode is not None] - time.sleep(.05) - - return _format_exitcodes(exitcodes) - - -def _format_exitcodes(exitcodes): - """Format a list of exit code with names of the signals if possible""" - str_exitcodes = ["{}({})".format(_get_exitcode_name(e), e) - for e in exitcodes if e is not None] - return "{" + ", ".join(str_exitcodes) + "}" - - -def _get_exitcode_name(exitcode): - if sys.platform == "win32": - # The exitcode are unreliable on windows (see bpo-31863). - # For this case, return UNKNOWN - return "UNKNOWN" - - if exitcode < 0: - try: - import signal - if sys.version_info > (3, 5): - return signal.Signals(-exitcode).name - - # construct an inverse lookup table - for v, k in signal.__dict__.items(): - if (v.startswith('SIG') and not v.startswith('SIG_') and - k == -exitcode): - return v - except ValueError: - return "UNKNOWN" - elif exitcode != 255: - # The exitcode are unreliable on forkserver were 255 is always returned - # (see bpo-30589). 
For this case, return UNKNOWN - return "EXIT" - - return "UNKNOWN" diff --git a/sklearn/externals/joblib/externals/loky/cloudpickle_wrapper.py b/sklearn/externals/joblib/externals/loky/cloudpickle_wrapper.py deleted file mode 100644 index 9edf9240f21f4..0000000000000 --- a/sklearn/externals/joblib/externals/loky/cloudpickle_wrapper.py +++ /dev/null @@ -1,113 +0,0 @@ -import inspect -from functools import partial - -try: - from sklearn.externals.joblib.externals.cloudpickle import dumps, loads - cloudpickle = True -except ImportError: - cloudpickle = False - - -WRAP_CACHE = dict() - - -class CloudpickledObjectWrapper(object): - def __init__(self, obj, keep_wrapper=False): - self._obj = obj - self._keep_wrapper = keep_wrapper - - def __reduce__(self): - _pickled_object = dumps(self._obj) - if not self._keep_wrapper: - return loads, (_pickled_object,) - - return _reconstruct_wrapper, (_pickled_object, self._keep_wrapper) - - def __getattr__(self, attr): - # Ensure that the wrapped object can be used seemlessly as the - # previous object. - if attr not in ['_obj', '_keep_wrapper']: - return getattr(self._obj, attr) - return getattr(self, attr) - - -# Make sure the wrapped object conserves the callable property -class CallableObjectWrapper(CloudpickledObjectWrapper): - - def __call__(self, *args, **kwargs): - return self._obj(*args, **kwargs) - - -def _wrap_non_picklable_objects(obj, keep_wrapper): - if callable(obj): - return CallableObjectWrapper(obj, keep_wrapper=keep_wrapper) - return CloudpickledObjectWrapper(obj, keep_wrapper=keep_wrapper) - - -def _reconstruct_wrapper(_pickled_object, keep_wrapper): - obj = loads(_pickled_object) - return _wrap_non_picklable_objects(obj, keep_wrapper) - - -def _wrap_objects_when_needed(obj): - # Function to introspect an object and decide if it should be wrapped or - # not. - if not cloudpickle: - return obj - - need_wrap = "__main__" in getattr(obj, "__module__", "") - if isinstance(obj, partial): - return partial( - _wrap_objects_when_needed(obj.func), - *[_wrap_objects_when_needed(a) for a in obj.args], - **{k: _wrap_objects_when_needed(v) - for k, v in obj.keywords.items()} - ) - if callable(obj): - # Need wrap if the object is a function defined in a local scope of - # another function. - func_code = getattr(obj, "__code__", "") - need_wrap |= getattr(func_code, "co_flags", 0) & inspect.CO_NESTED - - # Need wrap if the obj is a lambda expression - func_name = getattr(obj, "__name__", "") - need_wrap |= "" in func_name - - if not need_wrap: - return obj - - wrapped_obj = WRAP_CACHE.get(obj) - if wrapped_obj is None: - wrapped_obj = _wrap_non_picklable_objects(obj, keep_wrapper=False) - WRAP_CACHE[obj] = wrapped_obj - return wrapped_obj - - -def wrap_non_picklable_objects(obj, keep_wrapper=True): - """Wrapper for non-picklable object to use cloudpickle to serialize them. - - Note that this wrapper tends to slow down the serialization process as it - is done with cloudpickle which is typically slower compared to pickle. The - proper way to solve serialization issues is to avoid defining functions and - objects in the main scripts and to implement __reduce__ functions for - complex classes. - """ - if not cloudpickle: - raise ImportError("could not from sklearn.externals.joblib.externals import cloudpickle. Please install " - "cloudpickle to allow extended serialization. 
" - "(`pip install cloudpickle`).") - - # If obj is a class, create a CloudpickledClassWrapper which instantiates - # the object internally and wrap it directly in a CloudpickledObjectWrapper - if inspect.isclass(obj): - class CloudpickledClassWrapper(CloudpickledObjectWrapper): - def __init__(self, *args, **kwargs): - self._obj = obj(*args, **kwargs) - self._keep_wrapper = keep_wrapper - - CloudpickledClassWrapper.__name__ = obj.__name__ - return CloudpickledClassWrapper - - # If obj is an instance of a class, just wrap it in a regular - # CloudpickledObjectWrapper - return _wrap_non_picklable_objects(obj, keep_wrapper=keep_wrapper) diff --git a/sklearn/externals/joblib/externals/loky/process_executor.py b/sklearn/externals/joblib/externals/loky/process_executor.py deleted file mode 100644 index 73672a8aa850f..0000000000000 --- a/sklearn/externals/joblib/externals/loky/process_executor.py +++ /dev/null @@ -1,1113 +0,0 @@ -############################################################################### -# Re-implementation of the ProcessPoolExecutor more robust to faults -# -# author: Thomas Moreau and Olivier Grisel -# -# adapted from concurrent/futures/process_pool_executor.py (17/02/2017) -# * Backport for python2.7/3.3, -# * Add an extra management thread to detect queue_management_thread failures, -# * Improve the shutdown process to avoid deadlocks, -# * Add timeout for workers, -# * More robust pickling process. -# -# Copyright 2009 Brian Quinlan. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -"""Implements ProcessPoolExecutor. - -The follow diagram and text describe the data-flow through the system: - -|======================= In-process =====================|== Out-of-process ==| - -+----------+ +----------+ +--------+ +-----------+ +---------+ -| | => | Work Ids | | | | Call Q | | Process | -| | +----------+ | | +-----------+ | Pool | -| | | ... | | | | ... | +---------+ -| | | 6 | => | | => | 5, call() | => | | -| | | 7 | | | | ... | | | -| Process | | ... | | Local | +-----------+ | Process | -| Pool | +----------+ | Worker | | #1..n | -| Executor | | Thread | | | -| | +----------- + | | +-----------+ | | -| | <=> | Work Items | <=> | | <= | Result Q | <= | | -| | +------------+ | | +-----------+ | | -| | | 6: call() | | | | ... | | | -| | | future | +--------+ | 4, result | | | -| | | ... | | 3, except | | | -+----------+ +------------+ +-----------+ +---------+ - -Executor.submit() called: -- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict -- adds the id of the _WorkItem to the "Work Ids" queue - -Local worker thread: -- reads work ids from the "Work Ids" queue and looks up the corresponding - WorkItem from the "Work Items" dict: if the work item has been cancelled then - it is simply removed from the dict, otherwise it is repackaged as a - _CallItem and put in the "Call Q". New _CallItems are put in the "Call Q" - until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because - calls placed in the "Call Q" can no longer be cancelled with Future.cancel(). 
-- reads _ResultItems from "Result Q", updates the future stored in the - "Work Items" dict and deletes the dict entry - -Process #1..n: -- reads _CallItems from "Call Q", executes the calls, and puts the resulting - _ResultItems in "Result Q" -""" - - -__author__ = 'Thomas Moreau (thomas.moreau.2010@gmail.com)' - - -import os -import gc -import sys -import struct -import weakref -import warnings -import itertools -import traceback -import threading -from time import time -import multiprocessing as mp -from functools import partial -from pickle import PicklingError - -from . import _base -from .backend import get_context -from .backend.compat import queue -from .backend.compat import wait -from .backend.compat import set_cause -from .backend.context import cpu_count -from .backend.queues import Queue, SimpleQueue, Full -from .backend.reduction import set_loky_pickler, get_loky_pickler_name -from .backend.utils import recursive_terminate, get_exitcodes_terminated_worker - -try: - from concurrent.futures.process import BrokenProcessPool as _BPPException -except ImportError: - _BPPException = RuntimeError - - -# Compatibility for python2.7 -if sys.version_info[0] == 2: - ProcessLookupError = OSError - - -# Workers are created as daemon threads and processes. This is done to allow -# the interpreter to exit when there are still idle processes in a -# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However, -# allowing workers to die with the interpreter has two undesirable properties: -# - The workers would still be running during interpreter shutdown, -# meaning that they would fail in unpredictable ways. -# - The workers could be killed while evaluating a work item, which could -# be bad if the callable being evaluated has external side-effects e.g. -# writing to a file. -# -# To work around this problem, an exit handler is installed which tells the -# workers to exit when their work queues are empty and then waits until the -# threads/processes finish. - -_threads_wakeups = weakref.WeakKeyDictionary() -_global_shutdown = False - -# Mechanism to prevent infinite process spawning. When a worker of a -# ProcessPoolExecutor nested in MAX_DEPTH Executor tries to create a new -# Executor, a LokyRecursionError is raised -MAX_DEPTH = int(os.environ.get("LOKY_MAX_DEPTH", 10)) -_CURRENT_DEPTH = 0 - -# Minimum time interval between two consecutive memory leak protection checks. -_MEMORY_LEAK_CHECK_DELAY = 1. - -# Number of bytes of memory usage allowed over the reference process size. -_MAX_MEMORY_LEAK_SIZE = int(1e8) - - -try: - from psutil import Process - _USE_PSUTIL = True - - def _get_memory_usage(pid, force_gc=False): - if force_gc: - gc.collect() - - return Process(pid).memory_info().rss - -except ImportError: - _USE_PSUTIL = False - - -class _ThreadWakeup: - def __init__(self): - self._reader, self._writer = mp.Pipe(duplex=False) - - def close(self): - self._writer.close() - self._reader.close() - - def wakeup(self): - if sys.platform == "win32" and sys.version_info[:2] < (3, 4): - # Compat for python2.7 on windows, where poll return false for - # b"" messages. Use the slightly larger message b"0". 
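The data flow sketched in the diagram above (submit, work item, call queue, worker process, result queue, future) follows the same user-facing contract as the standard library executor that this module re-implements. A minimal round trip, shown here with the stdlib class purely for illustration:

    # submit() enqueues a work item; the result comes back through the
    # result queue and resolves the returned future.
    from concurrent.futures import ProcessPoolExecutor

    def square(x):
        return x * x

    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=2) as executor:
            future = executor.submit(square, 3)
            print(future.result())   # 9
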
- self._writer.send_bytes(b"0") - else: - self._writer.send_bytes(b"") - - def clear(self): - while self._reader.poll(): - self._reader.recv_bytes() - - -class _ExecutorFlags(object): - """necessary references to maintain executor states without preventing gc - - It permits to keep the information needed by queue_management_thread - and crash_detection_thread to maintain the pool without preventing the - garbage collection of unreferenced executors. - """ - def __init__(self): - - self.shutdown = False - self.broken = None - self.kill_workers = False - self.shutdown_lock = threading.Lock() - - def flag_as_shutting_down(self, kill_workers=False): - with self.shutdown_lock: - self.shutdown = True - self.kill_workers = kill_workers - - def flag_as_broken(self, broken): - with self.shutdown_lock: - self.shutdown = True - self.broken = broken - - -def _python_exit(): - global _global_shutdown - _global_shutdown = True - items = list(_threads_wakeups.items()) - mp.util.debug("Interpreter shutting down. Waking up queue_manager_threads " - "{}".format(items)) - for thread, thread_wakeup in items: - if thread.is_alive(): - thread_wakeup.wakeup() - for thread, _ in items: - thread.join() - - -# Module variable to register the at_exit call -process_pool_executor_at_exit = None - -# Controls how many more calls than processes will be queued in the call queue. -# A smaller number will mean that processes spend more time idle waiting for -# work while a larger number will make Future.cancel() succeed less frequently -# (Futures in the call queue cannot be cancelled). -EXTRA_QUEUED_CALLS = 1 - - -class _RemoteTraceback(Exception): - """Embed stringification of remote traceback in local traceback - """ - def __init__(self, tb=None): - self.tb = tb - - def __str__(self): - return self.tb - - -class _ExceptionWithTraceback(BaseException): - - def __init__(self, exc): - tb = getattr(exc, "__traceback__", None) - if tb is None: - _, _, tb = sys.exc_info() - tb = traceback.format_exception(type(exc), exc, tb) - tb = ''.join(tb) - self.exc = exc - self.tb = '\n"""\n%s"""' % tb - - def __reduce__(self): - return _rebuild_exc, (self.exc, self.tb) - - -def _rebuild_exc(exc, tb): - exc = set_cause(exc, _RemoteTraceback(tb)) - return exc - - -class _WorkItem(object): - - __slots__ = ["future", "fn", "args", "kwargs"] - - def __init__(self, future, fn, args, kwargs): - self.future = future - self.fn = fn - self.args = args - self.kwargs = kwargs - - -class _ResultItem(object): - - def __init__(self, work_id, exception=None, result=None): - self.work_id = work_id - self.exception = exception - self.result = result - - -class _CallItem(object): - - def __init__(self, work_id, fn, args, kwargs): - self.work_id = work_id - self.fn = fn - self.args = args - self.kwargs = kwargs - - # Store the current loky_pickler so it is correctly set in the worker - self.loky_pickler = get_loky_pickler_name() - - def __call__(self): - set_loky_pickler(self.loky_pickler) - return self.fn(*self.args, **self.kwargs) - - def __repr__(self): - return "CallItem({}, {}, {}, {})".format( - self.work_id, self.fn, self.args, self.kwargs) - - -class _SafeQueue(Queue): - """Safe Queue set exception to the future object linked to a job""" - def __init__(self, max_size=0, ctx=None, pending_work_items=None, - running_work_items=None, thread_wakeup=None, reducers=None): - self.thread_wakeup = thread_wakeup - self.pending_work_items = pending_work_items - self.running_work_items = running_work_items - super(_SafeQueue, self).__init__(max_size, 
reducers=reducers, ctx=ctx) - - def _on_queue_feeder_error(self, e, obj): - if isinstance(obj, _CallItem): - # format traceback only works on python3 - if isinstance(e, struct.error): - raised_error = RuntimeError( - "The task could not be sent to the workers as it is too " - "large for `send_bytes`.") - else: - raised_error = PicklingError( - "Could not pickle the task to send it to the workers.") - tb = traceback.format_exception( - type(e), e, getattr(e, "__traceback__", None)) - raised_error = set_cause(raised_error, _RemoteTraceback( - '\n"""\n{}"""'.format(''.join(tb)))) - work_item = self.pending_work_items.pop(obj.work_id, None) - self.running_work_items.remove(obj.work_id) - # work_item can be None if another process terminated. In this - # case, the queue_manager_thread fails all work_items with - # BrokenProcessPool - if work_item is not None: - work_item.future.set_exception(raised_error) - del work_item - self.thread_wakeup.wakeup() - else: - super(_SafeQueue, self)._on_queue_feeder_error(e, obj) - - -def _get_chunks(chunksize, *iterables): - """Iterates over zip()ed iterables in chunks. """ - if sys.version_info < (3, 3): - it = itertools.izip(*iterables) - else: - it = zip(*iterables) - while True: - chunk = tuple(itertools.islice(it, chunksize)) - if not chunk: - return - yield chunk - - -def _process_chunk(fn, chunk): - """Processes a chunk of an iterable passed to map. - - Runs the function passed to map() on a chunk of the - iterable passed to map. - - This function is run in a separate process. - - """ - return [fn(*args) for args in chunk] - - -def _sendback_result(result_queue, work_id, result=None, exception=None): - """Safely send back the given result or exception""" - try: - result_queue.put(_ResultItem(work_id, result=result, - exception=exception)) - except BaseException as e: - exc = _ExceptionWithTraceback(e) - result_queue.put(_ResultItem(work_id, exception=exc)) - - -def _process_worker(call_queue, result_queue, initializer, initargs, - processes_management_lock, timeout, worker_exit_lock, - current_depth): - """Evaluates calls from call_queue and places the results in result_queue. - - This worker is run in a separate process. - - Args: - call_queue: A ctx.Queue of _CallItems that will be read and - evaluated by the worker. - result_queue: A ctx.Queue of _ResultItems that will written - to by the worker. - initializer: A callable initializer, or None - initargs: A tuple of args for the initializer - process_management_lock: A ctx.Lock avoiding worker timeout while some - workers are being spawned. - timeout: maximum time to wait for a new item in the call_queue. If that - time is expired, the worker will shutdown. - worker_exit_lock: Lock to avoid flagging the executor as broken on - workers timeout. - current_depth: Nested parallelism level, to avoid infinite spawning. 
- """ - if initializer is not None: - try: - initializer(*initargs) - except BaseException: - _base.LOGGER.critical('Exception in initializer:', exc_info=True) - # The parent will notice that the process stopped and - # mark the pool broken - return - - # set the global _CURRENT_DEPTH mechanism to limit recursive call - global _CURRENT_DEPTH - _CURRENT_DEPTH = current_depth - _process_reference_size = None - _last_memory_leak_check = None - pid = os.getpid() - - mp.util.debug('Worker started with timeout=%s' % timeout) - while True: - try: - call_item = call_queue.get(block=True, timeout=timeout) - if call_item is None: - mp.util.info("Shutting down worker on sentinel") - except queue.Empty: - mp.util.info("Shutting down worker after timeout %0.3fs" - % timeout) - if processes_management_lock.acquire(block=False): - processes_management_lock.release() - call_item = None - else: - mp.util.info("Could not acquire processes_management_lock") - continue - except BaseException as e: - previous_tb = traceback.format_exc() - try: - result_queue.put(_RemoteTraceback(previous_tb)) - except BaseException: - # If we cannot format correctly the exception, at least print - # the traceback. - print(previous_tb) - sys.exit(1) - if call_item is None: - # Notify queue management thread about clean worker shutdown - result_queue.put(pid) - with worker_exit_lock: - return - try: - r = call_item() - except BaseException as e: - exc = _ExceptionWithTraceback(e) - result_queue.put(_ResultItem(call_item.work_id, exception=exc)) - else: - _sendback_result(result_queue, call_item.work_id, result=r) - del r - - # Free the resource as soon as possible, to avoid holding onto - # open files or shared memory that is not needed anymore - del call_item - - if _USE_PSUTIL: - if _process_reference_size is None: - # Make reference measurement after the first call - _process_reference_size = _get_memory_usage(pid, force_gc=True) - _last_memory_leak_check = time() - continue - if time() - _last_memory_leak_check > _MEMORY_LEAK_CHECK_DELAY: - mem_usage = _get_memory_usage(pid) - _last_memory_leak_check = time() - if mem_usage - _process_reference_size < _MAX_MEMORY_LEAK_SIZE: - # Memory usage stays within bounds: everything is fine. - continue - - # Check again memory usage; this time take the measurement - # after a forced garbage collection to break any reference - # cycles. - mem_usage = _get_memory_usage(pid, force_gc=True) - _last_memory_leak_check = time() - if mem_usage - _process_reference_size < _MAX_MEMORY_LEAK_SIZE: - # The GC managed to free the memory: everything is fine. - continue - - # The process is leaking memory: let the master process - # know that we need to start a new worker. - mp.util.info("Memory leak detected: shutting down worker") - result_queue.put(pid) - with worker_exit_lock: - return - else: - # if psutil is not installed, trigger gc.collect events - # regularly to limit potential memory leaks due to reference cycles - if ((_last_memory_leak_check is None) or - (time() - _last_memory_leak_check > - _MEMORY_LEAK_CHECK_DELAY)): - gc.collect() - _last_memory_leak_check = time() - - -def _add_call_item_to_queue(pending_work_items, - running_work_items, - work_ids, - call_queue): - """Fills call_queue with _WorkItems from pending_work_items. - - This function never blocks. - - Args: - pending_work_items: A dict mapping work ids to _WorkItems e.g. - {5: <_WorkItem...>, 6: <_WorkItem...>, ...} - work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). 
Work ids - are consumed and the corresponding _WorkItems from - pending_work_items are transformed into _CallItems and put in - call_queue. - call_queue: A ctx.Queue that will be filled with _CallItems - derived from _WorkItems. - """ - while True: - if call_queue.full(): - return - try: - work_id = work_ids.get(block=False) - except queue.Empty: - return - else: - work_item = pending_work_items[work_id] - - if work_item.future.set_running_or_notify_cancel(): - running_work_items += [work_id] - call_queue.put(_CallItem(work_id, - work_item.fn, - work_item.args, - work_item.kwargs), - block=True) - else: - del pending_work_items[work_id] - continue - - -def _queue_management_worker(executor_reference, - executor_flags, - processes, - pending_work_items, - running_work_items, - work_ids_queue, - call_queue, - result_queue, - thread_wakeup, - processes_management_lock): - """Manages the communication between this process and the worker processes. - - This function is run in a local thread. - - Args: - executor_reference: A weakref.ref to the ProcessPoolExecutor that owns - this thread. Used to determine if the ProcessPoolExecutor has been - garbage collected and that this function can exit. - executor_flags: A ExecutorFlags holding internal states of the - ProcessPoolExecutor. It permits to know if the executor is broken - even the object has been gc. - process: A list of the ctx.Process instances used as - workers. - pending_work_items: A dict mapping work ids to _WorkItems e.g. - {5: <_WorkItem...>, 6: <_WorkItem...>, ...} - work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]). - call_queue: A ctx.Queue that will be filled with _CallItems - derived from _WorkItems for processing by the process workers. - result_queue: A ctx.SimpleQueue of _ResultItems generated by the - process workers. - thread_wakeup: A _ThreadWakeup to allow waking up the - queue_manager_thread from the main Thread and avoid deadlocks - caused by permanently locked queues. - """ - executor = None - - def is_shutting_down(): - # No more work items can be added if: - # - The interpreter is shutting down OR - # - The executor that own this worker is not broken AND - # * The executor that owns this worker has been collected OR - # * The executor that owns this worker has been shutdown. - # If the executor is broken, it should be detected in the next loop. - return (_global_shutdown or - ((executor is None or executor_flags.shutdown) - and not executor_flags.broken)) - - def shutdown_all_workers(): - mp.util.debug("queue management thread shutting down") - executor_flags.flag_as_shutting_down() - # Create a list to avoid RuntimeError due to concurrent modification of - # processes. nb_children_alive is thus an upper bound. Also release the - # processes' _worker_exit_lock to accelerate the shutdown procedure, as - # there is no need for hand-shake here. - with processes_management_lock: - n_children_alive = 0 - for p in list(processes.values()): - p._worker_exit_lock.release() - n_children_alive += 1 - n_children_to_stop = n_children_alive - n_sentinels_sent = 0 - # Send the right number of sentinels, to make sure all children are - # properly terminated. 
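The worker loop above recycles a process when its resident memory grows by more than _MAX_MEMORY_LEAK_SIZE over a reference measurement. A small sketch of that measurement, assuming psutil is available (without it the removed code only calls gc.collect() periodically):

    # Sketch of the RSS-based leak check used by the removed worker loop.
    import gc
    import os
    import psutil

    _MAX_MEMORY_LEAK_SIZE = int(1e8)       # 100 MB, same constant as above

    def rss(pid, force_gc=False):
        if force_gc:
            gc.collect()                   # break reference cycles first
        return psutil.Process(pid).memory_info().rss

    pid = os.getpid()
    reference = rss(pid, force_gc=True)    # taken after the first task
    # ... run some tasks ...
    if rss(pid) - reference > _MAX_MEMORY_LEAK_SIZE:
        print("worker would be recycled")  # the real worker returns its pid
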
- while n_sentinels_sent < n_children_to_stop and n_children_alive > 0: - for i in range(n_children_to_stop - n_sentinels_sent): - try: - call_queue.put_nowait(None) - n_sentinels_sent += 1 - except Full: - break - with processes_management_lock: - n_children_alive = sum( - p.is_alive() for p in list(processes.values()) - ) - - # Release the queue's resources as soon as possible. Flag the feeder - # thread for clean exit to avoid having the crash detection thread flag - # the Executor as broken during the shutdown. This is safe as either: - # * We don't need to communicate with the workers anymore - # * There is nothing left in the Queue buffer except None sentinels - mp.util.debug("closing call_queue") - call_queue.close() - - mp.util.debug("joining processes") - # If .join() is not called on the created processes then - # some ctx.Queue methods may deadlock on Mac OS X. - while processes: - _, p = processes.popitem() - p.join() - mp.util.debug("queue management thread clean shutdown of worker " - "processes: {}".format(list(processes))) - - result_reader = result_queue._reader - wakeup_reader = thread_wakeup._reader - readers = [result_reader, wakeup_reader] - - while True: - _add_call_item_to_queue(pending_work_items, - running_work_items, - work_ids_queue, - call_queue) - # Wait for a result to be ready in the result_queue while checking - # that all worker processes are still running, or for a wake up - # signal send. The wake up signals come either from new tasks being - # submitted, from the executor being shutdown/gc-ed, or from the - # shutdown of the python interpreter. - worker_sentinels = [p.sentinel for p in processes.values()] - ready = wait(readers + worker_sentinels) - - broken = ("A worker process managed by the executor was unexpectedly " - "terminated. This could be caused by a segmentation fault " - "while calling the function or by an excessive memory usage " - "causing the Operating System to kill the worker.", None, - TerminatedWorkerError) - if result_reader in ready: - try: - result_item = result_reader.recv() - broken = None - if isinstance(result_item, _RemoteTraceback): - broken = ("A task has failed to un-serialize. Please " - "ensure that the arguments of the function are " - "all picklable.", result_item.tb, - BrokenProcessPool) - except BaseException as e: - tb = getattr(e, "__traceback__", None) - if tb is None: - _, _, tb = sys.exc_info() - broken = ("A result has failed to un-serialize. Please " - "ensure that the objects returned by the function " - "are always picklable.", - traceback.format_exception(type(e), e, tb), - BrokenProcessPool) - elif wakeup_reader in ready: - broken = None - result_item = None - thread_wakeup.clear() - if broken is not None: - msg, cause_tb, exc_type = broken - if (issubclass(exc_type, TerminatedWorkerError) and - (sys.platform != "win32")): - # In Windows, introspecting terminated workers exitcodes seems - # unstable, therefore they are not appended in the exception - # message. - msg += " The exit codes of the workers are {}".format( - get_exitcodes_terminated_worker(processes)) - - bpe = exc_type(msg) - if cause_tb is not None: - bpe = set_cause(bpe, _RemoteTraceback( - "\n'''\n{}'''".format(''.join(cause_tb)))) - # Mark the process pool broken so that submits fail right now. - executor_flags.flag_as_broken(bpe) - - # All futures in flight must be marked failed - for work_id, work_item in pending_work_items.items(): - work_item.future.set_exception(bpe) - # Delete references to object. 
See issue16284 - del work_item - pending_work_items.clear() - - # Terminate remaining workers forcibly: the queues or their - # locks may be in a dirty state and block forever. - while processes: - _, p = processes.popitem() - mp.util.debug('terminate process {}'.format(p.name)) - try: - recursive_terminate(p) - except ProcessLookupError: # pragma: no cover - pass - - shutdown_all_workers() - return - if isinstance(result_item, int): - # Clean shutdown of a worker using its PID, either on request - # by the executor.shutdown method or by the timeout of the worker - # itself: we should not mark the executor as broken. - with processes_management_lock: - p = processes.pop(result_item, None) - - # p can be None is the executor is concurrently shutting down. - if p is not None: - p._worker_exit_lock.release() - p.join() - del p - - # Make sure the executor have the right number of worker, even if a - # worker timeout while some jobs were submitted. If some work is - # pending or there is less processes than running items, we need to - # start a new Process and raise a warning. - n_pending = len(pending_work_items) - n_running = len(running_work_items) - if (n_pending - n_running > 0 or n_running > len(processes)): - executor = executor_reference() - if (executor is not None - and len(processes) < executor._max_workers): - warnings.warn( - "A worker stopped while some jobs were given to the " - "executor. This can be caused by a too short worker " - "timeout or by a memory leak.", UserWarning - ) - executor._adjust_process_count() - executor = None - - elif result_item is not None: - work_item = pending_work_items.pop(result_item.work_id, None) - # work_item can be None if another process terminated - if work_item is not None: - if result_item.exception: - work_item.future.set_exception(result_item.exception) - else: - work_item.future.set_result(result_item.result) - # Delete references to object. See issue16284 - del work_item - running_work_items.remove(result_item.work_id) - # Delete reference to result_item - del result_item - - # Check whether we should start shutting down. - executor = executor_reference() - # No more work items can be added if: - # - The interpreter is shutting down OR - # - The executor that owns this worker has been collected OR - # - The executor that owns this worker has been shutdown. - if is_shutting_down(): - # bpo-33097: Make sure that the executor is flagged as shutting - # down even if it is shutdown by the interpreter exiting. - with executor_flags.shutdown_lock: - executor_flags.shutdown = True - if executor_flags.kill_workers: - while pending_work_items: - _, work_item = pending_work_items.popitem() - work_item.future.set_exception(ShutdownExecutorError( - "The Executor was shutdown before this job could " - "complete.")) - del work_item - # Terminate remaining workers forcibly: the queues or their - # locks may be in a dirty state and block forever. - while processes: - _, p = processes.popitem() - recursive_terminate(p) - shutdown_all_workers() - return - # Since no new work items can be added, it is safe to shutdown - # this thread if there are no pending work items. 
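The "worker terminated unexpectedly" path handled above surfaces to the caller as an exception on the pending futures. A hedged illustration using the stdlib executor, which raises its own BrokenProcessPool in the same situation; the loky code above raises TerminatedWorkerError, a subclass of its BrokenProcessPool:

    # Simulate an abrupt worker death and observe the broken-pool error.
    import os
    from concurrent.futures import ProcessPoolExecutor
    from concurrent.futures.process import BrokenProcessPool

    def crash():
        os._exit(1)          # stands in for a segfault or an OOM kill

    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=1) as executor:
            future = executor.submit(crash)
            try:
                future.result()
            except BrokenProcessPool as exc:
                print("pool is broken:", exc)
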
- if not pending_work_items: - shutdown_all_workers() - return - elif executor_flags.broken: - return - executor = None - - -_system_limits_checked = False -_system_limited = None - - -def _check_system_limits(): - global _system_limits_checked, _system_limited - if _system_limits_checked: - if _system_limited: - raise NotImplementedError(_system_limited) - _system_limits_checked = True - try: - nsems_max = os.sysconf("SC_SEM_NSEMS_MAX") - except (AttributeError, ValueError): - # sysconf not available or setting not available - return - if nsems_max == -1: - # undetermined limit, assume that limit is determined - # by available memory only - return - if nsems_max >= 256: - # minimum number of semaphores available - # according to POSIX - return - _system_limited = ("system provides too few semaphores (%d available, " - "256 necessary)" % nsems_max) - raise NotImplementedError(_system_limited) - - -def _chain_from_iterable_of_lists(iterable): - """ - Specialized implementation of itertools.chain.from_iterable. - Each item in *iterable* should be a list. This function is - careful not to keep references to yielded objects. - """ - for element in iterable: - element.reverse() - while element: - yield element.pop() - - -def _check_max_depth(context): - # Limit the maxmal recursion level - global _CURRENT_DEPTH - if context.get_start_method() == "fork" and _CURRENT_DEPTH > 0: - raise LokyRecursionError( - "Could not spawn extra nested processes at depth superior to " - "MAX_DEPTH=1. It is not possible to increase this limit when " - "using the 'fork' start method.") - - if 0 < MAX_DEPTH and _CURRENT_DEPTH + 1 > MAX_DEPTH: - raise LokyRecursionError( - "Could not spawn extra nested processes at depth superior to " - "MAX_DEPTH={}. If this is intendend, you can change this limit " - "with the LOKY_MAX_DEPTH environment variable.".format(MAX_DEPTH)) - - -class LokyRecursionError(RuntimeError): - """Raised when a process try to spawn too many levels of nested processes. - """ - - -class BrokenProcessPool(_BPPException): - """ - Raised when the executor is broken while a future was in the running state. - The cause can an error raised when unpickling the task in the worker - process or when unpickling the result value in the parent process. It can - also be caused by a worker process being terminated unexpectedly. - """ - - -class TerminatedWorkerError(BrokenProcessPool): - """ - Raised when a process in a ProcessPoolExecutor terminated abruptly - while a future was in the running state. - """ - - -# Alias for backward compat (for code written for loky 1.1.4 and earlier). Do -# not use in new code. -BrokenExecutor = BrokenProcessPool - - -class ShutdownExecutorError(RuntimeError): - - """ - Raised when a ProcessPoolExecutor is shutdown while a future was in the - running or pending state. - """ - - -class ProcessPoolExecutor(_base.Executor): - - _at_exit = None - - def __init__(self, max_workers=None, job_reducers=None, - result_reducers=None, timeout=None, context=None, - initializer=None, initargs=()): - """Initializes a new ProcessPoolExecutor instance. - - Args: - max_workers: int, optional (default: cpu_count()) - The maximum number of processes that can be used to execute the - given calls. If None or not given then as many worker processes - will be created as the number of CPUs the current process - can use. - job_reducers, result_reducers: dict(type: reducer_func) - Custom reducer for pickling the jobs and the results from the - Executor. 
If only `job_reducers` is provided, `result_reducer` - will use the same reducers - timeout: int, optional (default: None) - Idle workers exit after timeout seconds. If a new job is - submitted after the timeout, the executor will start enough - new Python processes to make sure the pool of workers is full. - context: A multiprocessing context to launch the workers. This - object should provide SimpleQueue, Queue and Process. - initializer: An callable used to initialize worker processes. - initargs: A tuple of arguments to pass to the initializer. - """ - _check_system_limits() - - if max_workers is None: - self._max_workers = cpu_count() - else: - if max_workers <= 0: - raise ValueError("max_workers must be greater than 0") - self._max_workers = max_workers - - if context is None: - context = get_context() - self._context = context - - if initializer is not None and not callable(initializer): - raise TypeError("initializer must be a callable") - self._initializer = initializer - self._initargs = initargs - - _check_max_depth(self._context) - - if result_reducers is None: - result_reducers = job_reducers - - # Timeout - self._timeout = timeout - - # Internal variables of the ProcessPoolExecutor - self._processes = {} - self._queue_count = 0 - self._pending_work_items = {} - self._running_work_items = [] - self._work_ids = queue.Queue() - self._processes_management_lock = self._context.Lock() - self._queue_management_thread = None - - # _ThreadWakeup is a communication channel used to interrupt the wait - # of the main loop of queue_manager_thread from another thread (e.g. - # when calling executor.submit or executor.shutdown). We do not use the - # _result_queue to send the wakeup signal to the queue_manager_thread - # as it could result in a deadlock if a worker process dies with the - # _result_queue write lock still acquired. - self._queue_management_thread_wakeup = _ThreadWakeup() - - # Flag to hold the state of the Executor. This permits to introspect - # the Executor state even once it has been garbage collected. - self._flags = _ExecutorFlags() - - # Finally setup the queues for interprocess communication - self._setup_queues(job_reducers, result_reducers) - - mp.util.debug('ProcessPoolExecutor is setup') - - def _setup_queues(self, job_reducers, result_reducers, queue_size=None): - # Make the call queue slightly larger than the number of processes to - # prevent the worker processes from idling. But don't make it too big - # because futures in the call queue cannot be cancelled. - if queue_size is None: - queue_size = 2 * self._max_workers + EXTRA_QUEUED_CALLS - self._call_queue = _SafeQueue( - max_size=queue_size, pending_work_items=self._pending_work_items, - running_work_items=self._running_work_items, - thread_wakeup=self._queue_management_thread_wakeup, - reducers=job_reducers, ctx=self._context) - # Killed worker processes can produce spurious "broken pipe" - # tracebacks in the queue's own worker thread. But we detect killed - # processes anyway, so silence the tracebacks. - self._call_queue._ignore_epipe = True - - self._result_queue = SimpleQueue(reducers=result_reducers, - ctx=self._context) - - def _start_queue_management_thread(self): - if self._queue_management_thread is None: - mp.util.debug('_start_queue_management_thread called') - - # When the executor gets garbarge collected, the weakref callback - # will wake up the queue management thread so that it can terminate - # if there is no pending work item. 
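A minimal sketch of the constructor arguments documented above (worker timeout plus an initializer run in every spawned worker), assuming the standalone loky package is installed rather than the vendored copy removed here:

    # Assumes `pip install loky`; argument names follow the docstring above.
    from loky import ProcessPoolExecutor

    def init_worker(level):
        # Runs once in each freshly spawned worker process.
        print("worker configured with level", level)

    def work(x):
        return 2 * x

    if __name__ == "__main__":
        executor = ProcessPoolExecutor(max_workers=2, timeout=10,
                                       initializer=init_worker, initargs=(3,))
        print(executor.submit(work, 21).result())   # 42
        executor.shutdown(wait=True)
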
- def weakref_cb(_, - thread_wakeup=self._queue_management_thread_wakeup): - mp.util.debug('Executor collected: triggering callback for' - ' QueueManager wakeup') - thread_wakeup.wakeup() - - # Start the processes so that their sentinels are known. - self._queue_management_thread = threading.Thread( - target=_queue_management_worker, - args=(weakref.ref(self, weakref_cb), - self._flags, - self._processes, - self._pending_work_items, - self._running_work_items, - self._work_ids, - self._call_queue, - self._result_queue, - self._queue_management_thread_wakeup, - self._processes_management_lock), - name="QueueManagerThread") - self._queue_management_thread.daemon = True - self._queue_management_thread.start() - - # register this executor in a mechanism that ensures it will wakeup - # when the interpreter is exiting. - _threads_wakeups[self._queue_management_thread] = \ - self._queue_management_thread_wakeup - - global process_pool_executor_at_exit - if process_pool_executor_at_exit is None: - # Ensure that the _python_exit function will be called before - # the multiprocessing.Queue._close finalizers which have an - # exitpriority of 10. - process_pool_executor_at_exit = mp.util.Finalize( - None, _python_exit, exitpriority=20) - - def _adjust_process_count(self): - for _ in range(len(self._processes), self._max_workers): - worker_exit_lock = self._context.BoundedSemaphore(1) - worker_exit_lock.acquire() - p = self._context.Process( - target=_process_worker, - args=(self._call_queue, - self._result_queue, - self._initializer, - self._initargs, - self._processes_management_lock, - self._timeout, - worker_exit_lock, - _CURRENT_DEPTH + 1)) - p._worker_exit_lock = worker_exit_lock - p.start() - self._processes[p.pid] = p - mp.util.debug('Adjust process count : {}'.format(self._processes)) - - def _ensure_executor_running(self): - """ensures all workers and management thread are running - """ - with self._processes_management_lock: - if len(self._processes) != self._max_workers: - self._adjust_process_count() - self._start_queue_management_thread() - - def submit(self, fn, *args, **kwargs): - with self._flags.shutdown_lock: - if self._flags.broken is not None: - raise self._flags.broken - if self._flags.shutdown: - raise ShutdownExecutorError( - 'cannot schedule new futures after shutdown') - - # Cannot submit a new calls once the interpreter is shutting down. - # This check avoids spawning new processes at exit. - if _global_shutdown: - raise RuntimeError('cannot schedule new futures after ' - 'interpreter shutdown') - - f = _base.Future() - w = _WorkItem(f, fn, args, kwargs) - - self._pending_work_items[self._queue_count] = w - self._work_ids.put(self._queue_count) - self._queue_count += 1 - # Wake up queue management thread - self._queue_management_thread_wakeup.wakeup() - - self._ensure_executor_running() - return f - submit.__doc__ = _base.Executor.submit.__doc__ - - def map(self, fn, *iterables, **kwargs): - """Returns an iterator equivalent to map(fn, iter). - - Args: - fn: A callable that will take as many arguments as there are - passed iterables. - timeout: The maximum number of seconds to wait. If None, then there - is no limit on the wait time. - chunksize: If greater than one, the iterables will be chopped into - chunks of size chunksize and submitted to the process pool. - If set to one, the items in the list will be sent one at a - time. - - Returns: - An iterator equivalent to: map(func, *iterables) but the calls may - be evaluated out-of-order. 
- - Raises: - TimeoutError: If the entire result iterator could not be generated - before the given timeout. - Exception: If fn(*args) raises for any values. - """ - timeout = kwargs.get('timeout', None) - chunksize = kwargs.get('chunksize', 1) - if chunksize < 1: - raise ValueError("chunksize must be >= 1.") - - results = super(ProcessPoolExecutor, self).map( - partial(_process_chunk, fn), _get_chunks(chunksize, *iterables), - timeout=timeout) - return _chain_from_iterable_of_lists(results) - - def shutdown(self, wait=True, kill_workers=False): - mp.util.debug('shutting down executor %s' % self) - - self._flags.flag_as_shutting_down(kill_workers) - qmt = self._queue_management_thread - qmtw = self._queue_management_thread_wakeup - if qmt: - self._queue_management_thread = None - if qmtw: - self._queue_management_thread_wakeup = None - # Wake up queue management thread - if qmtw is not None: - try: - qmtw.wakeup() - except OSError: - # Can happen in case of concurrent calls to shutdown. - pass - if wait: - qmt.join() - - cq = self._call_queue - if cq: - self._call_queue = None - cq.close() - if wait: - cq.join_thread() - self._result_queue = None - self._processes_management_lock = None - - if qmtw: - try: - qmtw.close() - except OSError: - # Can happen in case of concurrent calls to shutdown. - pass - shutdown.__doc__ = _base.Executor.shutdown.__doc__ diff --git a/sklearn/externals/joblib/externals/loky/reusable_executor.py b/sklearn/externals/joblib/externals/loky/reusable_executor.py deleted file mode 100644 index 30b217fd4113c..0000000000000 --- a/sklearn/externals/joblib/externals/loky/reusable_executor.py +++ /dev/null @@ -1,205 +0,0 @@ -############################################################################### -# Reusable ProcessPoolExecutor -# -# author: Thomas Moreau and Olivier Grisel -# -import time -import warnings -import threading -import multiprocessing as mp - -from .process_executor import ProcessPoolExecutor, EXTRA_QUEUED_CALLS -from .backend.context import cpu_count -from .backend import get_context - -__all__ = ['get_reusable_executor'] - -# Python 2 compat helper -STRING_TYPE = type("") - -# Singleton executor and id management -_executor_lock = threading.RLock() -_next_executor_id = 0 -_executor = None -_executor_args = None - - -def _get_next_executor_id(): - """Ensure that each successive executor instance has a unique, monotonic id. - - The purpose of this monotonic id is to help debug and test automated - instance creation. - """ - global _next_executor_id - with _executor_lock: - executor_id = _next_executor_id - _next_executor_id += 1 - return executor_id - - -def get_reusable_executor(max_workers=None, context=None, timeout=10, - kill_workers=False, reuse="auto", - job_reducers=None, result_reducers=None, - initializer=None, initargs=()): - """Return the current ReusableExectutor instance. - - Start a new instance if it has not been started already or if the previous - instance was left in a broken state. - - If the previous instance does not have the requested number of workers, the - executor is dynamically resized to adjust the number of workers prior to - returning. - - Reusing a singleton instance spares the overhead of starting new worker - processes and importing common python packages each time. - - ``max_workers`` controls the maximum number of tasks that can be running in - parallel in worker processes. By default this is set to the number of - CPUs on the host. 
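The chunksize behaviour documented in the map() docstring above also exists on the stdlib executor, which is used here for a short, self-contained illustration: each chunk is shipped to a worker as a single task.

    from concurrent.futures import ProcessPoolExecutor

    def cube(x):
        return x ** 3

    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=2) as executor:
            # 100 inputs are sent as 10 tasks of 10 items each.
            results = list(executor.map(cube, range(100), chunksize=10))
        print(results[:5])
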
- - Setting ``timeout`` (in seconds) makes idle workers automatically shutdown - so as to release system resources. New workers are respawn upon submission - of new tasks so that ``max_workers`` are available to accept the newly - submitted tasks. Setting ``timeout`` to around 100 times the time required - to spawn new processes and import packages in them (on the order of 100ms) - ensures that the overhead of spawning workers is negligible. - - Setting ``kill_workers=True`` makes it possible to forcibly interrupt - previously spawned jobs to get a new instance of the reusable executor - with new constructor argument values. - - The ``job_reducers`` and ``result_reducers`` are used to customize the - pickling of tasks and results send to the executor. - - When provided, the ``initializer`` is run first in newly spawned - processes with argument ``initargs``. - """ - with _executor_lock: - global _executor, _executor_kwargs - executor = _executor - - if max_workers is None: - if reuse is True and executor is not None: - max_workers = executor._max_workers - else: - max_workers = cpu_count() - elif max_workers <= 0: - raise ValueError( - "max_workers must be greater than 0, got {}." - .format(max_workers)) - - if isinstance(context, STRING_TYPE): - context = get_context(context) - if context is not None and context.get_start_method() == "fork": - raise ValueError("Cannot use reusable executor with the 'fork' " - "context") - - kwargs = dict(context=context, timeout=timeout, - job_reducers=job_reducers, - result_reducers=result_reducers, - initializer=initializer, initargs=initargs) - if executor is None: - mp.util.debug("Create a executor with max_workers={}." - .format(max_workers)) - executor_id = _get_next_executor_id() - _executor_kwargs = kwargs - _executor = executor = _ReusablePoolExecutor( - _executor_lock, max_workers=max_workers, - executor_id=executor_id, **kwargs) - else: - if reuse == 'auto': - reuse = kwargs == _executor_kwargs - if (executor._flags.broken or executor._flags.shutdown - or not reuse): - if executor._flags.broken: - reason = "broken" - elif executor._flags.shutdown: - reason = "shutdown" - else: - reason = "arguments have changed" - mp.util.debug( - "Creating a new executor with max_workers={} as the " - "previous instance cannot be reused ({})." - .format(max_workers, reason)) - executor.shutdown(wait=True, kill_workers=kill_workers) - _executor = executor = _executor_kwargs = None - # Recursive call to build a new instance - return get_reusable_executor(max_workers=max_workers, - **kwargs) - else: - mp.util.debug("Reusing existing executor with max_workers={}." 
- .format(executor._max_workers)) - executor._resize(max_workers) - - return executor - - -class _ReusablePoolExecutor(ProcessPoolExecutor): - def __init__(self, submit_resize_lock, max_workers=None, context=None, - timeout=None, executor_id=0, job_reducers=None, - result_reducers=None, initializer=None, initargs=()): - super(_ReusablePoolExecutor, self).__init__( - max_workers=max_workers, context=context, timeout=timeout, - job_reducers=job_reducers, result_reducers=result_reducers, - initializer=initializer, initargs=initargs) - self.executor_id = executor_id - self._submit_resize_lock = submit_resize_lock - - def submit(self, fn, *args, **kwargs): - with self._submit_resize_lock: - return super(_ReusablePoolExecutor, self).submit( - fn, *args, **kwargs) - - def _resize(self, max_workers): - with self._submit_resize_lock: - if max_workers is None: - raise ValueError("Trying to resize with max_workers=None") - elif max_workers == self._max_workers: - return - - if self._queue_management_thread is None: - # If the queue_management_thread has not been started - # then no processes have been spawned and we can just - # update _max_workers and return - self._max_workers = max_workers - return - - self._wait_job_completion() - - # Some process might have returned due to timeout so check how many - # children are still alive. Use the _process_management_lock to - # ensure that no process are spawned or timeout during the resize. - with self._processes_management_lock: - processes = list(self._processes.values()) - nb_children_alive = sum(p.is_alive() for p in processes) - self._max_workers = max_workers - for _ in range(max_workers, nb_children_alive): - self._call_queue.put(None) - while (len(self._processes) > max_workers - and not self._flags.broken): - time.sleep(1e-3) - - self._adjust_process_count() - processes = list(self._processes.values()) - while not all([p.is_alive() for p in processes]): - time.sleep(1e-3) - - def _wait_job_completion(self): - """Wait for the cache to be empty before resizing the pool.""" - # Issue a warning to the user about the bad effect of this usage. - if len(self._pending_work_items) > 0: - warnings.warn("Trying to resize an executor with running jobs: " - "waiting for jobs completion before resizing.", - UserWarning) - mp.util.debug("Executor {} waiting for jobs completion before" - " resizing".format(self.executor_id)) - # Wait for the completion of the jobs - while len(self._pending_work_items) > 0: - time.sleep(1e-3) - - def _setup_queues(self, job_reducers, result_reducers): - # As this executor can be resized, use a large queue size to avoid - # underestimating capacity and introducing overhead - queue_size = 2 * cpu_count() + EXTRA_QUEUED_CALLS - super(_ReusablePoolExecutor, self)._setup_queues( - job_reducers, result_reducers, queue_size=queue_size) diff --git a/sklearn/externals/joblib/format_stack.py b/sklearn/externals/joblib/format_stack.py deleted file mode 100644 index 949ac7d9575c8..0000000000000 --- a/sklearn/externals/joblib/format_stack.py +++ /dev/null @@ -1,401 +0,0 @@ -""" -Represent an exception with a lot of information. - -Provides 2 useful functions: - -format_exc: format an exception into a complete traceback, with full - debugging instruction. - -format_outer_frames: format the current position in the stack call. - -Adapted from IPython's VerboseTB. 
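A short sketch of the reuse and resize behaviour described above, assuming the standalone loky package: as long as the constructor arguments do not change, the same singleton executor is handed back, and a different max_workers resizes it in place.

    from loky import get_reusable_executor

    def work(x):
        return x + 1

    if __name__ == "__main__":
        first = get_reusable_executor(max_workers=2, timeout=10)
        print(first.submit(work, 1).result())          # 2
        second = get_reusable_executor(max_workers=2, timeout=10)
        print(first is second)                         # True: singleton reused
        resized = get_reusable_executor(max_workers=4, timeout=10)
        print(resized is first)                        # True: resized in place
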
-""" -# Authors: Gael Varoquaux < gael dot varoquaux at normalesup dot org > -# Nathaniel Gray -# Fernando Perez -# Copyright: 2010, Gael Varoquaux -# 2001-2004, Fernando Perez -# 2001 Nathaniel Gray -# License: BSD 3 clause - - -import inspect -import keyword -import linecache -import os -import pydoc -import sys -import time -import tokenize -import traceback - -try: # Python 2 - generate_tokens = tokenize.generate_tokens -except AttributeError: # Python 3 - generate_tokens = tokenize.tokenize - -INDENT = ' ' * 8 - - -############################################################################### -# some internal-use functions -def safe_repr(value): - """Hopefully pretty robust repr equivalent.""" - # this is pretty horrible but should always return *something* - try: - return pydoc.text.repr(value) - except KeyboardInterrupt: - raise - except: - try: - return repr(value) - except KeyboardInterrupt: - raise - except: - try: - # all still in an except block so we catch - # getattr raising - name = getattr(value, '__name__', None) - if name: - # ick, recursion - return safe_repr(name) - klass = getattr(value, '__class__', None) - if klass: - return '%s instance' % safe_repr(klass) - except KeyboardInterrupt: - raise - except: - return 'UNRECOVERABLE REPR FAILURE' - - -def eq_repr(value, repr=safe_repr): - return '=%s' % repr(value) - - -############################################################################### -def uniq_stable(elems): - """uniq_stable(elems) -> list - - Return from an iterable, a list of all the unique elements in the input, - but maintaining the order in which they first appear. - - A naive solution to this problem which just makes a dictionary with the - elements as keys fails to respect the stability condition, since - dictionaries are unsorted by nature. - - Note: All elements in the input must be hashable. - """ - unique = [] - unique_set = set() - for nn in elems: - if nn not in unique_set: - unique.append(nn) - unique_set.add(nn) - return unique - - -############################################################################### -def fix_frame_records_filenames(records): - """Try to fix the filenames in each record from inspect.getinnerframes(). - - Particularly, modules loaded from within zip files have useless filenames - attached to their code object, and inspect.getinnerframes() just uses it. - """ - fixed_records = [] - for frame, filename, line_no, func_name, lines, index in records: - # Look inside the frame's globals dictionary for __file__, which should - # be better. - better_fn = frame.f_globals.get('__file__', None) - if isinstance(better_fn, str): - # Check the type just in case someone did something weird with - # __file__. It might also be None if the error occurred during - # import. 
- filename = better_fn - fixed_records.append((frame, filename, line_no, func_name, lines, - index)) - return fixed_records - - -def _fixed_getframes(etb, context=1, tb_offset=0): - LNUM_POS, LINES_POS, INDEX_POS = 2, 4, 5 - - records = fix_frame_records_filenames(inspect.getinnerframes(etb, context)) - - # If the error is at the console, don't build any context, since it would - # otherwise produce 5 blank lines printed out (there is no file at the - # console) - rec_check = records[tb_offset:] - try: - rname = rec_check[0][1] - if rname == '' or rname.endswith(''): - return rec_check - except IndexError: - pass - - aux = traceback.extract_tb(etb) - assert len(records) == len(aux) - for i, (file, lnum, _, _) in enumerate(aux): - maybe_start = lnum - 1 - context // 2 - start = max(maybe_start, 0) - end = start + context - lines = linecache.getlines(file)[start:end] - buf = list(records[i]) - buf[LNUM_POS] = lnum - buf[INDEX_POS] = lnum - 1 - start - buf[LINES_POS] = lines - records[i] = tuple(buf) - return records[tb_offset:] - - -def _format_traceback_lines(lnum, index, lines, lvals=None): - numbers_width = 7 - res = [] - i = lnum - index - - for line in lines: - if i == lnum: - # This is the line with the error - pad = numbers_width - len(str(i)) - if pad >= 3: - marker = '-' * (pad - 3) + '-> ' - elif pad == 2: - marker = '> ' - elif pad == 1: - marker = '>' - else: - marker = '' - num = marker + str(i) - else: - num = '%*s' % (numbers_width, i) - line = '%s %s' % (num, line) - - res.append(line) - if lvals and i == lnum: - res.append(lvals + '\n') - i = i + 1 - return res - - -def format_records(records): # , print_globals=False): - # Loop over all records printing context and info - frames = [] - abspath = os.path.abspath - for frame, file, lnum, func, lines, index in records: - try: - file = file and abspath(file) or '?' - except OSError: - # if file is '' or something not in the filesystem, - # the abspath call will throw an OSError. Just ignore it and - # keep the original file string. - pass - - if file.endswith('.pyc'): - file = file[:-4] + '.py' - - link = file - - args, varargs, varkw, locals = inspect.getargvalues(frame) - - if func == '?': - call = '' - else: - # Decide whether to include variable details or not - try: - call = 'in %s%s' % (func, inspect.formatargvalues(args, - varargs, varkw, locals, - formatvalue=eq_repr)) - except KeyError: - # Very odd crash from inspect.formatargvalues(). The - # scenario under which it appeared was a call to - # view(array,scale) in NumTut.view.view(), where scale had - # been defined as a scalar (it should be a tuple). Somehow - # inspect messes up resolving the argument list of view() - # and barfs out. At some point I should dig into this one - # and file a bug report about it. - print("\nJoblib's exception reporting continues...\n") - call = 'in %s(***failed resolving arguments***)' % func - - # Initialize a list of names on the current line, which the - # tokenizer below will populate. - names = [] - - def tokeneater(token_type, token, start, end, line): - """Stateful tokeneater which builds dotted names. - - The list of names it appends to (from the enclosing scope) can - contain repeated composite names. This is unavoidable, since - there is no way to disambiguate partial dotted structures until - the full list is known. The caller is responsible for pruning - the final list of duplicates before using it.""" - - # build composite names - if token == '.': - try: - names[-1] += '.' 
- # store state so the next token is added for x.y.z names - tokeneater.name_cont = True - return - except IndexError: - pass - if token_type == tokenize.NAME and token not in keyword.kwlist: - if tokeneater.name_cont: - # Dotted names - names[-1] += token - tokeneater.name_cont = False - else: - # Regular new names. We append everything, the caller - # will be responsible for pruning the list later. It's - # very tricky to try to prune as we go, b/c composite - # names can fool us. The pruning at the end is easy - # to do (or the caller can print a list with repeated - # names if so desired. - names.append(token) - elif token_type == tokenize.NEWLINE: - raise IndexError - # we need to store a bit of state in the tokenizer to build - # dotted names - tokeneater.name_cont = False - - def linereader(file=file, lnum=[lnum], getline=linecache.getline): - line = getline(file, lnum[0]) - lnum[0] += 1 - return line - - # Build the list of names on this line of code where the exception - # occurred. - try: - # This builds the names list in-place by capturing it from the - # enclosing scope. - for token in generate_tokens(linereader): - tokeneater(*token) - except (IndexError, UnicodeDecodeError, SyntaxError): - # signals exit of tokenizer - # SyntaxError can happen when trying to tokenize - # a compiled (e.g. .so or .pyd) extension - pass - except tokenize.TokenError as msg: - _m = ("An unexpected error occurred while tokenizing input file %s\n" - "The following traceback may be corrupted or invalid\n" - "The error message is: %s\n" % (file, msg)) - print(_m) - - # prune names list of duplicates, but keep the right order - unique_names = uniq_stable(names) - - # Start loop over vars - lvals = [] - for name_full in unique_names: - name_base = name_full.split('.', 1)[0] - if name_base in frame.f_code.co_varnames: - if name_base in locals.keys(): - try: - value = safe_repr(eval(name_full, locals)) - except: - value = "undefined" - else: - value = "undefined" - name = name_full - lvals.append('%s = %s' % (name, value)) - #elif print_globals: - # if frame.f_globals.has_key(name_base): - # try: - # value = safe_repr(eval(name_full,frame.f_globals)) - # except: - # value = "undefined" - # else: - # value = "undefined" - # name = 'global %s' % name_full - # lvals.append('%s = %s' % (name,value)) - if lvals: - lvals = '%s%s' % (INDENT, ('\n%s' % INDENT).join(lvals)) - else: - lvals = '' - - level = '%s\n%s %s\n' % (75 * '.', link, call) - - if index is None: - frames.append(level) - else: - frames.append('%s%s' % (level, ''.join( - _format_traceback_lines(lnum, index, lines, lvals)))) - - return frames - - -############################################################################### -def format_exc(etype, evalue, etb, context=5, tb_offset=0): - """ Return a nice text document describing the traceback. 
- - Parameters - ----------- - etype, evalue, etb: as returned by sys.exc_info - context: number of lines of the source file to plot - tb_offset: the number of stack frame not to use (0 = use all) - - """ - # some locals - try: - etype = etype.__name__ - except AttributeError: - pass - - # Header with the exception type, python version, and date - pyver = 'Python ' + sys.version.split()[0] + ': ' + sys.executable - date = time.ctime(time.time()) - pid = 'PID: %i' % os.getpid() - - head = '%s%s%s\n%s%s%s' % ( - etype, ' ' * (75 - len(str(etype)) - len(date)), - date, pid, ' ' * (75 - len(str(pid)) - len(pyver)), - pyver) - - # Drop topmost frames if requested - records = _fixed_getframes(etb, context, tb_offset) - - # Get (safely) a string form of the exception info - try: - etype_str, evalue_str = map(str, (etype, evalue)) - except BaseException: - # User exception is improperly defined. - etype, evalue = str, sys.exc_info()[:2] - etype_str, evalue_str = map(str, (etype, evalue)) - # ... and format it - exception = ['%s: %s' % (etype_str, evalue_str)] - frames = format_records(records) - return '%s\n%s\n%s' % (head, '\n'.join(frames), ''.join(exception[0])) - - -############################################################################### -def format_outer_frames(context=5, stack_start=None, stack_end=None, - ignore_ipython=True): - LNUM_POS, LINES_POS, INDEX_POS = 2, 4, 5 - records = inspect.getouterframes(inspect.currentframe()) - output = list() - - for i, (frame, filename, line_no, func_name, lines, index) \ - in enumerate(records): - # Look inside the frame's globals dictionary for __file__, which should - # be better. - better_fn = frame.f_globals.get('__file__', None) - if isinstance(better_fn, str): - # Check the type just in case someone did something weird with - # __file__. It might also be None if the error occurred during - # import. - filename = better_fn - if filename.endswith('.pyc'): - filename = filename[:-4] + '.py' - if ignore_ipython: - # Hack to avoid printing the internals of IPython - if (os.path.basename(filename) in ('iplib.py', 'py3compat.py') - and func_name in ('execfile', 'safe_execfile', 'runcode')): - break - maybe_start = line_no - 1 - context // 2 - start = max(maybe_start, 0) - end = start + context - lines = linecache.getlines(filename)[start:end] - buf = list(records[i]) - buf[LNUM_POS] = line_no - buf[INDEX_POS] = line_no - 1 - start - buf[LINES_POS] = lines - output.append(tuple(buf)) - return '\n'.join(format_records(output[stack_end:stack_start:-1])) diff --git a/sklearn/externals/joblib/func_inspect.py b/sklearn/externals/joblib/func_inspect.py deleted file mode 100644 index d4d2670943e5c..0000000000000 --- a/sklearn/externals/joblib/func_inspect.py +++ /dev/null @@ -1,361 +0,0 @@ -""" -My own variation on function-specific inspect-like features. -""" - -# Author: Gael Varoquaux -# Copyright (c) 2009 Gael Varoquaux -# License: BSD Style, 3 clauses. - -from itertools import islice -import inspect -import warnings -import re -import os -import collections - -from ._compat import _basestring -from .logger import pformat -from ._memory_helpers import open_py_source -from ._compat import PY3_OR_LATER - -full_argspec_fields = ('args varargs varkw defaults kwonlyargs ' - 'kwonlydefaults annotations') -full_argspec_type = collections.namedtuple('FullArgSpec', full_argspec_fields) - - -def get_func_code(func): - """ Attempts to retrieve a reliable function code hash. 
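``format_exc`` in the file removed above produces joblib's verbose, IPython-style traceback reports. A small sketch of how it can be exercised, assuming the external joblib package keeps the same internal module path (``joblib.format_stack``); the signature is the one shown in the deleted code:

    import sys
    from joblib.format_stack import format_exc   # internal module path (assumed unchanged)

    def fail():
        return 1 / 0

    try:
        fail()
    except ZeroDivisionError:
        etype, evalue, etb = sys.exc_info()
        # Verbose report with 3 lines of source context per frame.
        print(format_exc(etype, evalue, etb, context=3))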
- - The reason we don't use inspect.getsource is that it caches the - source, whereas we want this to be modified on the fly when the - function is modified. - - Returns - ------- - func_code: string - The function code - source_file: string - The path to the file in which the function is defined. - first_line: int - The first line of the code in the source file. - - Notes - ------ - This function does a bit more magic than inspect, and is thus - more robust. - """ - source_file = None - try: - code = func.__code__ - source_file = code.co_filename - if not os.path.exists(source_file): - # Use inspect for lambda functions and functions defined in an - # interactive shell, or in doctests - source_code = ''.join(inspect.getsourcelines(func)[0]) - line_no = 1 - if source_file.startswith('', source_file).groups() - line_no = int(line_no) - source_file = '' % source_file - return source_code, source_file, line_no - # Try to retrieve the source code. - with open_py_source(source_file) as source_file_obj: - first_line = code.co_firstlineno - # All the lines after the function definition: - source_lines = list(islice(source_file_obj, first_line - 1, None)) - return ''.join(inspect.getblock(source_lines)), source_file, first_line - except: - # If the source code fails, we use the hash. This is fragile and - # might change from one session to another. - if hasattr(func, '__code__'): - # Python 3.X - return str(func.__code__.__hash__()), source_file, -1 - else: - # Weird objects like numpy ufunc don't have __code__ - # This is fragile, as quite often the id of the object is - # in the repr, so it might not persist across sessions, - # however it will work for ufuncs. - return repr(func), source_file, -1 - - -def _clean_win_chars(string): - """Windows cannot encode some characters in filename.""" - import urllib - if hasattr(urllib, 'quote'): - quote = urllib.quote - else: - # In Python 3, quote is elsewhere - import urllib.parse - quote = urllib.parse.quote - for char in ('<', '>', '!', ':', '\\'): - string = string.replace(char, quote(char)) - return string - - -def get_func_name(func, resolv_alias=True, win_characters=True): - """ Return the function import path (as a list of module names), and - a name for the function. - - Parameters - ---------- - func: callable - The func to inspect - resolv_alias: boolean, optional - If true, possible local aliases are indicated. - win_characters: boolean, optional - If true, substitute special characters using urllib.quote - This is useful in Windows, as it cannot encode some filenames - """ - if hasattr(func, '__module__'): - module = func.__module__ - else: - try: - module = inspect.getmodule(func) - except TypeError: - if hasattr(func, '__class__'): - module = func.__class__.__module__ - else: - module = 'unknown' - if module is None: - # Happens in doctests, eg - module = '' - if module == '__main__': - try: - filename = os.path.abspath(inspect.getsourcefile(func)) - except: - filename = None - if filename is not None: - # mangling of full path to filename - parts = filename.split(os.sep) - if parts[-1].startswith(' 1500: - formatted_arg = '%s...' % formatted_arg[:700] - return formatted_arg - - -def format_signature(func, *args, **kwargs): - # XXX: Should this use inspect.formatargvalues/formatargspec? 
- module, name = get_func_name(func) - module = [m for m in module if m] - if module: - module.append(name) - module_path = '.'.join(module) - else: - module_path = name - arg_str = list() - previous_length = 0 - for arg in args: - formatted_arg = _format_arg(arg) - if previous_length > 80: - formatted_arg = '\n%s' % formatted_arg - previous_length = len(formatted_arg) - arg_str.append(formatted_arg) - arg_str.extend(['%s=%s' % (v, _format_arg(i)) for v, i in kwargs.items()]) - arg_str = ', '.join(arg_str) - - signature = '%s(%s)' % (name, arg_str) - return module_path, signature - - -def format_call(func, args, kwargs, object_name="Memory"): - """ Returns a nicely formatted statement displaying the function - call with the given arguments. - """ - path, signature = format_signature(func, *args, **kwargs) - msg = '%s\n[%s] Calling %s...\n%s' % (80 * '_', object_name, - path, signature) - return msg - # XXX: Not using logging framework - # self.debug(msg) diff --git a/sklearn/externals/joblib/hashing.py b/sklearn/externals/joblib/hashing.py deleted file mode 100644 index 88bd6cfdefeab..0000000000000 --- a/sklearn/externals/joblib/hashing.py +++ /dev/null @@ -1,263 +0,0 @@ -""" -Fast cryptographic hash of Python objects, with a special case for fast -hashing of numpy arrays. -""" - -# Author: Gael Varoquaux -# Copyright (c) 2009 Gael Varoquaux -# License: BSD Style, 3 clauses. - -import pickle -import hashlib -import sys -import types -import struct -import io -import decimal - -from ._compat import _bytes_or_unicode, PY3_OR_LATER - - -if PY3_OR_LATER: - Pickler = pickle._Pickler -else: - Pickler = pickle.Pickler - - -class _ConsistentSet(object): - """ Class used to ensure the hash of Sets is preserved - whatever the order of its items. - """ - def __init__(self, set_sequence): - # Forces order of elements in set to ensure consistent hash. - try: - # Trying first to order the set assuming the type of elements is - # consistent and orderable. - # This fails on python 3 when elements are unorderable - # but we keep it in a try as it's faster. - self._sequence = sorted(set_sequence) - except (TypeError, decimal.InvalidOperation): - # If elements are unorderable, sorting them using their hash. - # This is slower but works in any case. - self._sequence = sorted((hash(e) for e in set_sequence)) - - -class _MyHash(object): - """ Class used to hash objects that won't normally pickle """ - - def __init__(self, *args): - self.args = args - - -class Hasher(Pickler): - """ A subclass of pickler, to do cryptographic hashing, rather than - pickling. 
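The introspection helpers removed here (``get_func_code``, ``get_func_name``, ``format_signature``) back joblib's cache keys and log messages. A rough sketch of what they return, assuming the same internal module path in the external package; the return values follow the docstrings shown above:

    from joblib.func_inspect import (get_func_code, get_func_name,
                                     format_signature)   # internal helpers

    def add(a, b=2):
        return a + b

    source, source_file, first_line = get_func_code(add)   # code text and its origin
    modules, name = get_func_name(add)                      # module parts and function name
    module_path, signature = format_signature(add, 1, b=3)
    print(name, signature)                                  # add add(1, b=3)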
- """ - - def __init__(self, hash_name='md5'): - self.stream = io.BytesIO() - # By default we want a pickle protocol that only changes with - # the major python version and not the minor one - protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER - else pickle.HIGHEST_PROTOCOL) - Pickler.__init__(self, self.stream, protocol=protocol) - # Initialise the hash obj - self._hash = hashlib.new(hash_name) - - def hash(self, obj, return_digest=True): - try: - self.dump(obj) - except pickle.PicklingError as e: - e.args += ('PicklingError while hashing %r: %r' % (obj, e),) - raise - dumps = self.stream.getvalue() - self._hash.update(dumps) - if return_digest: - return self._hash.hexdigest() - - def save(self, obj): - if isinstance(obj, (types.MethodType, type({}.pop))): - # the Pickler cannot pickle instance methods; here we decompose - # them into components that make them uniquely identifiable - if hasattr(obj, '__func__'): - func_name = obj.__func__.__name__ - else: - func_name = obj.__name__ - inst = obj.__self__ - if type(inst) == type(pickle): - obj = _MyHash(func_name, inst.__name__) - elif inst is None: - # type(None) or type(module) do not pickle - obj = _MyHash(func_name, inst) - else: - cls = obj.__self__.__class__ - obj = _MyHash(func_name, inst, cls) - Pickler.save(self, obj) - - def memoize(self, obj): - # We want hashing to be sensitive to value instead of reference. - # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]] - # to hash to the same value and that's why we disable memoization - # for strings - if isinstance(obj, _bytes_or_unicode): - return - Pickler.memoize(self, obj) - - # The dispatch table of the pickler is not accessible in Python - # 3, as these lines are only bugware for IPython, we skip them. - def save_global(self, obj, name=None, pack=struct.pack): - # We have to override this method in order to deal with objects - # defined interactively in IPython that are not injected in - # __main__ - kwargs = dict(name=name, pack=pack) - if sys.version_info >= (3, 4): - del kwargs['pack'] - try: - Pickler.save_global(self, obj, **kwargs) - except pickle.PicklingError: - Pickler.save_global(self, obj, **kwargs) - module = getattr(obj, "__module__", None) - if module == '__main__': - my_name = name - if my_name is None: - my_name = obj.__name__ - mod = sys.modules[module] - if not hasattr(mod, my_name): - # IPython doesn't inject the variables define - # interactively in __main__ - setattr(mod, my_name, obj) - - dispatch = Pickler.dispatch.copy() - # builtin - dispatch[type(len)] = save_global - # type - dispatch[type(object)] = save_global - # classobj - dispatch[type(Pickler)] = save_global - # function - dispatch[type(pickle.dump)] = save_global - - def _batch_setitems(self, items): - # forces order of keys in dict to ensure consistent hash. - try: - # Trying first to compare dict assuming the type of keys is - # consistent and orderable. - # This fails on python 3 when keys are unorderable - # but we keep it in a try as it's faster. - Pickler._batch_setitems(self, iter(sorted(items))) - except TypeError: - # If keys are unorderable, sorting them using their hash. This is - # slower but works in any case. - Pickler._batch_setitems(self, iter(sorted((hash(k), v) - for k, v in items))) - - def save_set(self, set_items): - # forces order of items in Set to ensure consistent hash - Pickler.save(self, _ConsistentSet(set_items)) - - dispatch[type(set())] = save_set - - -class NumpyHasher(Hasher): - """ Special case the hasher for when numpy is loaded. 
- """ - - def __init__(self, hash_name='md5', coerce_mmap=False): - """ - Parameters - ---------- - hash_name: string - The hash algorithm to be used - coerce_mmap: boolean - Make no difference between np.memmap and np.ndarray - objects. - """ - self.coerce_mmap = coerce_mmap - Hasher.__init__(self, hash_name=hash_name) - # delayed import of numpy, to avoid tight coupling - import numpy as np - self.np = np - if hasattr(np, 'getbuffer'): - self._getbuffer = np.getbuffer - else: - self._getbuffer = memoryview - - def save(self, obj): - """ Subclass the save method, to hash ndarray subclass, rather - than pickling them. Off course, this is a total abuse of - the Pickler class. - """ - if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject: - # Compute a hash of the object - # The update function of the hash requires a c_contiguous buffer. - if obj.shape == (): - # 0d arrays need to be flattened because viewing them as bytes - # raises a ValueError exception. - obj_c_contiguous = obj.flatten() - elif obj.flags.c_contiguous: - obj_c_contiguous = obj - elif obj.flags.f_contiguous: - obj_c_contiguous = obj.T - else: - # Cater for non-single-segment arrays: this creates a - # copy, and thus aleviates this issue. - # XXX: There might be a more efficient way of doing this - obj_c_contiguous = obj.flatten() - - # memoryview is not supported for some dtypes, e.g. datetime64, see - # https://github.com/numpy/numpy/issues/4983. The - # workaround is to view the array as bytes before - # taking the memoryview. - self._hash.update( - self._getbuffer(obj_c_contiguous.view(self.np.uint8))) - - # We store the class, to be able to distinguish between - # Objects with the same binary content, but different - # classes. - if self.coerce_mmap and isinstance(obj, self.np.memmap): - # We don't make the difference between memmap and - # normal ndarrays, to be able to reload previously - # computed results with memmap. - klass = self.np.ndarray - else: - klass = obj.__class__ - # We also return the dtype and the shape, to distinguish - # different views on the same data with different dtypes. - - # The object will be pickled by the pickler hashed at the end. - obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides)) - elif isinstance(obj, self.np.dtype): - # Atomic dtype objects are interned by their default constructor: - # np.dtype('f8') is np.dtype('f8') - # This interning is not maintained by a - # pickle.loads + pickle.dumps cycle, because __reduce__ - # uses copy=True in the dtype constructor. This - # non-deterministic behavior causes the internal memoizer - # of the hasher to generate different hash values - # depending on the history of the dtype object. - # To prevent the hash from being sensitive to this, we use - # .descr which is a full (and never interned) description of - # the array dtype according to the numpy doc. - klass = obj.__class__ - obj = (klass, ('HASHED', obj.descr)) - Hasher.save(self, obj) - - -def hash(obj, hash_name='md5', coerce_mmap=False): - """ Quick calculation of a hash to identify uniquely Python objects - containing numpy arrays. - - - Parameters - ----------- - hash_name: 'md5' or 'sha1' - Hashing algorithm used. sha1 is supposedly safer, but md5 is - faster. 
- coerce_mmap: boolean - Make no difference between np.memmap and np.ndarray - """ - if 'numpy' in sys.modules: - hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) - else: - hasher = Hasher(hash_name=hash_name) - return hasher.hash(obj) diff --git a/sklearn/externals/joblib/logger.py b/sklearn/externals/joblib/logger.py deleted file mode 100644 index f30efef8535d2..0000000000000 --- a/sklearn/externals/joblib/logger.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Helpers for logging. - -This module needs much love to become useful. -""" - -# Author: Gael Varoquaux -# Copyright (c) 2008 Gael Varoquaux -# License: BSD Style, 3 clauses. - -from __future__ import print_function - -import time -import sys -import os -import shutil -import logging -import pprint - -from .disk import mkdirp - - -def _squeeze_time(t): - """Remove .1s to the time under Windows: this is the time it take to - stat files. This is needed to make results similar to timings under - Unix, for tests - """ - if sys.platform.startswith('win'): - return max(0, t - .1) - else: - return t - - -def format_time(t): - t = _squeeze_time(t) - return "%.1fs, %.1fmin" % (t, t / 60.) - - -def short_format_time(t): - t = _squeeze_time(t) - if t > 60: - return "%4.1fmin" % (t / 60.) - else: - return " %5.1fs" % (t) - - -def pformat(obj, indent=0, depth=3): - if 'numpy' in sys.modules: - import numpy as np - print_options = np.get_printoptions() - np.set_printoptions(precision=6, threshold=64, edgeitems=1) - else: - print_options = None - out = pprint.pformat(obj, depth=depth, indent=indent) - if print_options: - np.set_printoptions(**print_options) - return out - - -############################################################################### -# class `Logger` -############################################################################### -class Logger(object): - """ Base class for logging messages. - """ - - def __init__(self, depth=3): - """ - Parameters - ---------- - depth: int, optional - The depth of objects printed. - """ - self.depth = depth - - def warn(self, msg): - logging.warning("[%s]: %s" % (self, msg)) - - def debug(self, msg): - # XXX: This conflicts with the debug flag used in children class - logging.debug("[%s]: %s" % (self, msg)) - - def format(self, obj, indent=0): - """Return the formatted representation of the object.""" - return pformat(obj, indent=indent, depth=self.depth) - - -############################################################################### -# class `PrintTime` -############################################################################### -class PrintTime(object): - """ Print and log messages while keeping track of time. - """ - - def __init__(self, logfile=None, logdir=None): - if logfile is not None and logdir is not None: - raise ValueError('Cannot specify both logfile and logdir') - # XXX: Need argument docstring - self.last_time = time.time() - self.start_time = self.last_time - if logdir is not None: - logfile = os.path.join(logdir, 'joblib.log') - self.logfile = logfile - if logfile is not None: - mkdirp(os.path.dirname(logfile)) - if os.path.exists(logfile): - # Rotate the logs - for i in range(1, 9): - try: - shutil.move(logfile + '.%i' % i, - logfile + '.%i' % (i + 1)) - except: - "No reason failing here" - # Use a copy rather than a move, so that a process - # monitoring this file does not get lost. 
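The module deleted above backs ``joblib.hash``, which is part of joblib's public API and therefore remains available after un-vendoring. A short sketch, with the ``hash_name`` and ``coerce_mmap`` arguments taken from the signature shown above:

    import numpy as np
    import joblib

    a = np.arange(5)
    b = np.arange(5)
    print(joblib.hash(a) == joblib.hash(b))      # True: hashes content, not identity
    print(joblib.hash(a, hash_name='sha1'))      # sha1 digest instead of the default md5
    print(joblib.hash(a, coerce_mmap=True))      # np.memmap hashed like a plain ndarray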
- try: - shutil.copy(logfile, logfile + '.1') - except: - "No reason failing here" - try: - with open(logfile, 'w') as logfile: - logfile.write('\nLogging joblib python script\n') - logfile.write('\n---%s---\n' % time.ctime(self.last_time)) - except: - """ Multiprocessing writing to files can create race - conditions. Rather fail silently than crash the - computation. - """ - # XXX: We actually need a debug flag to disable this - # silent failure. - - def __call__(self, msg='', total=False): - """ Print the time elapsed between the last call and the current - call, with an optional message. - """ - if not total: - time_lapse = time.time() - self.last_time - full_msg = "%s: %s" % (msg, format_time(time_lapse)) - else: - # FIXME: Too much logic duplicated - time_lapse = time.time() - self.start_time - full_msg = "%s: %.2fs, %.1f min" % (msg, time_lapse, - time_lapse / 60) - print(full_msg, file=sys.stderr) - if self.logfile is not None: - try: - with open(self.logfile, 'a') as f: - print(full_msg, file=f) - except: - """ Multiprocessing writing to files can create race - conditions. Rather fail silently than crash the - calculation. - """ - # XXX: We actually need a debug flag to disable this - # silent failure. - self.last_time = time.time() diff --git a/sklearn/externals/joblib/memory.py b/sklearn/externals/joblib/memory.py deleted file mode 100644 index f744aaae2196a..0000000000000 --- a/sklearn/externals/joblib/memory.py +++ /dev/null @@ -1,997 +0,0 @@ -""" -A context object for caching a function's return value each time it -is called with the same input arguments. - -""" - -# Author: Gael Varoquaux -# Copyright (c) 2009 Gael Varoquaux -# License: BSD Style, 3 clauses. - - -from __future__ import with_statement -import os -import time -import pydoc -import re -import functools -import traceback -import warnings -import inspect -import weakref - -# Local imports -from . import hashing -from .func_inspect import get_func_code, get_func_name, filter_args -from .func_inspect import format_call -from .func_inspect import format_signature -from ._memory_helpers import open_py_source -from .logger import Logger, format_time, pformat -from ._compat import _basestring, PY3_OR_LATER -from ._store_backends import StoreBackendBase, FileSystemStoreBackend - - -FIRST_LINE_TEXT = "# first line:" - -# TODO: The following object should have a data store object as a sub -# object, and the interface to persist and query should be separated in -# the data store. -# -# This would enable creating 'Memory' objects with a different logic for -# pickling that would simply span a MemorizedFunc with the same -# store (or do we want to copy it to avoid cross-talks?), for instance to -# implement HDF5 pickling. - -# TODO: Same remark for the logger, and probably use the Python logging -# mechanism. - - -def extract_first_line(func_code): - """ Extract the first line information from the function code - text if available. - """ - if func_code.startswith(FIRST_LINE_TEXT): - func_code = func_code.split('\n') - first_line = int(func_code[0][len(FIRST_LINE_TEXT):]) - func_code = '\n'.join(func_code[1:]) - else: - first_line = -1 - return func_code, first_line - - -class JobLibCollisionWarning(UserWarning): - """ Warn that there might be a collision between names of functions. - """ - - -_STORE_BACKENDS = {'local': FileSystemStoreBackend} - - -def register_store_backend(backend_name, backend): - """Extend available store backends. 
- - The Memory, MemorizeResult and MemorizeFunc objects are designed to be - agnostic to the type of store used behind. By default, the local file - system is used but this function gives the possibility to extend joblib's - memory pattern with other types of storage such as cloud storage (S3, GCS, - OpenStack, HadoopFS, etc) or blob DBs. - - Parameters - ---------- - backend_name: str - The name identifying the store backend being registered. For example, - 'local' is used with FileSystemStoreBackend. - backend: StoreBackendBase subclass - The name of a class that implements the StoreBackendBase interface. - - """ - if not isinstance(backend_name, _basestring): - raise ValueError("Store backend name should be a string, " - "'{0}' given.".format(backend_name)) - if backend is None or not issubclass(backend, StoreBackendBase): - raise ValueError("Store backend should inherit " - "StoreBackendBase, " - "'{0}' given.".format(backend)) - - _STORE_BACKENDS[backend_name] = backend - - -def _store_backend_factory(backend, location, verbose=0, backend_options=None): - """Return the correct store object for the given location.""" - if backend_options is None: - backend_options = {} - - if isinstance(location, StoreBackendBase): - return location - elif isinstance(location, _basestring): - obj = None - location = os.path.expanduser(location) - # The location is not a local file system, we look in the - # registered backends if there's one matching the given backend - # name. - for backend_key, backend_obj in _STORE_BACKENDS.items(): - if backend == backend_key: - obj = backend_obj() - - # By default, we assume the FileSystemStoreBackend can be used if no - # matching backend could be found. - if obj is None: - raise TypeError('Unknown location {0} or backend {1}'.format( - location, backend)) - - # The store backend is configured with the extra named parameters, - # some of them are specific to the underlying store backend. - obj.configure(location, verbose=verbose, - backend_options=backend_options) - return obj - - return None - - -def _get_func_fullname(func): - """Compute the part of part associated with a function.""" - modules, funcname = get_func_name(func) - modules.append(funcname) - return os.path.join(*modules) - - -def _build_func_identifier(func): - """Build a roughly unique identifier for the cached function.""" - parts = [] - if isinstance(func, _basestring): - parts.append(func) - else: - parts.append(_get_func_fullname(func)) - - # We reuse historical fs-like way of building a function identifier - return os.path.join(*parts) - - -def _format_load_msg(func_id, args_id, timestamp=None, metadata=None): - """ Helper function to format the message when loading the results. 
- """ - signature = "" - try: - if metadata is not None: - args = ", ".join(['%s=%s' % (name, value) - for name, value - in metadata['input_args'].items()]) - signature = "%s(%s)" % (os.path.basename(func_id), args) - else: - signature = os.path.basename(func_id) - except KeyError: - pass - - if timestamp is not None: - ts_string = "{0: <16}".format(format_time(time.time() - timestamp)) - else: - ts_string = "" - return '[Memory]{0}: Loading {1}'.format(ts_string, str(signature)) - - -# An in-memory store to avoid looking at the disk-based function -# source code to check if a function definition has changed -_FUNCTION_HASHES = weakref.WeakKeyDictionary() - - -############################################################################### -# class `MemorizedResult` -############################################################################### -class MemorizedResult(Logger): - """Object representing a cached value. - - Attributes - ---------- - location: str - The location of joblib cache. Depends on the store backend used. - - func: function or str - function whose output is cached. The string case is intended only for - instanciation based on the output of repr() on another instance. - (namely eval(repr(memorized_instance)) works). - - argument_hash: str - hash of the function arguments. - - backend: str - Type of store backend for reading/writing cache files. - Default is 'local'. - - mmap_mode: {None, 'r+', 'r', 'w+', 'c'} - The memmapping mode used when loading from cache numpy arrays. See - numpy.load for the meaning of the different values. - - verbose: int - verbosity level (0 means no message). - - timestamp, metadata: string - for internal use only. - """ - def __init__(self, location, func, args_id, backend='local', - mmap_mode=None, verbose=0, timestamp=None, metadata=None): - Logger.__init__(self) - self.func_id = _build_func_identifier(func) - if isinstance(func, _basestring): - self.func = func - else: - self.func = self.func_id - self.args_id = args_id - self.store_backend = _store_backend_factory(backend, location, - verbose=verbose) - self.mmap_mode = mmap_mode - - if metadata is not None: - self.metadata = metadata - else: - self.metadata = self.store_backend.get_metadata( - [self.func_id, self.args_id]) - - self.duration = self.metadata.get('duration', None) - self.verbose = verbose - self.timestamp = timestamp - - @property - def argument_hash(self): - warnings.warn( - "The 'argument_hash' attribute has been deprecated in version " - "0.12 and will be removed in version 0.14.\n" - "Use `args_id` attribute instead.", - DeprecationWarning, stacklevel=2) - return self.args_id - - def get(self): - """Read value from cache and return it.""" - if self.verbose: - msg = _format_load_msg(self.func_id, self.args_id, - timestamp=self.timestamp, - metadata=self.metadata) - else: - msg = None - - try: - return self.store_backend.load_item( - [self.func_id, self.args_id], msg=msg, verbose=self.verbose) - except (ValueError, KeyError) as exc: - # KeyError is expected under Python 2.7, ValueError under Python 3 - new_exc = KeyError( - "Error while trying to load a MemorizedResult's value. 
" - "It seems that this folder is corrupted : {}".format( - os.path.join( - self.store_backend.location, self.func_id, - self.args_id) - )) - new_exc.__cause__ = exc - raise new_exc - - def clear(self): - """Clear value from cache""" - self.store_backend.clear_item([self.func_id, self.args_id]) - - def __repr__(self): - return ('{class_name}(location="{location}", func="{func}", ' - 'args_id="{args_id}")' - .format(class_name=self.__class__.__name__, - location=self.store_backend.location, - func=self.func, - args_id=self.args_id - )) - - def __getstate__(self): - state = self.__dict__.copy() - state['timestamp'] = None - return state - - -class NotMemorizedResult(object): - """Class representing an arbitrary value. - - This class is a replacement for MemorizedResult when there is no cache. - """ - __slots__ = ('value', 'valid') - - def __init__(self, value): - self.value = value - self.valid = True - - def get(self): - if self.valid: - return self.value - else: - raise KeyError("No value stored.") - - def clear(self): - self.valid = False - self.value = None - - def __repr__(self): - if self.valid: - return ('{class_name}({value})' - .format(class_name=self.__class__.__name__, - value=pformat(self.value))) - else: - return self.__class__.__name__ + ' with no value' - - # __getstate__ and __setstate__ are required because of __slots__ - def __getstate__(self): - return {"valid": self.valid, "value": self.value} - - def __setstate__(self, state): - self.valid = state["valid"] - self.value = state["value"] - - -############################################################################### -# class `NotMemorizedFunc` -############################################################################### -class NotMemorizedFunc(object): - """No-op object decorating a function. - - This class replaces MemorizedFunc when there is no cache. It provides an - identical API but does not write anything on disk. - - Attributes - ---------- - func: callable - Original undecorated function. - """ - # Should be a light as possible (for speed) - def __init__(self, func): - self.func = func - - def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) - - def call_and_shelve(self, *args, **kwargs): - return NotMemorizedResult(self.func(*args, **kwargs)) - - def __repr__(self): - return '{0}(func={1})'.format(self.__class__.__name__, self.func) - - def clear(self, warn=True): - # Argument "warn" is for compatibility with MemorizedFunc.clear - pass - - -############################################################################### -# class `MemorizedFunc` -############################################################################### -class MemorizedFunc(Logger): - """Callable object decorating a function for caching its return value - each time it is called. - - Methods are provided to inspect the cache or clean it. - - Attributes - ---------- - func: callable - The original, undecorated, function. - - location: string - The location of joblib cache. Depends on the store backend used. - - backend: str - Type of store backend for reading/writing cache files. - Default is 'local', in which case the location is the path to a - disk storage. - - ignore: list or None - List of variable names to ignore when choosing whether to - recompute. - - mmap_mode: {None, 'r+', 'r', 'w+', 'c'} - The memmapping mode used when loading from cache - numpy arrays. See numpy.load for the meaning of the different - values. - - compress: boolean, or integer - Whether to zip the stored data on disk. 
If an integer is - given, it should be between 1 and 9, and sets the amount - of compression. Note that compressed arrays cannot be - read by memmapping. - - verbose: int, optional - The verbosity flag, controls messages that are issued as - the function is evaluated. - """ - # ------------------------------------------------------------------------ - # Public interface - # ------------------------------------------------------------------------ - - def __init__(self, func, location, backend='local', ignore=None, - mmap_mode=None, compress=False, verbose=1, timestamp=None): - Logger.__init__(self) - self.mmap_mode = mmap_mode - self.compress = compress - self.func = func - - if ignore is None: - ignore = [] - self.ignore = ignore - self._verbose = verbose - - # retrieve store object from backend type and location. - self.store_backend = _store_backend_factory(backend, location, - verbose=verbose, - backend_options=dict( - compress=compress, - mmap_mode=mmap_mode), - ) - if self.store_backend is not None: - # Create func directory on demand. - self.store_backend.\ - store_cached_func_code([_build_func_identifier(self.func)]) - - if timestamp is None: - timestamp = time.time() - self.timestamp = timestamp - try: - functools.update_wrapper(self, func) - except: - " Objects like ufunc don't like that " - if inspect.isfunction(func): - doc = pydoc.TextDoc().document(func) - # Remove blank line - doc = doc.replace('\n', '\n\n', 1) - # Strip backspace-overprints for compatibility with autodoc - doc = re.sub('\x08.', '', doc) - else: - # Pydoc does a poor job on other objects - doc = func.__doc__ - self.__doc__ = 'Memoized version of %s' % doc - - def _cached_call(self, args, kwargs, shelving=False): - """Call wrapped function and cache result, or read cache if available. - - This function returns the wrapped function output and some metadata. - - Arguments: - ---------- - - args, kwargs: list and dict - input arguments for wrapped function - - shelving: bool - True when called via the call_and_shelve function. - - - Returns - ------- - output: value or tuple or None - Output of the wrapped function. - If shelving is True and the call has been already cached, - output is None. - - argument_hash: string - Hash of function arguments. - - metadata: dict - Some metadata about wrapped function call (see _persist_input()). - """ - func_id, args_id = self._get_output_identifiers(*args, **kwargs) - metadata = None - msg = None - - # Wether or not the memorized function must be called - must_call = False - - # FIXME: The statements below should be try/excepted - # Compare the function code with the previous to see if the - # function code has changed - if not (self._check_previous_func_code(stacklevel=4) and - self.store_backend.contains_item([func_id, args_id])): - if self._verbose > 10: - _, name = get_func_name(self.func) - self.warn('Computing func {0}, argument hash {1} ' - 'in location {2}' - .format(name, args_id, - self.store_backend. 
- get_cached_func_info([func_id])['location'])) - must_call = True - else: - try: - t0 = time.time() - if self._verbose: - msg = _format_load_msg(func_id, args_id, - timestamp=self.timestamp, - metadata=metadata) - - if not shelving: - # When shelving, we do not need to load the output - out = self.store_backend.load_item( - [func_id, args_id], - msg=msg, - verbose=self._verbose) - else: - out = None - - if self._verbose > 4: - t = time.time() - t0 - _, name = get_func_name(self.func) - msg = '%s cache loaded - %s' % (name, format_time(t)) - print(max(0, (80 - len(msg))) * '_' + msg) - except Exception: - # XXX: Should use an exception logger - _, signature = format_signature(self.func, *args, **kwargs) - self.warn('Exception while loading results for ' - '{}\n {}'.format(signature, traceback.format_exc())) - - must_call = True - - if must_call: - out, metadata = self.call(*args, **kwargs) - if self.mmap_mode is not None: - # Memmap the output at the first call to be consistent with - # later calls - if self._verbose: - msg = _format_load_msg(func_id, args_id, - timestamp=self.timestamp, - metadata=metadata) - out = self.store_backend.load_item([func_id, args_id], msg=msg, - verbose=self._verbose) - - return (out, args_id, metadata) - - def call_and_shelve(self, *args, **kwargs): - """Call wrapped function, cache result and return a reference. - - This method returns a reference to the cached result instead of the - result itself. The reference object is small and pickeable, allowing - to send or store it easily. Call .get() on reference object to get - result. - - Returns - ------- - cached_result: MemorizedResult or NotMemorizedResult - reference to the value returned by the wrapped function. The - class "NotMemorizedResult" is used when there is no cache - activated (e.g. location=None in Memory). - """ - _, args_id, metadata = self._cached_call(args, kwargs, shelving=True) - return MemorizedResult(self.store_backend, self.func, args_id, - metadata=metadata, verbose=self._verbose - 1, - timestamp=self.timestamp) - - def __call__(self, *args, **kwargs): - return self._cached_call(args, kwargs)[0] - - def __getstate__(self): - """ We don't store the timestamp when pickling, to avoid the hash - depending from it. - """ - state = self.__dict__.copy() - state['timestamp'] = None - return state - - # ------------------------------------------------------------------------ - # Private interface - # ------------------------------------------------------------------------ - - def _get_argument_hash(self, *args, **kwargs): - return hashing.hash(filter_args(self.func, self.ignore, args, kwargs), - coerce_mmap=(self.mmap_mode is not None)) - - def _get_output_identifiers(self, *args, **kwargs): - """Return the func identifier and input parameter hash of a result.""" - func_id = _build_func_identifier(self.func) - argument_hash = self._get_argument_hash(*args, **kwargs) - return func_id, argument_hash - - def _hash_func(self): - """Hash a function to key the online cache""" - func_code_h = hash(getattr(self.func, '__code__', None)) - return id(self.func), hash(self.func), func_code_h - - def _write_func_code(self, func_code, first_line): - """ Write the function code and the filename to a file. - """ - # We store the first line because the filename and the function - # name is not always enough to identify a function: people - # sometimes have several functions named the same way in a - # file. This is bad practice, but joblib should be robust to bad - # practice. 
- func_id = _build_func_identifier(self.func) - func_code = u'%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code) - self.store_backend.store_cached_func_code([func_id], func_code) - - # Also store in the in-memory store of function hashes - is_named_callable = False - if PY3_OR_LATER: - is_named_callable = (hasattr(self.func, '__name__') and - self.func.__name__ != '') - else: - is_named_callable = (hasattr(self.func, 'func_name') and - self.func.func_name != '') - if is_named_callable: - # Don't do this for lambda functions or strange callable - # objects, as it ends up being too fragile - func_hash = self._hash_func() - try: - _FUNCTION_HASHES[self.func] = func_hash - except TypeError: - # Some callable are not hashable - pass - - def _check_previous_func_code(self, stacklevel=2): - """ - stacklevel is the depth a which this function is called, to - issue useful warnings to the user. - """ - # First check if our function is in the in-memory store. - # Using the in-memory store not only makes things faster, but it - # also renders us robust to variations of the files when the - # in-memory version of the code does not vary - try: - if self.func in _FUNCTION_HASHES: - # We use as an identifier the id of the function and its - # hash. This is more likely to falsely change than have hash - # collisions, thus we are on the safe side. - func_hash = self._hash_func() - if func_hash == _FUNCTION_HASHES[self.func]: - return True - except TypeError: - # Some callables are not hashable - pass - - # Here, we go through some effort to be robust to dynamically - # changing code and collision. We cannot inspect.getsource - # because it is not reliable when using IPython's magic "%run". - func_code, source_file, first_line = get_func_code(self.func) - func_id = _build_func_identifier(self.func) - - try: - old_func_code, old_first_line =\ - extract_first_line( - self.store_backend.get_cached_func_code([func_id])) - except (IOError, OSError): # some backend can also raise OSError - self._write_func_code(func_code, first_line) - return False - if old_func_code == func_code: - return True - - # We have differing code, is this because we are referring to - # different functions, or because the function we are referring to has - # changed? - - _, func_name = get_func_name(self.func, resolv_alias=False, - win_characters=False) - if old_first_line == first_line == -1 or func_name == '': - if not first_line == -1: - func_description = ("{0} ({1}:{2})" - .format(func_name, source_file, - first_line)) - else: - func_description = func_name - warnings.warn(JobLibCollisionWarning( - "Cannot detect name collisions for function '{0}'" - .format(func_description)), stacklevel=stacklevel) - - # Fetch the code at the old location and compare it. If it is the - # same than the code store, we have a collision: the code in the - # file has not changed, but the name we have is pointing to a new - # code block. 
- if not old_first_line == first_line and source_file is not None: - possible_collision = False - if os.path.exists(source_file): - _, func_name = get_func_name(self.func, resolv_alias=False) - num_lines = len(func_code.split('\n')) - with open_py_source(source_file) as f: - on_disk_func_code = f.readlines()[ - old_first_line - 1:old_first_line - 1 + num_lines - 1] - on_disk_func_code = ''.join(on_disk_func_code) - possible_collision = (on_disk_func_code.rstrip() == - old_func_code.rstrip()) - else: - possible_collision = source_file.startswith(' 10: - _, func_name = get_func_name(self.func, resolv_alias=False) - self.warn("Function {0} (identified by {1}) has changed" - ".".format(func_name, func_id)) - self.clear(warn=True) - return False - - def clear(self, warn=True): - """Empty the function's cache.""" - func_id = _build_func_identifier(self.func) - - if self._verbose > 0 and warn: - self.warn("Clearing function cache identified by %s" % func_id) - self.store_backend.clear_path([func_id, ]) - - func_code, _, first_line = get_func_code(self.func) - self._write_func_code(func_code, first_line) - - def call(self, *args, **kwargs): - """ Force the execution of the function with the given arguments and - persist the output values. - """ - start_time = time.time() - func_id, args_id = self._get_output_identifiers(*args, **kwargs) - if self._verbose > 0: - print(format_call(self.func, args, kwargs)) - output = self.func(*args, **kwargs) - self.store_backend.dump_item( - [func_id, args_id], output, verbose=self._verbose) - - duration = time.time() - start_time - metadata = self._persist_input(duration, args, kwargs) - - if self._verbose > 0: - _, name = get_func_name(self.func) - msg = '%s - %s' % (name, format_time(duration)) - print(max(0, (80 - len(msg))) * '_' + msg) - return output, metadata - - def _persist_input(self, duration, args, kwargs, this_duration_limit=0.5): - """ Save a small summary of the call using json format in the - output directory. - - output_dir: string - directory where to write metadata. - - duration: float - time taken by hashing input arguments, calling the wrapped - function and persisting its output. - - args, kwargs: list and dict - input arguments for wrapped function - - this_duration_limit: float - Max execution time for this function before issuing a warning. - """ - start_time = time.time() - argument_dict = filter_args(self.func, self.ignore, - args, kwargs) - - input_repr = dict((k, repr(v)) for k, v in argument_dict.items()) - # This can fail due to race-conditions with multiple - # concurrent joblibs removing the file or the directory - metadata = {"duration": duration, "input_args": input_repr} - - func_id, args_id = self._get_output_identifiers(*args, **kwargs) - self.store_backend.store_metadata([func_id, args_id], metadata) - - this_duration = time.time() - start_time - if this_duration > this_duration_limit: - # This persistence should be fast. It will not be if repr() takes - # time and its output is large, because json.dump will have to - # write a large file. This should not be an issue with numpy arrays - # for which repr() always output a short representation, but can - # be with complex dictionaries. Fixing the problem should be a - # matter of replacing repr() above by something smarter. - warnings.warn("Persisting input arguments took %.2fs to run.\n" - "If this happens often in your code, it can cause " - "performance problems \n" - "(results will be correct in all cases). 
\n" - "The reason for this is probably some large input " - "arguments for a wrapped\n" - " function (e.g. large strings).\n" - "THIS IS A JOBLIB ISSUE. If you can, kindly provide " - "the joblib's team with an\n" - " example so that they can fix the problem." - % this_duration, stacklevel=5) - return metadata - - # XXX: Need a method to check if results are available. - - # ------------------------------------------------------------------------ - # Private `object` interface - # ------------------------------------------------------------------------ - - def __repr__(self): - return '{class_name}(func={func}, location={location})'.format( - class_name=self.__class__.__name__, - func=self.func, - location=self.store_backend.location,) - - -############################################################################### -# class `Memory` -############################################################################### -class Memory(Logger): - """ A context object for caching a function's return value each time it - is called with the same input arguments. - - All values are cached on the filesystem, in a deep directory - structure. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - location: str or None - The path of the base directory to use as a data store - or None. If None is given, no caching is done and - the Memory object is completely transparent. This option - replaces cachedir since version 0.12. - - backend: str, optional - Type of store backend for reading/writing cache files. - Default: 'local'. - The 'local' backend is using regular filesystem operations to - manipulate data (open, mv, etc) in the backend. - - cachedir: str or None, optional - - .. deprecated: 0.12 - 'cachedir' has been deprecated in 0.12 and will be - removed in 0.14. Use the 'location' parameter instead. - - mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional - The memmapping mode used when loading from cache - numpy arrays. See numpy.load for the meaning of the - arguments. - - compress: boolean, or integer, optional - Whether to zip the stored data on disk. If an integer is - given, it should be between 1 and 9, and sets the amount - of compression. Note that compressed arrays cannot be - read by memmapping. - - verbose: int, optional - Verbosity flag, controls the debug messages that are issued - as functions are evaluated. - - bytes_limit: int, optional - Limit in bytes of the size of the cache. - - backend_options: dict, optional - Contains a dictionnary of named parameters used to configure - the store backend. - """ - # ------------------------------------------------------------------------ - # Public interface - # ------------------------------------------------------------------------ - - def __init__(self, location=None, backend='local', cachedir=None, - mmap_mode=None, compress=False, verbose=1, bytes_limit=None, - backend_options=None): - # XXX: Bad explanation of the None value of cachedir - Logger.__init__(self) - self._verbose = verbose - self.mmap_mode = mmap_mode - self.timestamp = time.time() - self.bytes_limit = bytes_limit - self.backend = backend - self.compress = compress - if backend_options is None: - backend_options = {} - self.backend_options = backend_options - - if compress and mmap_mode is not None: - warnings.warn('Compressed results cannot be memmapped', - stacklevel=2) - if cachedir is not None: - if location is not None: - raise ValueError( - 'You set both "location={0!r} and "cachedir={1!r}". 
' - "'cachedir' has been deprecated in version " - "0.12 and will be removed in version 0.14.\n" - 'Please only set "location={0!r}"'.format( - location, cachedir)) - - warnings.warn( - "The 'cachedir' parameter has been deprecated in version " - "0.12 and will be removed in version 0.14.\n" - 'You provided "cachedir={0!r}", ' - 'use "location={0!r}" instead.'.format(cachedir), - DeprecationWarning, stacklevel=2) - location = cachedir - - self.location = location - if isinstance(location, _basestring): - location = os.path.join(location, 'joblib') - - self.store_backend = _store_backend_factory( - backend, location, verbose=self._verbose, - backend_options=dict(compress=compress, mmap_mode=mmap_mode, - **backend_options)) - - @property - def cachedir(self): - warnings.warn( - "The 'cachedir' attribute has been deprecated in version 0.12 " - "and will be removed in version 0.14.\n" - "Use os.path.join(memory.location, 'joblib') attribute instead.", - DeprecationWarning, stacklevel=2) - if self.location is None: - return None - return os.path.join(self.location, 'joblib') - - def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False): - """ Decorates the given function func to only compute its return - value for input arguments not cached on disk. - - Parameters - ---------- - func: callable, optional - The function to be decorated - ignore: list of strings - A list of arguments name to ignore in the hashing - verbose: integer, optional - The verbosity mode of the function. By default that - of the memory object is used. - mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional - The memmapping mode used when loading from cache - numpy arrays. See numpy.load for the meaning of the - arguments. By default that of the memory object is used. - - Returns - ------- - decorated_func: MemorizedFunc object - The returned object is a MemorizedFunc object, that is - callable (behaves like a function), but offers extra - methods for cache lookup and management. See the - documentation for :class:`joblib.memory.MemorizedFunc`. - """ - if func is None: - # Partial application, to be able to specify extra keyword - # arguments in decorators - return functools.partial(self.cache, ignore=ignore, - verbose=verbose, mmap_mode=mmap_mode) - if self.store_backend is None: - return NotMemorizedFunc(func) - if verbose is None: - verbose = self._verbose - if mmap_mode is False: - mmap_mode = self.mmap_mode - if isinstance(func, MemorizedFunc): - func = func.func - return MemorizedFunc(func, location=self.store_backend, - backend=self.backend, - ignore=ignore, mmap_mode=mmap_mode, - compress=self.compress, - verbose=verbose, timestamp=self.timestamp) - - def clear(self, warn=True): - """ Erase the complete cache directory. - """ - if warn: - self.warn('Flushing completely the cache') - if self.store_backend is not None: - self.store_backend.clear() - - def reduce_size(self): - """Remove cache elements to make cache size fit in ``bytes_limit``.""" - if self.bytes_limit is not None and self.store_backend is not None: - self.store_backend.reduce_store_size(self.bytes_limit) - - def eval(self, func, *args, **kwargs): - """ Eval function func with arguments `*args` and `**kwargs`, - in the context of the memory. - - This method works similarly to the builtin `apply`, except - that the function is called only if the cache is not - up to date. 
- - """ - if self.store_backend is None: - return func(*args, **kwargs) - return self.cache(func)(*args, **kwargs) - - # ------------------------------------------------------------------------ - # Private `object` interface - # ------------------------------------------------------------------------ - - def __repr__(self): - return '{class_name}(location={location})'.format( - class_name=self.__class__.__name__, - location=(None if self.store_backend is None - else self.store_backend.location)) - - def __getstate__(self): - """ We don't store the timestamp when pickling, to avoid the hash - depending from it. - """ - state = self.__dict__.copy() - state['timestamp'] = None - return state diff --git a/sklearn/externals/joblib/my_exceptions.py b/sklearn/externals/joblib/my_exceptions.py deleted file mode 100644 index 765885e33c178..0000000000000 --- a/sklearn/externals/joblib/my_exceptions.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -Exceptions -""" -# Author: Gael Varoquaux < gael dot varoquaux at normalesup dot org > -# Copyright: 2010, Gael Varoquaux -# License: BSD 3 clause -from ._compat import PY3_OR_LATER - - -class JoblibException(Exception): - """A simple exception with an error message that you can get to.""" - def __init__(self, *args): - # We need to implement __init__ so that it is picked in the - # multiple heritance hierarchy in the class created in - # _mk_exception. Note: in Python 2, if you implement __init__ - # in your exception class you need to set .args correctly, - # otherwise you can dump an exception instance with pickle but - # not load it (at load time an empty .args will be passed to - # the constructor). Also we want to be explicit and not use - # 'super' here. Using 'super' can cause a sibling class method - # to be called and we have no control the sibling class method - # constructor signature in the exception returned by - # _mk_exception. - Exception.__init__(self, *args) - - def __repr__(self): - if hasattr(self, 'args') and len(self.args) > 0: - message = self.args[0] - else: - message = '' - - name = self.__class__.__name__ - return '%s\n%s\n%s\n%s' % (name, 75 * '_', message, 75 * '_') - - __str__ = __repr__ - - -class TransportableException(JoblibException): - """An exception containing all the info to wrap an original - exception and recreate it. - """ - - def __init__(self, message, etype): - # The next line set the .args correctly. This is needed to - # make the exception loadable with pickle - JoblibException.__init__(self, message, etype) - self.message = message - self.etype = etype - - def unwrap(self, context_message=""): - report = """\ -%s ---------------------------------------------------------------------------- -Joblib worker traceback: ---------------------------------------------------------------------------- -%s""" % (context_message, self.message) - # Unwrap the exception to a JoblibException - exception_type = _mk_exception(self.etype)[0] - return exception_type(report) - - -class WorkerInterrupt(Exception): - """ An exception that is not KeyboardInterrupt to allow subprocesses - to be interrupted. 
- """ - pass - - -_exception_mapping = dict() - - -def _mk_exception(exception, name=None): - if issubclass(exception, JoblibException): - # No need to wrap recursively JoblibException - return exception, exception.__name__ - - # Create an exception inheriting from both JoblibException - # and that exception - if name is None: - name = exception.__name__ - this_name = 'Joblib%s' % name - if this_name in _exception_mapping: - # Avoid creating twice the same exception - this_exception = _exception_mapping[this_name] - else: - if exception is Exception: - # JoblibException is already a subclass of Exception. No - # need to use multiple inheritance - return JoblibException, this_name - try: - this_exception = type( - this_name, (JoblibException, exception), {}) - _exception_mapping[this_name] = this_exception - except TypeError: - # This happens if "Cannot create a consistent method - # resolution order", e.g. because 'exception' is a - # subclass of JoblibException or 'exception' is not an - # acceptable base class - this_exception = JoblibException - - return this_exception, this_name - - -def _mk_common_exceptions(): - namespace = dict() - if PY3_OR_LATER: - import builtins as _builtin_exceptions - common_exceptions = filter( - lambda x: x.endswith('Error'), - dir(_builtin_exceptions)) - else: - import exceptions as _builtin_exceptions - common_exceptions = dir(_builtin_exceptions) - - for name in common_exceptions: - obj = getattr(_builtin_exceptions, name) - if isinstance(obj, type) and issubclass(obj, BaseException): - this_obj, this_name = _mk_exception(obj, name=name) - namespace[this_name] = this_obj - return namespace - - -# Updating module locals so that the exceptions pickle right. AFAIK this -# works only at module-creation time -locals().update(_mk_common_exceptions()) diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py index bae0df31fa9c2..7a4a2885c9f15 100644 --- a/sklearn/externals/joblib/numpy_pickle.py +++ b/sklearn/externals/joblib/numpy_pickle.py @@ -1,600 +1,3 @@ -"""Utilities for fast persistence of big data, with optional compression.""" +# Import necessary to preserve backward compatibliity of pickles -# Author: Gael Varoquaux -# Copyright (c) 2009 Gael Varoquaux -# License: BSD Style, 3 clauses. - -import pickle -import os -import sys -import warnings -try: - from pathlib import Path -except ImportError: - Path = None - -from .compressor import lz4, LZ4_NOT_INSTALLED_ERROR -from .compressor import _COMPRESSORS, register_compressor, BinaryZlibFile -from .compressor import (ZlibCompressorWrapper, GzipCompressorWrapper, - BZ2CompressorWrapper, LZMACompressorWrapper, - XZCompressorWrapper, LZ4CompressorWrapper) -from .numpy_pickle_utils import Unpickler, Pickler -from .numpy_pickle_utils import _read_fileobject, _write_fileobject -from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE -from .numpy_pickle_compat import load_compatibility -from .numpy_pickle_compat import NDArrayWrapper -# For compatibility with old versions of joblib, we need ZNDArrayWrapper -# to be visible in the current namespace. -# Explicitly skipping next line from flake8 as it triggers an F401 warning -# which we don't care. 
-from .numpy_pickle_compat import ZNDArrayWrapper # noqa -from ._compat import _basestring, PY3_OR_LATER -from .backports import make_memmap - -# Register supported compressors -register_compressor('zlib', ZlibCompressorWrapper()) -register_compressor('gzip', GzipCompressorWrapper()) -register_compressor('bz2', BZ2CompressorWrapper()) -register_compressor('lzma', LZMACompressorWrapper()) -register_compressor('xz', XZCompressorWrapper()) -register_compressor('lz4', LZ4CompressorWrapper()) - -############################################################################### -# Utility objects for persistence. - - -class NumpyArrayWrapper(object): - """An object to be persisted instead of numpy arrays. - - This object is used to hack into the pickle machinery and read numpy - array data from our custom persistence format. - More precisely, this object is used for: - * carrying the information of the persisted array: subclass, shape, order, - dtype. Those ndarray metadata are used to correctly reconstruct the array - with low level numpy functions. - * determining if memmap is allowed on the array. - * reading the array bytes from a file. - * reading the array using memorymap from a file. - * writing the array bytes to a file. - - Attributes - ---------- - subclass: numpy.ndarray subclass - Determine the subclass of the wrapped array. - shape: numpy.ndarray shape - Determine the shape of the wrapped array. - order: {'C', 'F'} - Determine the order of wrapped array data. 'C' is for C order, 'F' is - for fortran order. - dtype: numpy.ndarray dtype - Determine the data type of the wrapped array. - allow_mmap: bool - Determine if memory mapping is allowed on the wrapped array. - Default: False. - """ - - def __init__(self, subclass, shape, order, dtype, allow_mmap=False): - """Constructor. Store the useful information for later.""" - self.subclass = subclass - self.shape = shape - self.order = order - self.dtype = dtype - self.allow_mmap = allow_mmap - - def write_array(self, array, pickler): - """Write array bytes to pickler file handle. - - This function is an adaptation of the numpy write_array function - available in version 1.10.1 in numpy/lib/format.py. - """ - # Set buffer size to 16 MiB to hide the Python loop overhead. - buffersize = max(16 * 1024 ** 2 // array.itemsize, 1) - if array.dtype.hasobject: - # We contain Python objects so we cannot write out the data - # directly. Instead, we will pickle it out with version 2 of the - # pickle protocol. - pickle.dump(array, pickler.file_handle, protocol=2) - else: - for chunk in pickler.np.nditer(array, - flags=['external_loop', - 'buffered', - 'zerosize_ok'], - buffersize=buffersize, - order=self.order): - pickler.file_handle.write(chunk.tostring('C')) - - def read_array(self, unpickler): - """Read array from unpickler file handle. - - This function is an adaptation of the numpy read_array function - available in version 1.10.1 in numpy/lib/format.py. - """ - if len(self.shape) == 0: - count = 1 - else: - count = unpickler.np.multiply.reduce(self.shape) - # Now read the actual data. - if self.dtype.hasobject: - # The array contained Python objects. We need to unpickle the data. - array = pickle.load(unpickler.file_handle) - else: - if (not PY3_OR_LATER and - unpickler.np.compat.isfileobj(unpickler.file_handle)): - # In python 2, gzip.GzipFile is considered as a file so one - # can use numpy.fromfile(). - # For file objects, use np.fromfile function. - # This function is faster than the memory-intensive - # method below. 
- array = unpickler.np.fromfile(unpickler.file_handle, - dtype=self.dtype, count=count) - else: - # This is not a real file. We have to read it the - # memory-intensive way. - # crc32 module fails on reads greater than 2 ** 32 bytes, - # breaking large reads from gzip streams. Chunk reads to - # BUFFER_SIZE bytes to avoid issue and reduce memory overhead - # of the read. In non-chunked case count < max_read_count, so - # only one read is performed. - max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, - self.dtype.itemsize) - - array = unpickler.np.empty(count, dtype=self.dtype) - for i in range(0, count, max_read_count): - read_count = min(max_read_count, count - i) - read_size = int(read_count * self.dtype.itemsize) - data = _read_bytes(unpickler.file_handle, - read_size, "array data") - array[i:i + read_count] = \ - unpickler.np.frombuffer(data, dtype=self.dtype, - count=read_count) - del data - - if self.order == 'F': - array.shape = self.shape[::-1] - array = array.transpose() - else: - array.shape = self.shape - - return array - - def read_mmap(self, unpickler): - """Read an array using numpy memmap.""" - offset = unpickler.file_handle.tell() - if unpickler.mmap_mode == 'w+': - unpickler.mmap_mode = 'r+' - - marray = make_memmap(unpickler.filename, - dtype=self.dtype, - shape=self.shape, - order=self.order, - mode=unpickler.mmap_mode, - offset=offset) - # update the offset so that it corresponds to the end of the read array - unpickler.file_handle.seek(offset + marray.nbytes) - - return marray - - def read(self, unpickler): - """Read the array corresponding to this wrapper. - - Use the unpickler to get all information to correctly read the array. - - Parameters - ---------- - unpickler: NumpyUnpickler - - Returns - ------- - array: numpy.ndarray - - """ - # When requested, only use memmap mode if allowed. - if unpickler.mmap_mode is not None and self.allow_mmap: - array = self.read_mmap(unpickler) - else: - array = self.read_array(unpickler) - - # Manage array subclass case - if (hasattr(array, '__array_prepare__') and - self.subclass not in (unpickler.np.ndarray, - unpickler.np.memmap)): - # We need to reconstruct another subclass - new_array = unpickler.np.core.multiarray._reconstruct( - self.subclass, (0,), 'b') - return new_array.__array_prepare__(array) - else: - return array - -############################################################################### -# Pickler classes - - -class NumpyPickler(Pickler): - """A pickler to persist big data efficiently. - - The main features of this object are: - * persistence of numpy arrays in a single file. - * optional compression with a special care on avoiding memory copies. - - Attributes - ---------- - fp: file - File object handle used for serializing the input object. - protocol: int, optional - Pickle protocol used. Default is pickle.DEFAULT_PROTOCOL under - python 3, pickle.HIGHEST_PROTOCOL otherwise. 
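``read_mmap`` above is the code path behind the ``mmap_mode`` option of ``joblib.load``; it is only taken for uncompressed dumps, since ``allow_mmap`` is disabled for buffered compressed writers. A small sketch, with an illustrative ``/tmp`` path::

    import numpy as np
    import joblib

    big = np.zeros((1000, 1000))
    joblib.dump(big, '/tmp/big_array.joblib')       # uncompressed dump

    # The returned object is a read-only numpy.memmap backed by the file,
    # so the array data is not copied into memory up front.
    view = joblib.load('/tmp/big_array.joblib', mmap_mode='r')
    print(type(view))       # <class 'numpy.memmap'>
    print(view[0, 0])       # 0.0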
- """ - - dispatch = Pickler.dispatch.copy() - - def __init__(self, fp, protocol=None): - self.file_handle = fp - self.buffered = isinstance(self.file_handle, BinaryZlibFile) - - # By default we want a pickle protocol that only changes with - # the major python version and not the minor one - if protocol is None: - protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER - else pickle.HIGHEST_PROTOCOL) - - Pickler.__init__(self, self.file_handle, protocol=protocol) - # delayed import of numpy, to avoid tight coupling - try: - import numpy as np - except ImportError: - np = None - self.np = np - - def _create_array_wrapper(self, array): - """Create and returns a numpy array wrapper from a numpy array.""" - order = 'F' if (array.flags.f_contiguous and - not array.flags.c_contiguous) else 'C' - allow_mmap = not self.buffered and not array.dtype.hasobject - wrapper = NumpyArrayWrapper(type(array), - array.shape, order, array.dtype, - allow_mmap=allow_mmap) - - return wrapper - - def save(self, obj): - """Subclass the Pickler `save` method. - - This is a total abuse of the Pickler class in order to use the numpy - persistence function `save` instead of the default pickle - implementation. The numpy array is replaced by a custom wrapper in the - pickle persistence stack and the serialized array is written right - after in the file. Warning: the file produced does not follow the - pickle format. As such it can not be read with `pickle.load`. - """ - if self.np is not None and type(obj) in (self.np.ndarray, - self.np.matrix, - self.np.memmap): - if type(obj) is self.np.memmap: - # Pickling doesn't work with memmapped arrays - obj = self.np.asanyarray(obj) - - # The array wrapper is pickled instead of the real array. - wrapper = self._create_array_wrapper(obj) - Pickler.save(self, wrapper) - - # A framer was introduced with pickle protocol 4 and we want to - # ensure the wrapper object is written before the numpy array - # buffer in the pickle file. - # See https://www.python.org/dev/peps/pep-3154/#framing to get - # more information on the framer behavior. - if self.proto >= 4: - self.framer.commit_frame(force=True) - - # And then array bytes are written right after the wrapper. - wrapper.write_array(obj, self) - return - - return Pickler.save(self, obj) - - -class NumpyUnpickler(Unpickler): - """A subclass of the Unpickler to unpickle our numpy pickles. - - Attributes - ---------- - mmap_mode: str - The memorymap mode to use for reading numpy arrays. - file_handle: file_like - File object to unpickle from. - filename: str - Name of the file to unpickle from. It should correspond to file_handle. - This parameter is required when using mmap_mode. - np: module - Reference to numpy module if numpy is installed else None. - - """ - - dispatch = Unpickler.dispatch.copy() - - def __init__(self, filename, file_handle, mmap_mode=None): - # The next line is for backward compatibility with pickle generated - # with joblib versions less than 0.10. - self._dirname = os.path.dirname(filename) - - self.mmap_mode = mmap_mode - self.file_handle = file_handle - # filename is required for numpy mmap mode. - self.filename = filename - self.compat_mode = False - Unpickler.__init__(self, self.file_handle) - try: - import numpy as np - except ImportError: - np = None - self.np = np - - def load_build(self): - """Called to set the state of a newly created object. - - We capture it to replace our place-holder objects, NDArrayWrapper or - NumpyArrayWrapper, by the array we are interested in. 
We - replace them directly in the stack of pickler. - NDArrayWrapper is used for backward compatibility with joblib <= 0.9. - """ - Unpickler.load_build(self) - - # For backward compatibility, we support NDArrayWrapper objects. - if isinstance(self.stack[-1], (NDArrayWrapper, NumpyArrayWrapper)): - if self.np is None: - raise ImportError("Trying to unpickle an ndarray, " - "but numpy didn't import correctly") - array_wrapper = self.stack.pop() - # If any NDArrayWrapper is found, we switch to compatibility mode, - # this will be used to raise a DeprecationWarning to the user at - # the end of the unpickling. - if isinstance(array_wrapper, NDArrayWrapper): - self.compat_mode = True - self.stack.append(array_wrapper.read(self)) - - # Be careful to register our new method. - if PY3_OR_LATER: - dispatch[pickle.BUILD[0]] = load_build - else: - dispatch[pickle.BUILD] = load_build - - -############################################################################### -# Utility functions - -def dump(value, filename, compress=0, protocol=None, cache_size=None): - """Persist an arbitrary Python object into one file. - - Read more in the :ref:`User Guide `. - - Parameters - ----------- - value: any Python object - The object to store to disk. - filename: str, pathlib.Path, or file object. - The file object or path of the file in which it is to be stored. - The compression method corresponding to one of the supported filename - extensions ('.z', '.gz', '.bz2', '.xz' or '.lzma') will be used - automatically. - compress: int from 0 to 9 or bool or 2-tuple, optional - Optional compression level for the data. 0 or False is no compression. - Higher value means more compression, but also slower read and - write times. Using a value of 3 is often a good compromise. - See the notes for more details. - If compress is True, the compression level used is 3. - If compress is a 2-tuple, the first element must correspond to a string - between supported compressors (e.g 'zlib', 'gzip', 'bz2', 'lzma' - 'xz'), the second element must be an integer from 0 to 9, corresponding - to the compression level. - protocol: int, optional - Pickle protocol, see pickle.dump documentation for more details. - cache_size: positive int, optional - This option is deprecated in 0.10 and has no effect. - - Returns - ------- - filenames: list of strings - The list of file names in which the data is stored. If - compress is false, each array is stored in a different file. - - See Also - -------- - joblib.load : corresponding loader - - Notes - ----- - Memmapping on load cannot be used for compressed files. Thus - using compression can significantly slow down loading. In - addition, compressed files take extra extra memory during - dump and load. - - """ - - if Path is not None and isinstance(filename, Path): - filename = str(filename) - - is_filename = isinstance(filename, _basestring) - is_fileobj = hasattr(filename, "write") - - compress_method = 'zlib' # zlib is the default compression method. - if compress is True: - # By default, if compress is enabled, we want the default compress - # level of the compressor. 
- compress_level = None - elif isinstance(compress, tuple): - # a 2-tuple was set in compress - if len(compress) != 2: - raise ValueError( - 'Compress argument tuple should contain exactly 2 elements: ' - '(compress method, compress level), you passed {}' - .format(compress)) - compress_method, compress_level = compress - elif isinstance(compress, _basestring): - compress_method = compress - compress_level = None # Use default compress level - compress = (compress_method, compress_level) - else: - compress_level = compress - - # LZ4 compression is only supported and installation checked with - # python 3+. - if compress_method == 'lz4' and lz4 is None and PY3_OR_LATER: - raise ValueError(LZ4_NOT_INSTALLED_ERROR) - - if (compress_level is not None and - compress_level is not False and - compress_level not in range(10)): - # Raising an error if a non valid compress level is given. - raise ValueError( - 'Non valid compress level given: "{}". Possible values are ' - '{}.'.format(compress_level, list(range(10)))) - - if compress_method not in _COMPRESSORS: - # Raising an error if an unsupported compression method is given. - raise ValueError( - 'Non valid compression method given: "{}". Possible values are ' - '{}.'.format(compress_method, _COMPRESSORS)) - - if not is_filename and not is_fileobj: - # People keep inverting arguments, and the resulting error is - # incomprehensible - raise ValueError( - 'Second argument should be a filename or a file-like object, ' - '%s (type %s) was given.' - % (filename, type(filename)) - ) - - if is_filename and not isinstance(compress, tuple): - # In case no explicit compression was requested using both compression - # method and level in a tuple and the filename has an explicit - # extension, we select the corresponding compressor. - - # unset the variable to be sure no compression level is set afterwards. - compress_method = None - for name, compressor in _COMPRESSORS.items(): - if filename.endswith(compressor.extension): - compress_method = name - - if compress_method in _COMPRESSORS and compress_level == 0: - # we choose the default compress_level in case it was not given - # as an argument (using compress). - compress_level = None - - if not PY3_OR_LATER and compress_method in ('lzma', 'xz'): - raise NotImplementedError("{} compression is only available for " - "python version >= 3.3. You are using " - "{}.{}".format(compress_method, - sys.version_info[0], - sys.version_info[1])) - - if cache_size is not None: - # Cache size is deprecated starting from version 0.10 - warnings.warn("Please do not set 'cache_size' in joblib.dump, " - "this parameter has no effect and will be removed. " - "You used 'cache_size={}'".format(cache_size), - DeprecationWarning, stacklevel=2) - - if compress_level != 0: - with _write_fileobject(filename, compress=(compress_method, - compress_level)) as f: - NumpyPickler(f, protocol=protocol).dump(value) - elif is_filename: - with open(filename, 'wb') as f: - NumpyPickler(f, protocol=protocol).dump(value) - else: - NumpyPickler(filename, protocol=protocol).dump(value) - - # If the target container is a file object, nothing is returned. - if is_fileobj: - return - - # For compatibility, the list of created filenames (e.g with one element - # after 0.10.0) is returned by default. - return [filename] - - -def _unpickle(fobj, filename="", mmap_mode=None): - """Internal unpickling function.""" - # We are careful to open the file handle early and keep it open to - # avoid race-conditions on renames. 
- # That said, if data is stored in companion files, which can be - # the case with the old persistence format, moving the directory - # will create a race when joblib tries to access the companion - # files. - unpickler = NumpyUnpickler(filename, fobj, mmap_mode=mmap_mode) - obj = None - try: - obj = unpickler.load() - if unpickler.compat_mode: - warnings.warn("The file '%s' has been generated with a " - "joblib version less than 0.10. " - "Please regenerate this pickle file." - % filename, - DeprecationWarning, stacklevel=3) - except UnicodeDecodeError as exc: - # More user-friendly error message - if PY3_OR_LATER: - new_exc = ValueError( - 'You may be trying to read with ' - 'python 3 a joblib pickle generated with python 2. ' - 'This feature is not supported by joblib.') - new_exc.__cause__ = exc - raise new_exc - # Reraise exception with Python 2 - raise - - return obj - - -def load(filename, mmap_mode=None): - """Reconstruct a Python object from a file persisted with joblib.dump. - - Read more in the :ref:`User Guide `. - - Parameters - ----------- - filename: str, pathlib.Path, or file object. - The file object or path of the file from which to load the object - mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional - If not None, the arrays are memory-mapped from the disk. This - mode has no effect for compressed files. Note that in this - case the reconstructed object might no longer match exactly - the originally pickled object. - - Returns - ------- - result: any Python object - The object stored in the file. - - See Also - -------- - joblib.dump : function to save an object - - Notes - ----- - - This function can load numpy array files saved separately during the - dump. If the mmap_mode argument is given, it is passed to np.load and - arrays are loaded as memmaps. As a consequence, the reconstructed - object might not match the original pickled object. Note that if the - file was saved with compression, the arrays cannot be memmapped. - """ - if Path is not None and isinstance(filename, Path): - filename = str(filename) - - if hasattr(filename, "read"): - fobj = filename - filename = getattr(fobj, 'name', '') - with _read_fileobject(fobj, filename, mmap_mode) as fobj: - obj = _unpickle(fobj) - else: - with open(filename, 'rb') as f: - with _read_fileobject(f, filename, mmap_mode) as fobj: - if isinstance(fobj, _basestring): - # if the returned file object is a string, this means we - # try to load a pickle file generated with an version of - # Joblib so we load it with joblib compatibility function. 
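The ``dump``/``load`` pair documented above keeps its public signature when imported from the site joblib. A minimal round-trip sketch, using an illustrative ``/tmp`` path and the ``compress=3`` level that the docstring calls a good compromise::

    import numpy as np
    import joblib

    payload = {'weights': np.arange(10.0), 'name': 'demo'}

    # dump returns the list of file names it created (a single entry here).
    filenames = joblib.dump(payload, '/tmp/payload.joblib', compress=3)

    restored = joblib.load('/tmp/payload.joblib')
    assert restored['name'] == 'demo'
    assert np.array_equal(restored['weights'], payload['weights'])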
- return load_compatibility(fobj) - - obj = _unpickle(fobj, filename, mmap_mode) - - return obj +from joblib.numpy_pickle import * diff --git a/sklearn/externals/joblib/numpy_pickle_compat.py b/sklearn/externals/joblib/numpy_pickle_compat.py deleted file mode 100644 index ba8ab827914e0..0000000000000 --- a/sklearn/externals/joblib/numpy_pickle_compat.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Numpy pickle compatibility functions.""" - -import pickle -import os -import zlib -from io import BytesIO - -from ._compat import PY3_OR_LATER -from .numpy_pickle_utils import _ZFILE_PREFIX -from .numpy_pickle_utils import Unpickler - - -def hex_str(an_int): - """Convert an int to an hexadecimal string.""" - return '{:#x}'.format(an_int) - -if PY3_OR_LATER: - def asbytes(s): - if isinstance(s, bytes): - return s - return s.encode('latin1') -else: - asbytes = str - -_MAX_LEN = len(hex_str(2 ** 64)) -_CHUNK_SIZE = 64 * 1024 - - -def read_zfile(file_handle): - """Read the z-file and return the content as a string. - - Z-files are raw data compressed with zlib used internally by joblib - for persistence. Backward compatibility is not guaranteed. Do not - use for external purposes. - """ - file_handle.seek(0) - header_length = len(_ZFILE_PREFIX) + _MAX_LEN - length = file_handle.read(header_length) - length = length[len(_ZFILE_PREFIX):] - length = int(length, 16) - - # With python2 and joblib version <= 0.8.4 compressed pickle header is one - # character wider so we need to ignore an additional space if present. - # Note: the first byte of the zlib data is guaranteed not to be a - # space according to - # https://tools.ietf.org/html/rfc6713#section-2.1 - next_byte = file_handle.read(1) - if next_byte != b' ': - # The zlib compressed data has started and we need to go back - # one byte - file_handle.seek(header_length) - - # We use the known length of the data to tell Zlib the size of the - # buffer to allocate. - data = zlib.decompress(file_handle.read(), 15, length) - assert len(data) == length, ( - "Incorrect data length while decompressing %s." - "The file could be corrupted." % file_handle) - return data - - -def write_zfile(file_handle, data, compress=1): - """Write the data in the given file as a Z-file. - - Z-files are raw data compressed with zlib used internally by joblib - for persistence. Backward compatibility is not guarantied. Do not - use for external purposes. - """ - file_handle.write(_ZFILE_PREFIX) - length = hex_str(len(data)) - # Store the length of the data - file_handle.write(asbytes(length.ljust(_MAX_LEN))) - file_handle.write(zlib.compress(asbytes(data), compress)) - -############################################################################### -# Utility objects for persistence. - - -class NDArrayWrapper(object): - """An object to be persisted instead of numpy arrays. - - The only thing this object does, is to carry the filename in which - the array has been persisted, and the array subclass. - """ - - def __init__(self, filename, subclass, allow_mmap=True): - """Constructor. 
Store the useful information for later.""" - self.filename = filename - self.subclass = subclass - self.allow_mmap = allow_mmap - - def read(self, unpickler): - """Reconstruct the array.""" - filename = os.path.join(unpickler._dirname, self.filename) - # Load the array from the disk - # use getattr instead of self.allow_mmap to ensure backward compat - # with NDArrayWrapper instances pickled with joblib < 0.9.0 - allow_mmap = getattr(self, 'allow_mmap', True) - memmap_kwargs = ({} if not allow_mmap - else {'mmap_mode': unpickler.mmap_mode}) - array = unpickler.np.load(filename, **memmap_kwargs) - # Reconstruct subclasses. This does not work with old - # versions of numpy - if (hasattr(array, '__array_prepare__') and - self.subclass not in (unpickler.np.ndarray, - unpickler.np.memmap)): - # We need to reconstruct another subclass - new_array = unpickler.np.core.multiarray._reconstruct( - self.subclass, (0,), 'b') - return new_array.__array_prepare__(array) - else: - return array - - -class ZNDArrayWrapper(NDArrayWrapper): - """An object to be persisted instead of numpy arrays. - - This object store the Zfile filename in which - the data array has been persisted, and the meta information to - retrieve it. - The reason that we store the raw buffer data of the array and - the meta information, rather than array representation routine - (tostring) is that it enables us to use completely the strided - model to avoid memory copies (a and a.T store as fast). In - addition saving the heavy information separately can avoid - creating large temporary buffers when unpickling data with - large arrays. - """ - - def __init__(self, filename, init_args, state): - """Constructor. Store the useful information for later.""" - self.filename = filename - self.state = state - self.init_args = init_args - - def read(self, unpickler): - """Reconstruct the array from the meta-information and the z-file.""" - # Here we a simply reproducing the unpickling mechanism for numpy - # arrays - filename = os.path.join(unpickler._dirname, self.filename) - array = unpickler.np.core.multiarray._reconstruct(*self.init_args) - with open(filename, 'rb') as f: - data = read_zfile(f) - state = self.state + (data,) - array.__setstate__(state) - return array - - -class ZipNumpyUnpickler(Unpickler): - """A subclass of the Unpickler to unpickle our numpy pickles.""" - - dispatch = Unpickler.dispatch.copy() - - def __init__(self, filename, file_handle, mmap_mode=None): - """Constructor.""" - self._filename = os.path.basename(filename) - self._dirname = os.path.dirname(filename) - self.mmap_mode = mmap_mode - self.file_handle = self._open_pickle(file_handle) - Unpickler.__init__(self, self.file_handle) - try: - import numpy as np - except ImportError: - np = None - self.np = np - - def _open_pickle(self, file_handle): - return BytesIO(read_zfile(file_handle)) - - def load_build(self): - """Set the state of a newly created object. - - We capture it to replace our place-holder objects, - NDArrayWrapper, by the array we are interested in. We - replace them directly in the stack of pickler. - """ - Unpickler.load_build(self) - if isinstance(self.stack[-1], NDArrayWrapper): - if self.np is None: - raise ImportError("Trying to unpickle an ndarray, " - "but numpy didn't import correctly") - nd_array_wrapper = self.stack.pop() - array = nd_array_wrapper.read(self) - self.stack.append(array) - - # Be careful to register our new method. 
- if PY3_OR_LATER: - dispatch[pickle.BUILD[0]] = load_build - else: - dispatch[pickle.BUILD] = load_build - - -def load_compatibility(filename): - """Reconstruct a Python object from a file persisted with joblib.dump. - - This function ensures the compatibility with joblib old persistence format - (<= 0.9.3). - - Parameters - ----------- - filename: string - The name of the file from which to load the object - - Returns - ------- - result: any Python object - The object stored in the file. - - See Also - -------- - joblib.dump : function to save an object - - Notes - ----- - - This function can load numpy array files saved separately during the - dump. - """ - with open(filename, 'rb') as file_handle: - # We are careful to open the file handle early and keep it open to - # avoid race-conditions on renames. That said, if data is stored in - # companion files, moving the directory will create a race when - # joblib tries to access the companion files. - unpickler = ZipNumpyUnpickler(filename, file_handle=file_handle) - try: - obj = unpickler.load() - except UnicodeDecodeError as exc: - # More user-friendly error message - if PY3_OR_LATER: - new_exc = ValueError( - 'You may be trying to read with ' - 'python 3 a joblib pickle generated with python 2. ' - 'This feature is not supported by joblib.') - new_exc.__cause__ = exc - raise new_exc - finally: - if hasattr(unpickler, 'file_handle'): - unpickler.file_handle.close() - return obj diff --git a/sklearn/externals/joblib/numpy_pickle_utils.py b/sklearn/externals/joblib/numpy_pickle_utils.py deleted file mode 100644 index 1ebf1aa61bb44..0000000000000 --- a/sklearn/externals/joblib/numpy_pickle_utils.py +++ /dev/null @@ -1,245 +0,0 @@ -"""Utilities for fast persistence of big data, with optional compression.""" - -# Author: Gael Varoquaux -# Copyright (c) 2009 Gael Varoquaux -# License: BSD Style, 3 clauses. - -import pickle -import io -import warnings -import contextlib -from contextlib import closing - -from ._compat import PY3_OR_LATER, PY27 -from .compressor import _ZFILE_PREFIX -from .compressor import _COMPRESSORS - -if PY3_OR_LATER: - Unpickler = pickle._Unpickler - Pickler = pickle._Pickler - xrange = range -else: - Unpickler = pickle.Unpickler - Pickler = pickle.Pickler - -try: - import numpy as np -except ImportError: - np = None - -try: - # The python standard library can be built without bz2 so we make bz2 - # usage optional. - # see https://github.com/scikit-learn/scikit-learn/issues/7526 for more - # details. - import bz2 -except ImportError: - bz2 = None - -# Buffer size used in io.BufferedReader and io.BufferedWriter -_IO_BUFFER_SIZE = 1024 ** 2 - - -def _is_raw_file(fileobj): - """Check if fileobj is a raw file object, e.g created with open.""" - if PY3_OR_LATER: - fileobj = getattr(fileobj, 'raw', fileobj) - return isinstance(fileobj, io.FileIO) - else: - return isinstance(fileobj, file) # noqa - - -def _get_prefixes_max_len(): - # Compute the max prefix len of registered compressors. - prefixes = [len(compressor.prefix) for compressor in _COMPRESSORS.values()] - prefixes += [len(_ZFILE_PREFIX)] - return max(prefixes) - - -############################################################################### -# Cache file utilities -def _detect_compressor(fileobj): - """Return the compressor matching fileobj. - - Parameters - ---------- - fileobj: file object - - Returns - ------- - str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'} - """ - # Read the magic number in the first bytes of the file. 
- max_prefix_len = _get_prefixes_max_len() - if hasattr(fileobj, 'peek'): - # Peek allows to read those bytes without moving the cursor in the - # file whic. - first_bytes = fileobj.peek(max_prefix_len) - else: - # Fallback to seek if the fileobject is not peekable. - first_bytes = fileobj.read(max_prefix_len) - fileobj.seek(0) - - if first_bytes.startswith(_ZFILE_PREFIX): - return "compat" - else: - for name, compressor in _COMPRESSORS.items(): - if first_bytes.startswith(compressor.prefix): - return name - - return "not-compressed" - - -def _buffered_read_file(fobj): - """Return a buffered version of a read file object.""" - if PY27 and bz2 is not None and isinstance(fobj, bz2.BZ2File): - # Python 2.7 doesn't work with BZ2File through a buffer: "no - # attribute 'readable'" error. - return fobj - else: - return io.BufferedReader(fobj, buffer_size=_IO_BUFFER_SIZE) - - -def _buffered_write_file(fobj): - """Return a buffered version of a write file object.""" - if PY27 and bz2 is not None and isinstance(fobj, bz2.BZ2File): - # Python 2.7 doesn't work with BZ2File through a buffer: no attribute - # 'writable'. - # BZ2File doesn't implement the file object context manager in python 2 - # so we wrap the fileobj using `closing`. - return closing(fobj) - else: - return io.BufferedWriter(fobj, buffer_size=_IO_BUFFER_SIZE) - - -@contextlib.contextmanager -def _read_fileobject(fileobj, filename, mmap_mode=None): - """Utility function opening the right fileobject from a filename. - - The magic number is used to choose between the type of file object to open: - * regular file object (default) - * zlib file object - * gzip file object - * bz2 file object - * lzma file object (for xz and lzma compressor) - - Parameters - ---------- - fileobj: file object - compressor: str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', - 'not-compressed'} - filename: str - filename path corresponding to the fileobj parameter. - mmap_mode: str - memory map mode that should be used to open the pickle file. This - parameter is useful to verify that the user is not trying to one with - compression. Default: None. - - Returns - ------- - a file like object - - """ - # Detect if the fileobj contains compressed data. - compressor = _detect_compressor(fileobj) - - if compressor == 'compat': - # Compatibility with old pickle mode: simply return the input - # filename "as-is" and let the compatibility function be called by the - # caller. - warnings.warn("The file '%s' has been generated with a joblib " - "version less than 0.10. " - "Please regenerate this pickle file." % filename, - DeprecationWarning, stacklevel=2) - yield filename - else: - if compressor in _COMPRESSORS: - # based on the compressor detected in the file, we open the - # correct decompressor file object, wrapped in a buffer. - compressor_wrapper = _COMPRESSORS[compressor] - inst = compressor_wrapper.decompressor_file(fileobj) - fileobj = _buffered_read_file(inst) - - # Checking if incompatible load parameters with the type of file: - # mmap_mode cannot be used with compressed file or in memory buffers - # such as io.BytesIO. - if mmap_mode is not None: - if isinstance(fileobj, io.BytesIO): - warnings.warn('In memory persistence is not compatible with ' - 'mmap_mode "%(mmap_mode)s" flag passed. ' - 'mmap_mode option will be ignored.' - % locals(), stacklevel=2) - elif compressor != 'not-compressed': - warnings.warn('mmap_mode "%(mmap_mode)s" is not compatible ' - 'with compressed file %(filename)s. ' - '"%(mmap_mode)s" flag will be ignored.' 
- % locals(), stacklevel=2) - elif not _is_raw_file(fileobj): - warnings.warn('"%(fileobj)r" is not a raw file, mmap_mode ' - '"%(mmap_mode)s" flag will be ignored.' - % locals(), stacklevel=2) - - yield fileobj - - -def _write_fileobject(filename, compress=("zlib", 3)): - """Return the right compressor file object in write mode.""" - compressmethod = compress[0] - compresslevel = compress[1] - - if compressmethod in _COMPRESSORS.keys(): - file_instance = _COMPRESSORS[compressmethod].compressor_file( - filename, compresslevel=compresslevel) - return _buffered_write_file(file_instance) - else: - file_instance = _COMPRESSORS['zlib'].compressor_file( - filename, compresslevel=compresslevel) - return _buffered_write_file(file_instance) - - -# Utility functions/variables from numpy required for writing arrays. -# We need at least the functions introduced in version 1.9 of numpy. Here, -# we use the ones from numpy 1.10.2. -BUFFER_SIZE = 2 ** 18 # size of buffer for reading npz files in bytes - - -def _read_bytes(fp, size, error_template="ran out of data"): - """Read from file-like object until size bytes are read. - - Raises ValueError if not EOF is encountered before size bytes are read. - Non-blocking objects only supported if they derive from io objects. - - Required as e.g. ZipExtFile in python 2.6 can return less data than - requested. - - This function was taken from numpy/lib/format.py in version 1.10.2. - - Parameters - ---------- - fp: file-like object - size: int - error_template: str - - Returns - ------- - a bytes object - The data read in bytes. - - """ - data = bytes() - while True: - # io files (default in python3) return None or raise on - # would-block, python2 file will truncate, probably nothing can be - # done about that. note that regular files can't be non-blocking - try: - r = fp.read(size - len(data)) - data += r - if len(r) == 0 or len(data) == size: - break - except io.BlockingIOError: - pass - if len(data) != size: - msg = "EOF: reading %s, expected %d bytes got %d" - raise ValueError(msg % (error_template, size, len(data))) - else: - return data diff --git a/sklearn/externals/joblib/parallel.py b/sklearn/externals/joblib/parallel.py deleted file mode 100644 index df28678ad95fb..0000000000000 --- a/sklearn/externals/joblib/parallel.py +++ /dev/null @@ -1,948 +0,0 @@ -""" -Helpers for embarrassingly parallel code. -""" -# Author: Gael Varoquaux < gael dot varoquaux at normalesup dot org > -# Copyright: 2010, Gael Varoquaux -# License: BSD 3 clause - -from __future__ import division - -import os -import sys -from math import sqrt -import functools -import time -import inspect -import threading -import itertools -from numbers import Integral -import warnings -from functools import partial - -from ._multiprocessing_helpers import mp - -from .format_stack import format_outer_frames -from .logger import Logger, short_format_time -from .my_exceptions import TransportableException -from .disk import memstr_to_bytes -from ._parallel_backends import (FallbackToBackend, MultiprocessingBackend, - ThreadingBackend, SequentialBackend, - LokyBackend) -from ._compat import _basestring -from .externals.cloudpickle import dumps, loads -from .externals import loky - -# Make sure that those two classes are part of the public joblib.parallel API -# so that 3rd party backend implementers can import them from here. 
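The compressor registry shared by ``_detect_compressor`` and ``_write_fileobject`` above is what lets ``dump`` pick a method from the filename suffix and lets ``load`` sniff it back from the file's magic bytes, as documented for ``dump`` further up. A sketch, again with an illustrative ``/tmp`` path::

    import joblib

    numbers = list(range(1000))

    # The '.gz' suffix selects the gzip compressor automatically; no compress
    # argument is needed on either side because load detects the magic bytes.
    joblib.dump(numbers, '/tmp/numbers.joblib.gz')
    assert joblib.load('/tmp/numbers.joblib.gz') == numbers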
-from ._parallel_backends import AutoBatchingMixin # noqa -from ._parallel_backends import ParallelBackendBase # noqa - -BACKENDS = { - 'multiprocessing': MultiprocessingBackend, - 'threading': ThreadingBackend, - 'sequential': SequentialBackend, - 'loky': LokyBackend, -} -# name of the backend used by default by Parallel outside of any context -# managed by ``parallel_backend``. -DEFAULT_BACKEND = 'loky' -DEFAULT_N_JOBS = 1 -DEFAULT_THREAD_BACKEND = 'threading' - -# Thread local value that can be overridden by the ``parallel_backend`` context -# manager -_backend = threading.local() - -VALID_BACKEND_HINTS = ('processes', 'threads', None) -VALID_BACKEND_CONSTRAINTS = ('sharedmem', None) - - -def _register_dask(): - """ Register Dask Backend if called with parallel_backend("dask") """ - try: - from ._dask import DaskDistributedBackend - register_parallel_backend('dask', DaskDistributedBackend) - except ImportError: - msg = ("To use the dask.distributed backend you must install both " - "the `dask` and distributed modules.\n\n" - "See http://dask.pydata.org/en/latest/install.html for more " - "information.") - raise ImportError(msg) - - -EXTERNAL_BACKENDS = { - 'dask': _register_dask, -} - - -def get_active_backend(prefer=None, require=None, verbose=0): - """Return the active default backend""" - if prefer not in VALID_BACKEND_HINTS: - raise ValueError("prefer=%r is not a valid backend hint, " - "expected one of %r" % (prefer, VALID_BACKEND_HINTS)) - if require not in VALID_BACKEND_CONSTRAINTS: - raise ValueError("require=%r is not a valid backend constraint, " - "expected one of %r" - % (require, VALID_BACKEND_CONSTRAINTS)) - - if prefer == 'processes' and require == 'sharedmem': - raise ValueError("prefer == 'processes' and require == 'sharedmem'" - " are inconsistent settings") - backend_and_jobs = getattr(_backend, 'backend_and_jobs', None) - if backend_and_jobs is not None: - # Try to use the backend set by the user with the context manager. - backend, n_jobs = backend_and_jobs - supports_sharedmem = getattr(backend, 'supports_sharedmem', False) - if require == 'sharedmem' and not supports_sharedmem: - # This backend does not match the shared memory constraint: - # fallback to the default thead-based backend. - sharedmem_backend = BACKENDS[DEFAULT_THREAD_BACKEND]() - if verbose >= 10: - print("Using %s as joblib.Parallel backend instead of %s " - "as the latter does not provide shared memory semantics." - % (sharedmem_backend.__class__.__name__, - backend.__class__.__name__)) - return sharedmem_backend, DEFAULT_N_JOBS - else: - return backend_and_jobs - - # We are outside of the scope of any parallel_backend context manager, - # create the default backend instance now. - backend = BACKENDS[DEFAULT_BACKEND]() - supports_sharedmem = getattr(backend, 'supports_sharedmem', False) - uses_threads = getattr(backend, 'uses_threads', False) - if ((require == 'sharedmem' and not supports_sharedmem) or - (prefer == 'threads' and not uses_threads)): - # Make sure the selected default backend match the soft hints and - # hard constraints: - backend = BACKENDS[DEFAULT_THREAD_BACKEND]() - return backend, DEFAULT_N_JOBS - - -class parallel_backend(object): - """Change the default backend used by Parallel inside a with block. - - If ``backend`` is a string it must match a previously registered - implementation using the ``register_parallel_backend`` function. 
- - By default the following backends are available: - - - 'loky': single-host, process-based parallelism (used by default), - - 'threading': single-host, thread-based parallelism, - - 'multiprocessing': legacy single-host, process-based parallelism. - - 'loky' is recommended to run functions that manipulate Python objects. - 'threading' is a low-overhead alternative that is most efficient for - functions that release the Global Interpreter Lock: e.g. I/O-bound code or - CPU-bound code in a few calls to native code that explicitly releases the - GIL. - - In addition, if the `dask` and `distributed` Python packages are installed, - it is possible to use the 'dask' backend for better scheduling of nested - parallel calls without over-subscription and potentially distribute - parallel calls over a networked cluster of several hosts. - - Alternatively the backend can be passed directly as an instance. - - By default all available workers will be used (``n_jobs=-1``) unless the - caller passes an explicit value for the ``n_jobs`` parameter. - - This is an alternative to passing a ``backend='backend_name'`` argument to - the ``Parallel`` class constructor. It is particularly useful when calling - into library code that uses joblib internally but does not expose the - backend argument in its own API. - - >>> from operator import neg - >>> with parallel_backend('threading'): - ... print(Parallel()(delayed(neg)(i + 1) for i in range(5))) - ... - [-1, -2, -3, -4, -5] - - Warning: this function is experimental and subject to change in a future - version of joblib. - - .. versionadded:: 0.10 - - """ - def __init__(self, backend, n_jobs=-1, **backend_params): - if isinstance(backend, _basestring): - if backend not in BACKENDS and backend in EXTERNAL_BACKENDS: - register = EXTERNAL_BACKENDS[backend] - register() - - backend = BACKENDS[backend](**backend_params) - - self.old_backend_and_jobs = getattr(_backend, 'backend_and_jobs', None) - self.new_backend_and_jobs = (backend, n_jobs) - - _backend.backend_and_jobs = (backend, n_jobs) - - def __enter__(self): - return self.new_backend_and_jobs - - def __exit__(self, type, value, traceback): - self.unregister() - - def unregister(self): - if self.old_backend_and_jobs is None: - if getattr(_backend, 'backend_and_jobs', None) is not None: - del _backend.backend_and_jobs - else: - _backend.backend_and_jobs = self.old_backend_and_jobs - - -# Under Linux or OS X the default start method of multiprocessing -# can cause third party libraries to crash. Under Python 3.4+ it is possible -# to set an environment variable to switch the default start method from -# 'fork' to 'forkserver' or 'spawn' to avoid this issue albeit at the cost -# of causing semantic changes and some additional pool instantiation overhead. -DEFAULT_MP_CONTEXT = None -if hasattr(mp, 'get_context'): - method = os.environ.get('JOBLIB_START_METHOD', '').strip() or None - if method is not None: - DEFAULT_MP_CONTEXT = mp.get_context(method=method) - - -class BatchedCalls(object): - """Wrap a sequence of (func, args, kwargs) tuples as a single callable""" - - def __init__(self, iterator_slice, backend_and_jobs, pickle_cache=None): - self.items = list(iterator_slice) - self._size = len(self.items) - if isinstance(backend_and_jobs, tuple): - self._backend, self._n_jobs = backend_and_jobs - else: - # this is for backward compatibility purposes. Before 0.12.6, - # nested backends were returned without n_jobs indications. 
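Beyond the doctest above, ``parallel_backend`` is mostly useful for steering library code that calls ``Parallel`` internally without exposing a backend argument. A short sketch of that pattern, with ``work`` and ``library_call`` as trivial illustrative stand-ins::

    from joblib import Parallel, delayed, parallel_backend

    def work(i):
        return i * i

    def library_call(values):
        # Imagine this lives inside a third-party library: it exposes neither
        # a backend nor an n_jobs argument of its own.
        return Parallel()(delayed(work)(v) for v in values)

    # Every Parallel call inside the block inherits the threading backend and
    # the worker count set here, instead of the default 'loky' backend.
    with parallel_backend('threading', n_jobs=2):
        print(library_call(range(8)))       # [0, 1, 4, 9, 16, 25, 36, 49]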
- self._backend, self._n_jobs = backend_and_jobs, None - self._pickle_cache = pickle_cache if pickle_cache is not None else {} - - def __call__(self): - # Set the default nested backend to self._backend but do not set the - # change the default number of processes to -1 - with parallel_backend(self._backend, n_jobs=self._n_jobs): - return [func(*args, **kwargs) - for func, args, kwargs in self.items] - - def __len__(self): - return self._size - - -############################################################################### -# CPU count that works also when multiprocessing has been disabled via -# the JOBLIB_MULTIPROCESSING environment variable -def cpu_count(): - """Return the number of CPUs.""" - if mp is None: - return 1 - - return loky.cpu_count() - - -############################################################################### -# For verbosity - -def _verbosity_filter(index, verbose): - """ Returns False for indices increasingly apart, the distance - depending on the value of verbose. - - We use a lag increasing as the square of index - """ - if not verbose: - return True - elif verbose > 10: - return False - if index == 0: - return False - verbose = .5 * (11 - verbose) ** 2 - scale = sqrt(index / verbose) - next_scale = sqrt((index + 1) / verbose) - return (int(next_scale) == int(scale)) - - -############################################################################### -def delayed(function, check_pickle=None): - """Decorator used to capture the arguments of a function.""" - if check_pickle is not None: - warnings.warn('check_pickle is deprecated in joblib 0.12 and will be' - ' removed in 0.13', DeprecationWarning) - # Try to pickle the input function, to catch the problems early when - # using with multiprocessing: - if check_pickle: - dumps(function) - - def delayed_function(*args, **kwargs): - return function, args, kwargs - try: - delayed_function = functools.wraps(function)(delayed_function) - except AttributeError: - " functools.wraps fails on some callable objects " - return delayed_function - - -############################################################################### -class BatchCompletionCallBack(object): - """Callback used by joblib.Parallel's multiprocessing backend. - - This callable is executed by the parent process whenever a worker process - has returned the results of a batch of tasks. - - It is used for progress reporting, to update estimate of the batch - processing duration and to schedule the next batch of tasks to be - processed. - - """ - def __init__(self, dispatch_timestamp, batch_size, parallel): - self.dispatch_timestamp = dispatch_timestamp - self.batch_size = batch_size - self.parallel = parallel - - def __call__(self, out): - self.parallel.n_completed_tasks += self.batch_size - this_batch_duration = time.time() - self.dispatch_timestamp - - self.parallel._backend.batch_completed(self.batch_size, - this_batch_duration) - self.parallel.print_progress() - with self.parallel._lock: - if self.parallel._original_iterator is not None: - self.parallel.dispatch_next() - - -############################################################################### -def register_parallel_backend(name, factory, make_default=False): - """Register a new Parallel backend factory. - - The new backend can then be selected by passing its name as the backend - argument to the Parallel class. Moreover, the default backend can be - overwritten globally by setting make_default=True. 
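``delayed`` above only captures a call; it never executes the wrapped function itself. A small sketch of the returned triple, with ``add`` as an illustrative function::

    from joblib import delayed

    def add(a, b):
        return a + b

    # Calling the delayed wrapper returns the (function, args, kwargs) triple
    # that Parallel later ships to a worker; add itself is not called here.
    func, args, kwargs = delayed(add)(2, b=3)
    assert func is add
    assert args == (2,)
    assert kwargs == {'b': 3}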
- - The factory can be any callable that takes no argument and return an - instance of ``ParallelBackendBase``. - - Warning: this function is experimental and subject to change in a future - version of joblib. - - .. versionadded:: 0.10 - - """ - BACKENDS[name] = factory - if make_default: - global DEFAULT_BACKEND - DEFAULT_BACKEND = name - - -def effective_n_jobs(n_jobs=-1): - """Determine the number of jobs that can actually run in parallel - - n_jobs is the number of workers requested by the callers. Passing n_jobs=-1 - means requesting all available workers for instance matching the number of - CPU cores on the worker host(s). - - This method should return a guesstimate of the number of workers that can - actually perform work concurrently with the currently enabled default - backend. The primary use case is to make it possible for the caller to know - in how many chunks to slice the work. - - In general working on larger data chunks is more efficient (less scheduling - overhead and better use of CPU cache prefetching heuristics) as long as all - the workers have enough work to do. - - Warning: this function is experimental and subject to change in a future - version of joblib. - - .. versionadded:: 0.10 - - """ - backend, _ = get_active_backend() - return backend.effective_n_jobs(n_jobs=n_jobs) - - -############################################################################### -class Parallel(Logger): - ''' Helper class for readable parallel mapping. - - Read more in the :ref:`User Guide `. - - Parameters - ----------- - n_jobs: int, default: None - The maximum number of concurrently running jobs, such as the number - of Python worker processes when backend="multiprocessing" - or the size of the thread-pool when backend="threading". - If -1 all CPUs are used. If 1 is given, no parallel computing code - is used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all - CPUs but one are used. - None is a marker for 'unset' that will be interpreted as n_jobs=1 - (sequential execution) unless the call is performed under a - parallel_backend context manager that sets another value for - n_jobs. - backend: str, ParallelBackendBase instance or None, default: 'loky' - Specify the parallelization backend implementation. - Supported backends are: - - - "loky" used by default, can induce some - communication and memory overhead when exchanging input and - output data with the worker Python processes. - - "multiprocessing" previous process-based backend based on - `multiprocessing.Pool`. Less robust than `loky`. - - "threading" is a very low-overhead backend but it suffers - from the Python Global Interpreter Lock if the called function - relies a lot on Python objects. "threading" is mostly useful - when the execution bottleneck is a compiled extension that - explicitly releases the GIL (for instance a Cython loop wrapped - in a "with nogil" block or an expensive call to a library such - as NumPy). - - finally, you can register backends by calling - register_parallel_backend. This will allow you to implement - a backend of your liking. - - It is not recommended to hard-code the backend name in a call to - Parallel in a library. Instead it is recommended to set soft hints - (prefer) or hard constraints (require) so as to make it possible - for library users to change the backend from the outside using the - parallel_backend context manager. 
- prefer: str in {'processes', 'threads'} or None, default: None - Soft hint to choose the default backend if no specific backend - was selected with the parallel_backend context manager. The - default process-based backend is 'loky' and the default - thread-based backend is 'threading'. - require: 'sharedmem' or None, default None - Hard constraint to select the backend. If set to 'sharedmem', - the selected backend will be single-host and thread-based even - if the user asked for a non-thread based backend with - parallel_backend. - verbose: int, optional - The verbosity level: if non zero, progress messages are - printed. Above 50, the output is sent to stdout. - The frequency of the messages increases with the verbosity level. - If it more than 10, all iterations are reported. - timeout: float, optional - Timeout limit for each task to complete. If any task takes longer - a TimeOutError will be raised. Only applied when n_jobs != 1 - pre_dispatch: {'all', integer, or expression, as in '3*n_jobs'} - The number of batches (of tasks) to be pre-dispatched. - Default is '2*n_jobs'. When batch_size="auto" this is reasonable - default and the workers should never starve. - batch_size: int or 'auto', default: 'auto' - The number of atomic tasks to dispatch at once to each - worker. When individual evaluations are very fast, dispatching - calls to workers can be slower than sequential computation because - of the overhead. Batching fast computations together can mitigate - this. - The ``'auto'`` strategy keeps track of the time it takes for a batch - to complete, and dynamically adjusts the batch size to keep the time - on the order of half a second, using a heuristic. The initial batch - size is 1. - ``batch_size="auto"`` with ``backend="threading"`` will dispatch - batches of a single task at a time as the threading backend has - very little overhead and using larger batch size has not proved to - bring any gain in that case. - temp_folder: str, optional - Folder to be used by the pool for memmapping large arrays - for sharing memory with worker processes. If None, this will try in - order: - - - a folder pointed by the JOBLIB_TEMP_FOLDER environment - variable, - - /dev/shm if the folder exists and is writable: this is a - RAM disk filesystem available by default on modern Linux - distributions, - - the default system temporary folder that can be - overridden with TMP, TMPDIR or TEMP environment - variables, typically /tmp under Unix operating systems. - - Only active when backend="loky" or "multiprocessing". - max_nbytes int, str, or None, optional, 1M by default - Threshold on the size of arrays passed to the workers that - triggers automated memory mapping in temp_folder. Can be an int - in Bytes, or a human-readable string, e.g., '1M' for 1 megabyte. - Use None to disable memmapping of large arrays. - Only active when backend="loky" or "multiprocessing". - mmap_mode: {None, 'r+', 'r', 'w+', 'c'} - Memmapping mode for numpy arrays passed to workers. - See 'max_nbytes' parameter documentation for more details. - - Notes - ----- - - This object uses workers to compute in parallel the application of a - function to many different arguments. The main functionality it brings - in addition to using the raw multiprocessing or concurrent.futures API - are (see examples for details): - - * More readable code, in particular since it avoids - constructing list of arguments. 
- - * Easier debugging: - - informative tracebacks even when the error happens on - the client side - - using 'n_jobs=1' enables to turn off parallel computing - for debugging without changing the codepath - - early capture of pickling errors - - * An optional progress meter. - - * Interruption of multiprocesses jobs with 'Ctrl-C' - - * Flexible pickling control for the communication to and from - the worker processes. - - * Ability to use shared memory efficiently with worker - processes for large numpy-based datastructures. - - Examples - -------- - - A simple example: - - >>> from math import sqrt - >>> from sklearn.externals.joblib import Parallel, delayed - >>> Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10)) - [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] - - Reshaping the output when the function has several return - values: - - >>> from math import modf - >>> from sklearn.externals.joblib import Parallel, delayed - >>> r = Parallel(n_jobs=1)(delayed(modf)(i/2.) for i in range(10)) - >>> res, i = zip(*r) - >>> res - (0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5) - >>> i - (0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0) - - The progress meter: the higher the value of `verbose`, the more - messages: - - >>> from time import sleep - >>> from sklearn.externals.joblib import Parallel, delayed - >>> r = Parallel(n_jobs=2, verbose=10)(delayed(sleep)(.2) for _ in range(10)) #doctest: +SKIP - [Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.6s - [Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 0.8s - [Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 1.4s finished - - Traceback example, note how the line of the error is indicated - as well as the values of the parameter passed to the function that - triggered the exception, even though the traceback happens in the - child process: - - >>> from heapq import nlargest - >>> from sklearn.externals.joblib import Parallel, delayed - >>> Parallel(n_jobs=2)(delayed(nlargest)(2, n) for n in (range(4), 'abcde', 3)) #doctest: +SKIP - #... - --------------------------------------------------------------------------- - Sub-process traceback: - --------------------------------------------------------------------------- - TypeError Mon Nov 12 11:37:46 2012 - PID: 12934 Python 2.7.3: /usr/bin/python - ........................................................................... - /usr/lib/python2.7/heapq.pyc in nlargest(n=2, iterable=3, key=None) - 419 if n >= size: - 420 return sorted(iterable, key=key, reverse=True)[:n] - 421 - 422 # When key is none, use simpler decoration - 423 if key is None: - --> 424 it = izip(iterable, count(0,-1)) # decorate - 425 result = _nlargest(n, it) - 426 return map(itemgetter(0), result) # undecorate - 427 - 428 # General case, slowest method - TypeError: izip argument #1 must support iteration - ___________________________________________________________________________ - - - Using pre_dispatch in a producer/consumer situation, where the - data is generated on the fly. Note how the producer is first - called 3 times before the parallel loop is initiated, and then - called to generate new data on the fly: - - >>> from math import sqrt - >>> from sklearn.externals.joblib import Parallel, delayed - >>> def producer(): - ... for i in range(6): - ... print('Produced %s' % i) - ... yield i - >>> out = Parallel(n_jobs=2, verbose=100, pre_dispatch='1.5*n_jobs')( - ... 
delayed(sqrt)(i) for i in producer()) #doctest: +SKIP - Produced 0 - Produced 1 - Produced 2 - [Parallel(n_jobs=2)]: Done 1 jobs | elapsed: 0.0s - Produced 3 - [Parallel(n_jobs=2)]: Done 2 jobs | elapsed: 0.0s - Produced 4 - [Parallel(n_jobs=2)]: Done 3 jobs | elapsed: 0.0s - Produced 5 - [Parallel(n_jobs=2)]: Done 4 jobs | elapsed: 0.0s - [Parallel(n_jobs=2)]: Done 6 out of 6 | elapsed: 0.0s remaining: 0.0s - [Parallel(n_jobs=2)]: Done 6 out of 6 | elapsed: 0.0s finished - - ''' - def __init__(self, n_jobs=None, backend=None, verbose=0, timeout=None, - pre_dispatch='2 * n_jobs', batch_size='auto', - temp_folder=None, max_nbytes='1M', mmap_mode='r', - prefer=None, require=None): - active_backend, context_n_jobs = get_active_backend( - prefer=prefer, require=require, verbose=verbose) - if backend is None and n_jobs is None: - # If we are under a parallel_backend context manager, look up - # the default number of jobs and use that instead: - n_jobs = context_n_jobs - if n_jobs is None: - # No specific context override and no specific value request: - # default to 1. - n_jobs = 1 - self.n_jobs = n_jobs - self.verbose = verbose - self.timeout = timeout - self.pre_dispatch = pre_dispatch - - if isinstance(max_nbytes, _basestring): - max_nbytes = memstr_to_bytes(max_nbytes) - - self._backend_args = dict( - max_nbytes=max_nbytes, - mmap_mode=mmap_mode, - temp_folder=temp_folder, - prefer=prefer, - require=require, - verbose=max(0, self.verbose - 50), - ) - if DEFAULT_MP_CONTEXT is not None: - self._backend_args['context'] = DEFAULT_MP_CONTEXT - elif hasattr(mp, "get_context"): - self._backend_args['context'] = mp.get_context() - - if backend is None: - backend = active_backend - elif isinstance(backend, ParallelBackendBase): - # Use provided backend as is - pass - elif hasattr(backend, 'Pool') and hasattr(backend, 'Lock'): - # Make it possible to pass a custom multiprocessing context as - # backend to change the start method to forkserver or spawn or - # preload modules on the forkserver helper process. - self._backend_args['context'] = backend - backend = MultiprocessingBackend() - else: - try: - backend_factory = BACKENDS[backend] - except KeyError: - raise ValueError("Invalid backend: %s, expected one of %r" - % (backend, sorted(BACKENDS.keys()))) - backend = backend_factory() - - if (require == 'sharedmem' and - not getattr(backend, 'supports_sharedmem', False)): - raise ValueError("Backend %s does not support shared memory" - % backend) - - if (batch_size == 'auto' or isinstance(batch_size, Integral) and - batch_size > 0): - self.batch_size = batch_size - else: - raise ValueError( - "batch_size must be 'auto' or a positive integer, got: %r" - % batch_size) - - self._backend = backend - self._output = None - self._jobs = list() - self._managed_backend = False - - # This lock is used coordinate the main thread of this process with - # the async callback thread of our the pool. - self._lock = threading.RLock() - - def __enter__(self): - self._managed_backend = True - self._initialize_backend() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self._terminate_backend() - self._managed_backend = False - - def _initialize_backend(self): - """Build a process or thread pool and return the number of workers""" - try: - n_jobs = self._backend.configure(n_jobs=self.n_jobs, parallel=self, - **self._backend_args) - if self.timeout is not None and not self._backend.supports_timeout: - warnings.warn( - 'The backend class {!r} does not support timeout. 
' - "You have set 'timeout={}' in Parallel but " - "the 'timeout' parameter will not be used.".format( - self._backend.__class__.__name__, - self.timeout)) - - except FallbackToBackend as e: - # Recursively initialize the backend in case of requested fallback. - self._backend = e.backend - n_jobs = self._initialize_backend() - - return n_jobs - - def _effective_n_jobs(self): - if self._backend: - return self._backend.effective_n_jobs(self.n_jobs) - return 1 - - def _terminate_backend(self): - if self._backend is not None: - self._backend.terminate() - - def _dispatch(self, batch): - """Queue the batch for computing, with or without multiprocessing - - WARNING: this method is not thread-safe: it should be only called - indirectly via dispatch_one_batch. - - """ - # If job.get() catches an exception, it closes the queue: - if self._aborting: - return - - self.n_dispatched_tasks += len(batch) - self.n_dispatched_batches += 1 - - dispatch_timestamp = time.time() - cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) - with self._lock: - job_idx = len(self._jobs) - job = self._backend.apply_async(batch, callback=cb) - # A job can complete so quickly than its callback is - # called before we get here, causing self._jobs to - # grow. To ensure correct results ordering, .insert is - # used (rather than .append) in the following line - self._jobs.insert(job_idx, job) - - def dispatch_next(self): - """Dispatch more data for parallel processing - - This method is meant to be called concurrently by the multiprocessing - callback. We rely on the thread-safety of dispatch_one_batch to protect - against concurrent consumption of the unprotected iterator. - - """ - if not self.dispatch_one_batch(self._original_iterator): - self._iterating = False - self._original_iterator = None - - def dispatch_one_batch(self, iterator): - """Prefetch the tasks for the next batch and dispatch them. - - The effective size of the batch is computed here. - If there are no more jobs to dispatch, return False, else return True. - - The iterator consumption and dispatching is protected by the same - lock so calling this function should be thread safe. - - """ - if self.batch_size == 'auto': - batch_size = self._backend.compute_batch_size() - else: - # Fixed batch size strategy - batch_size = self.batch_size - - with self._lock: - tasks = BatchedCalls(itertools.islice(iterator, batch_size), - self._backend.get_nested_backend(), - self._pickle_cache) - if len(tasks) == 0: - # No more tasks available in the iterator: tell caller to stop. - return False - else: - self._dispatch(tasks) - return True - - def _print(self, msg, msg_args): - """Display the message on stout or stderr depending on verbosity""" - # XXX: Not using the logger framework: need to - # learn to use logger better. - if not self.verbose: - return - if self.verbose < 50: - writer = sys.stderr.write - else: - writer = sys.stdout.write - msg = msg % msg_args - writer('[%s]: %s\n' % (self, msg)) - - def print_progress(self): - """Display the process of the parallel execution only a fraction - of time, controlled by self.verbose. - """ - if not self.verbose: - return - elapsed_time = time.time() - self._start_time - - # Original job iterator becomes None once it has been fully - # consumed : at this point we know the total number of jobs and we are - # able to display an estimation of the remaining time based on already - # completed jobs. Otherwise, we simply display the number of completed - # tasks. 
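The batching and progress-reporting logic shown here is the same code that now lives in the site joblib, so its behaviour can be exercised directly against it. A small sketch, assuming the default loky backend (the ``quick`` helper is only an illustration)::

    import time

    from joblib import Parallel, delayed

    def quick(i):
        time.sleep(0.01)
        return i

    # dispatch_one_batch slices batch_size tasks off the iterator per dispatch,
    # so the 20 calls are shipped to the workers in batches of 4 rather than
    # one message per call; verbose=5 prints progress messages to stderr as
    # batches complete.
    out = Parallel(n_jobs=2, batch_size=4, verbose=5)(
        delayed(quick)(i) for i in range(20))
    assert out == list(range(20))  # results come back in the input order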
- if self._original_iterator is not None: - if _verbosity_filter(self.n_dispatched_batches, self.verbose): - return - self._print('Done %3i tasks | elapsed: %s', - (self.n_completed_tasks, - short_format_time(elapsed_time), )) - else: - index = self.n_completed_tasks - # We are finished dispatching - total_tasks = self.n_dispatched_tasks - # We always display the first loop - if not index == 0: - # Display depending on the number of remaining items - # A message as soon as we finish dispatching, cursor is 0 - cursor = (total_tasks - index + 1 - - self._pre_dispatch_amount) - frequency = (total_tasks // self.verbose) + 1 - is_last_item = (index + 1 == total_tasks) - if (is_last_item or cursor % frequency): - return - remaining_time = (elapsed_time / index) * \ - (self.n_dispatched_tasks - index * 1.0) - # only display status if remaining time is greater or equal to 0 - self._print('Done %3i out of %3i | elapsed: %s remaining: %s', - (index, - total_tasks, - short_format_time(elapsed_time), - short_format_time(remaining_time), - )) - - def retrieve(self): - self._output = list() - while self._iterating or len(self._jobs) > 0: - if len(self._jobs) == 0: - # Wait for an async callback to dispatch new jobs - time.sleep(0.01) - continue - # We need to be careful: the job list can be filling up as - # we empty it and Python list are not thread-safe by default hence - # the use of the lock - with self._lock: - job = self._jobs.pop(0) - - try: - if getattr(self._backend, 'supports_timeout', False): - self._output.extend(job.get(timeout=self.timeout)) - else: - self._output.extend(job.get()) - - except BaseException as exception: - # Note: we catch any BaseException instead of just Exception - # instances to also include KeyboardInterrupt. - - # Stop dispatching any new job in the async callback thread - self._aborting = True - - # If the backend allows it, cancel or kill remaining running - # tasks without waiting for the results as we will raise - # the exception we got back to the caller instead of returning - # any result. - backend = self._backend - if (backend is not None and - hasattr(backend, 'abort_everything')): - # If the backend is managed externally we need to make sure - # to leave it in a working state to allow for future jobs - # scheduling. 
- ensure_ready = self._managed_backend - backend.abort_everything(ensure_ready=ensure_ready) - - if isinstance(exception, TransportableException): - # Capture exception to add information on the local - # stack in addition to the distant stack - this_report = format_outer_frames(context=10, - stack_start=1) - raise exception.unwrap(this_report) - else: - raise - - def __call__(self, iterable): - if self._jobs: - raise ValueError('This Parallel instance is already running') - # A flag used to abort the dispatching of jobs in case an - # exception is found - self._aborting = False - - if not self._managed_backend: - n_jobs = self._initialize_backend() - else: - n_jobs = self._effective_n_jobs() - self._print("Using backend %s with %d concurrent workers.", - (self._backend.__class__.__name__, n_jobs)) - if hasattr(self._backend, 'start_call'): - self._backend.start_call() - iterator = iter(iterable) - pre_dispatch = self.pre_dispatch - - if pre_dispatch == 'all' or n_jobs == 1: - # prevent further dispatch via multiprocessing callback thread - self._original_iterator = None - self._pre_dispatch_amount = 0 - else: - self._original_iterator = iterator - if hasattr(pre_dispatch, 'endswith'): - pre_dispatch = eval(pre_dispatch) - self._pre_dispatch_amount = pre_dispatch = int(pre_dispatch) - - # The main thread will consume the first pre_dispatch items and - # the remaining items will later be lazily dispatched by async - # callbacks upon task completions. - iterator = itertools.islice(iterator, pre_dispatch) - - self._start_time = time.time() - self.n_dispatched_batches = 0 - self.n_dispatched_tasks = 0 - self.n_completed_tasks = 0 - # Use a caching dict for callables that are pickled with cloudpickle to - # improve performances. This cache is used only in the case of - # functions that are defined in the __main__ module, functions that are - # defined locally (inside another function) and lambda expressions. - self._pickle_cache = dict() - try: - # Only set self._iterating to True if at least a batch - # was dispatched. In particular this covers the edge - # case of Parallel used with an exhausted iterator. If - # self._original_iterator is None, then this means either - # that pre_dispatch == "all", n_jobs == 1 or that the first batch - # was very quick and its callback already dispatched all the - # remaining jobs. - self._iterating = False - if self.dispatch_one_batch(iterator): - self._iterating = self._original_iterator is not None - - while self.dispatch_one_batch(iterator): - pass - - if pre_dispatch == "all" or n_jobs == 1: - # The iterable was consumed all at once by the above for loop. - # No need to wait for async callbacks to trigger to - # consumption. 
- self._iterating = False - - with self._backend.retrieval_context(): - self.retrieve() - # Make sure that we get a last message telling us we are done - elapsed_time = time.time() - self._start_time - self._print('Done %3i out of %3i | elapsed: %s finished', - (len(self._output), len(self._output), - short_format_time(elapsed_time))) - finally: - if hasattr(self._backend, 'stop_call'): - self._backend.stop_call() - if not self._managed_backend: - self._terminate_backend() - self._jobs = list() - self._pickle_cache = None - output = self._output - self._output = None - return output - - def __repr__(self): - return '%s(n_jobs=%s)' % (self.__class__.__name__, self.n_jobs) diff --git a/sklearn/externals/joblib/pool.py b/sklearn/externals/joblib/pool.py deleted file mode 100644 index 396a3dfb4efcc..0000000000000 --- a/sklearn/externals/joblib/pool.py +++ /dev/null @@ -1,329 +0,0 @@ -"""Custom implementation of multiprocessing.Pool with custom pickler. - -This module provides efficient ways of working with data stored in -shared memory with numpy.memmap arrays without inducing any memory -copy between the parent and child processes. - -This module should not be imported if multiprocessing is not -available as it implements subclasses of multiprocessing Pool -that uses a custom alternative to SimpleQueue. - -""" -# Author: Olivier Grisel -# Copyright: 2012, Olivier Grisel -# License: BSD 3 clause - -import sys -import warnings -from time import sleep - -try: - WindowsError -except NameError: - WindowsError = type(None) - -# Customizable pure Python pickler in Python 2 -# customizable C-optimized pickler under Python 3.3+ -from pickle import Pickler - -from pickle import HIGHEST_PROTOCOL -from io import BytesIO - -from .disk import delete_folder -from ._memmapping_reducer import get_memmapping_reducers -from ._multiprocessing_helpers import mp, assert_spawning - -# We need the class definition to derive from it, not the multiprocessing.Pool -# factory function -from multiprocessing.pool import Pool - -try: - import numpy as np -except ImportError: - np = None - -if sys.version_info[:2] > (2, 7): - import copyreg - - -############################################################################### -# Enable custom pickling in Pool queues - -class CustomizablePickler(Pickler): - """Pickler that accepts custom reducers. - - HIGHEST_PROTOCOL is selected by default as this pickler is used - to pickle ephemeral datastructures for interprocess communication - hence no backward compatibility is required. - - `reducers` is expected to be a dictionary with key/values - being `(type, callable)` pairs where `callable` is a function that - give an instance of `type` will return a tuple `(constructor, - tuple_of_objects)` to rebuild an instance out of the pickled - `tuple_of_objects` as would return a `__reduce__` method. See the - standard library documentation on pickling for more details. - - """ - - # We override the pure Python pickler as its the only way to be able to - # customize the dispatch table without side effects in Python 2.7 - # to 3.2. For Python 3.3+ leverage the new dispatch_table - # feature from http://bugs.python.org/issue14166 that makes it possible - # to use the C implementation of the Pickler which is faster. 
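The customization hook relied on here is plain standard-library pickling. A standalone sketch of the same idea on Python 3, with a hypothetical ``Point`` class standing in for the memmap-backed types joblib actually registers::

    import copyreg
    import io
    import pickle

    class Point:
        def __init__(self, x, y):
            self.x, self.y = x, y

    def reduce_point(p):
        # (constructor, tuple_of_arguments), exactly what a __reduce__ method would return
        return Point, (p.x, p.y)

    buf = io.BytesIO()
    pickler = pickle.Pickler(buf, protocol=pickle.HIGHEST_PROTOCOL)
    # Instance-level dispatch table: the reducer applies to this pickler only,
    # without touching the global copyreg registry.
    pickler.dispatch_table = copyreg.dispatch_table.copy()
    pickler.dispatch_table[Point] = reduce_point
    pickler.dump(Point(1, 2))

    restored = pickle.loads(buf.getvalue())
    print(restored.x, restored.y)  # 1 2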
- - def __init__(self, writer, reducers=None, protocol=HIGHEST_PROTOCOL): - Pickler.__init__(self, writer, protocol=protocol) - if reducers is None: - reducers = {} - if hasattr(Pickler, 'dispatch'): - # Make the dispatch registry an instance level attribute instead of - # a reference to the class dictionary under Python 2 - self.dispatch = Pickler.dispatch.copy() - else: - # Under Python 3 initialize the dispatch table with a copy of the - # default registry - self.dispatch_table = copyreg.dispatch_table.copy() - for type, reduce_func in reducers.items(): - self.register(type, reduce_func) - - def register(self, type, reduce_func): - """Attach a reducer function to a given type in the dispatch table.""" - if hasattr(Pickler, 'dispatch'): - # Python 2 pickler dispatching is not explicitly customizable. - # Let us use a closure to workaround this limitation. - def dispatcher(self, obj): - reduced = reduce_func(obj) - self.save_reduce(obj=obj, *reduced) - self.dispatch[type] = dispatcher - else: - self.dispatch_table[type] = reduce_func - - -class CustomizablePicklingQueue(object): - """Locked Pipe implementation that uses a customizable pickler. - - This class is an alternative to the multiprocessing implementation - of SimpleQueue in order to make it possible to pass custom - pickling reducers, for instance to avoid memory copy when passing - memory mapped datastructures. - - `reducers` is expected to be a dict with key / values being - `(type, callable)` pairs where `callable` is a function that, given an - instance of `type`, will return a tuple `(constructor, tuple_of_objects)` - to rebuild an instance out of the pickled `tuple_of_objects` as would - return a `__reduce__` method. - - See the standard library documentation on pickling for more details. - """ - - def __init__(self, context, reducers=None): - self._reducers = reducers - self._reader, self._writer = context.Pipe(duplex=False) - self._rlock = context.Lock() - if sys.platform == 'win32': - self._wlock = None - else: - self._wlock = context.Lock() - self._make_methods() - - def __getstate__(self): - assert_spawning(self) - return (self._reader, self._writer, self._rlock, self._wlock, - self._reducers) - - def __setstate__(self, state): - (self._reader, self._writer, self._rlock, self._wlock, - self._reducers) = state - self._make_methods() - - def empty(self): - return not self._reader.poll() - - def _make_methods(self): - self._recv = recv = self._reader.recv - racquire, rrelease = self._rlock.acquire, self._rlock.release - - def get(): - racquire() - try: - return recv() - finally: - rrelease() - - self.get = get - - if self._reducers: - def send(obj): - buffer = BytesIO() - CustomizablePickler(buffer, self._reducers).dump(obj) - self._writer.send_bytes(buffer.getvalue()) - self._send = send - else: - self._send = send = self._writer.send - if self._wlock is None: - # writes to a message oriented win32 pipe are atomic - self.put = send - else: - wlock_acquire, wlock_release = ( - self._wlock.acquire, self._wlock.release) - - def put(obj): - wlock_acquire() - try: - return send(obj) - finally: - wlock_release() - - self.put = put - - -class PicklingPool(Pool): - """Pool implementation with customizable pickling reducers. - - This is useful to control how data is shipped between processes - and makes it possible to use shared memory without useless - copies induces by the default pickling methods of the original - objects passed as arguments to dispatch. 
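On the user side, this reducer plumbing is what lets ``Parallel`` hand large arrays to workers as memory maps instead of copies. A short sketch against the site joblib, assuming numpy is available and the default loky backend with the default ``max_nbytes='1M'`` threshold::

    import numpy as np

    from joblib import Parallel, delayed

    def is_memmap(arr):
        # Inside the worker the array has been dumped to a temporary file and
        # reloaded as a numpy.memmap rather than shipped as a pickled copy.
        return isinstance(arr, np.memmap)

    data = np.random.rand(int(1e6))  # ~8 MB of float64, well above the 1M threshold

    print(Parallel(n_jobs=2)(delayed(is_memmap)(data) for _ in range(3)))
    # [True, True, True]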
- - `forward_reducers` and `backward_reducers` are expected to be - dictionaries with key/values being `(type, callable)` pairs where - `callable` is a function that, given an instance of `type`, will return a - tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the - pickled `tuple_of_objects` as would return a `__reduce__` method. - See the standard library documentation about pickling for more details. - - """ - - def __init__(self, processes=None, forward_reducers=None, - backward_reducers=None, **kwargs): - if forward_reducers is None: - forward_reducers = dict() - if backward_reducers is None: - backward_reducers = dict() - self._forward_reducers = forward_reducers - self._backward_reducers = backward_reducers - poolargs = dict(processes=processes) - poolargs.update(kwargs) - super(PicklingPool, self).__init__(**poolargs) - - def _setup_queues(self): - context = getattr(self, '_ctx', mp) - self._inqueue = CustomizablePicklingQueue(context, - self._forward_reducers) - self._outqueue = CustomizablePicklingQueue(context, - self._backward_reducers) - self._quick_put = self._inqueue._send - self._quick_get = self._outqueue._recv - - -class MemmappingPool(PicklingPool): - """Process pool that shares large arrays to avoid memory copy. - - This drop-in replacement for `multiprocessing.pool.Pool` makes - it possible to work efficiently with shared memory in a numpy - context. - - Existing instances of numpy.memmap are preserved: the child - suprocesses will have access to the same shared memory in the - original mode except for the 'w+' mode that is automatically - transformed as 'r+' to avoid zeroing the original data upon - instantiation. - - Furthermore large arrays from the parent process are automatically - dumped to a temporary folder on the filesystem such as child - processes to access their content via memmapping (file system - backed shared memory). - - Note: it is important to call the terminate method to collect - the temporary folder used by the pool. - - Parameters - ---------- - processes: int, optional - Number of worker processes running concurrently in the pool. - initializer: callable, optional - Callable executed on worker process creation. - initargs: tuple, optional - Arguments passed to the initializer callable. - temp_folder: str, optional - Folder to be used by the pool for memmapping large arrays - for sharing memory with worker processes. If None, this will try in - order: - - a folder pointed by the JOBLIB_TEMP_FOLDER environment variable, - - /dev/shm if the folder exists and is writable: this is a RAMdisk - filesystem available by default on modern Linux distributions, - - the default system temporary folder that can be overridden - with TMP, TMPDIR or TEMP environment variables, typically /tmp - under Unix operating systems. - max_nbytes int or None, optional, 1e6 by default - Threshold on the size of arrays passed to the workers that - triggers automated memory mapping in temp_folder. - Use None to disable memmapping of large arrays. - mmap_mode: {'r+', 'r', 'w+', 'c'} - Memmapping mode for numpy arrays passed to workers. - See 'max_nbytes' parameter documentation for more details. - forward_reducers: dictionary, optional - Reducers used to pickle objects passed from master to worker - processes: see below. - backward_reducers: dictionary, optional - Reducers used to pickle return values from workers back to the - master process. 
- verbose: int, optional - Make it possible to monitor how the communication of numpy arrays - with the subprocess is handled (pickling or memmapping) - prewarm: bool or str, optional, "auto" by default. - If True, force a read on newly memmapped array to make sure that OS - pre-cache it in memory. This can be useful to avoid concurrent disk - access when the same data array is passed to different worker - processes. If "auto" (by default), prewarm is set to True, unless the - Linux shared memory partition /dev/shm is available and used as temp - folder. - - `forward_reducers` and `backward_reducers` are expected to be - dictionaries with key/values being `(type, callable)` pairs where - `callable` is a function that give an instance of `type` will return - a tuple `(constructor, tuple_of_objects)` to rebuild an instance out - of the pickled `tuple_of_objects` as would return a `__reduce__` - method. See the standard library documentation on pickling for more - details. - - """ - - def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6, - mmap_mode='r', forward_reducers=None, backward_reducers=None, - verbose=0, context_id=None, prewarm=False, **kwargs): - - if context_id is not None: - warnings.warn('context_id is deprecated and ignored in joblib' - ' 0.9.4 and will be removed in 0.11', - DeprecationWarning) - - forward_reducers, backward_reducers, self._temp_folder = \ - get_memmapping_reducers( - id(self), temp_folder=temp_folder, max_nbytes=max_nbytes, - mmap_mode=mmap_mode, forward_reducers=forward_reducers, - backward_reducers=backward_reducers, verbose=verbose, - prewarm=prewarm) - - poolargs = dict( - processes=processes, - forward_reducers=forward_reducers, - backward_reducers=backward_reducers) - poolargs.update(kwargs) - super(MemmappingPool, self).__init__(**poolargs) - - def terminate(self): - n_retries = 10 - for i in range(n_retries): - try: - super(MemmappingPool, self).terminate() - break - except OSError as e: - if isinstance(e, WindowsError): - # Workaround occasional "[Error 5] Access is denied" issue - # when trying to terminate a process under windows. - sleep(0.1) - if i + 1 == n_retries: - warnings.warn("Failed to terminate worker processes in" - " multiprocessing pool: %r" % e) - delete_folder(self._temp_folder) diff --git a/sklearn/externals/joblib/testing.py b/sklearn/externals/joblib/testing.py deleted file mode 100644 index 5426c6338651b..0000000000000 --- a/sklearn/externals/joblib/testing.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Helper for testing. -""" - -import sys -import warnings -import os.path -import re -import subprocess -import threading - -import pytest -import _pytest - -from sklearn.externals.joblib._compat import PY3_OR_LATER - - -raises = pytest.raises -warns = pytest.warns -SkipTest = _pytest.runner.Skipped -skipif = pytest.mark.skipif -fixture = pytest.fixture -parametrize = pytest.mark.parametrize -timeout = pytest.mark.timeout - - -def warnings_to_stdout(): - """ Redirect all warnings to stdout. - """ - showwarning_orig = warnings.showwarning - - def showwarning(msg, cat, fname, lno, file=None, line=0): - showwarning_orig(msg, cat, os.path.basename(fname), line, sys.stdout) - - warnings.showwarning = showwarning - # warnings.simplefilter('always') - - -def check_subprocess_call(cmd, timeout=5, stdout_regex=None, - stderr_regex=None): - """Runs a command in a subprocess with timeout in seconds. - - Also checks returncode is zero, stdout if stdout_regex is set, and - stderr if stderr_regex is set. 
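An equivalent helper ships with the site joblib (``joblib.testing.check_subprocess_call`` at the time of writing), so tests that relied on the vendored copy can switch to it directly. A minimal usage sketch::

    import sys

    from joblib.testing import check_subprocess_call

    # Passes: the child exits with code 0 and its stdout matches the regex.
    check_subprocess_call(
        [sys.executable, '-c', "print('hello from the child')"],
        timeout=10, stdout_regex='hello')

    # A non-zero exit code (or a timeout, or a regex mismatch) raises ValueError:
    # check_subprocess_call([sys.executable, '-c', 'raise SystemExit(1)'], timeout=10)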
- """ - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - def kill_process(): - warnings.warn("Timeout running {}".format(cmd)) - proc.kill() - - timer = threading.Timer(timeout, kill_process) - try: - timer.start() - stdout, stderr = proc.communicate() - - if PY3_OR_LATER: - stdout, stderr = stdout.decode(), stderr.decode() - if proc.returncode != 0: - message = ( - 'Non-zero return code: {}.\nStdout:\n{}\n' - 'Stderr:\n{}').format( - proc.returncode, stdout, stderr) - raise ValueError(message) - - if (stdout_regex is not None and - not re.search(stdout_regex, stdout)): - raise ValueError( - "Unexpected stdout: {!r} does not match:\n{!r}".format( - stdout_regex, stdout)) - if (stderr_regex is not None and - not re.search(stderr_regex, stderr)): - raise ValueError( - "Unexpected stderr: {!r} does not match:\n{!r}".format( - stderr_regex, stderr)) - - finally: - timer.cancel() diff --git a/sklearn/externals/setup.py b/sklearn/externals/setup.py index 452f7d25d071c..936f0327226d6 100644 --- a/sklearn/externals/setup.py +++ b/sklearn/externals/setup.py @@ -5,9 +5,5 @@ def configuration(parent_package='', top_path=None): from numpy.distutils.misc_util import Configuration config = Configuration('externals', parent_package, top_path) config.add_subpackage('joblib') - config.add_subpackage('joblib/externals') - config.add_subpackage('joblib/externals/loky') - config.add_subpackage('joblib/externals/loky/backend') - config.add_subpackage('joblib/externals/cloudpickle') return config diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py index d34f9e23d9fc9..d2cd677a4b163 100644 --- a/sklearn/tests/test_site_joblib.py +++ b/sklearn/tests/test_site_joblib.py @@ -1,20 +1,10 @@ import os import pytest -from sklearn import externals -from sklearn.externals import joblib as joblib_vendored from sklearn.utils._joblib import Parallel, delayed, Memory, parallel_backend -if os.environ.get('SKLEARN_SITE_JOBLIB', False): - import joblib as joblib_site -else: - joblib_site = None - def test_old_pickle(tmpdir): - vendored_joblib_home = os.path.dirname(joblib_vendored.__file__) - sklearn_externals_home = os.path.dirname(externals.__file__) - if not vendored_joblib_home.startswith(sklearn_externals_home): - pytest.skip("joblib is physically unvendored (e.g. 
as in debian)") + import joblib # Check that a pickle that references sklearn.external.joblib can load f = tmpdir.join('foo.pkl') @@ -26,24 +16,4 @@ def test_old_pickle(tmpdir): b'\x0fU\nallow_mmapq\x10\x88ub\x01\x00\x00\x00\x00\x00\x00\x00.', mode='wb') - joblib_vendored.load(str(f)) - - -def test_site_joblib_dispatch(): - if os.environ.get('SKLEARN_SITE_JOBLIB', False): - assert Parallel is joblib_site.Parallel - assert delayed is joblib_site.delayed - assert parallel_backend is joblib_site.parallel_backend - assert Memory is joblib_site.Memory - - assert joblib_vendored.Parallel is not joblib_site.Parallel - assert joblib_vendored.delayed is not joblib_site.delayed - assert joblib_vendored.parallel_backend is not \ - joblib_site.parallel_backend - assert joblib_vendored.Memory is not joblib_site.Memory - - else: - assert Parallel is joblib_vendored.Parallel - assert delayed is joblib_vendored.delayed - assert parallel_backend is joblib_vendored.parallel_backend - assert Memory is joblib_vendored.Memory + joblib.load(str(f)) diff --git a/sklearn/utils/_joblib.py b/sklearn/utils/_joblib.py index 0cf6519aea5a0..6bbebb6c1445a 100644 --- a/sklearn/utils/_joblib.py +++ b/sklearn/utils/_joblib.py @@ -1,31 +1,18 @@ -# We need the absolute_import to avoid the local joblib to override the -# site one import os as _os import warnings as _warnings -# An environment variable to use the site joblib -if _os.environ.get('SKLEARN_SITE_JOBLIB', False): - with _warnings.catch_warnings(): - _warnings.simplefilter("ignore") - # joblib imports may raise DeprecationWarning on certain Python - # versions - import joblib - from joblib import logger - from joblib import dump, load - from joblib import __version__ - from joblib import effective_n_jobs - from joblib import hash - from joblib import cpu_count, Parallel, Memory, delayed - from joblib import parallel_backend, register_parallel_backend -else: - from ..externals import joblib - from ..externals.joblib import logger - from ..externals.joblib import dump, load - from ..externals.joblib import __version__ - from ..externals.joblib import effective_n_jobs - from ..externals.joblib import hash - from ..externals.joblib import cpu_count, Parallel, Memory, delayed - from ..externals.joblib import parallel_backend, register_parallel_backend +with _warnings.catch_warnings(): + _warnings.simplefilter("ignore") + # joblib imports may raise DeprecationWarning on certain Python + # versions + import joblib + from joblib import logger + from joblib import dump, load + from joblib import __version__ + from joblib import effective_n_jobs + from joblib import hash + from joblib import cpu_count, Parallel, Memory, delayed + from joblib import parallel_backend, register_parallel_backend __all__ = ["parallel_backend", "register_parallel_backend", "cpu_count",