MAINT Unvendor joblib (scikit-learn#13531)
rth authored and NicolasHug committed Apr 17, 2019
1 parent 7243cc3 commit fc33d30
Showing 67 changed files with 61 additions and 14,071 deletions.
1 change: 1 addition & 0 deletions README.rst
@@ -50,6 +50,7 @@ scikit-learn requires:
 - Python (>= 3.5)
 - NumPy (>= 1.11.0)
 - SciPy (>= 0.17.0)
+- joblib (>= 0.11)
 
 **Scikit-learn 0.20 was the last version to support Python2.7.**
 Scikit-learn 0.21 and later require Python 3.5 or newer.
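
Since joblib now ships as a regular dependency instead of a vendored copy, a
quick import is enough to check that an environment meets the new floor (a
minimal sketch; the version bound mirrors the README entry above):

    # Verify the unvendored joblib dependency is importable and recent enough.
    import joblib
    print(joblib.__version__)  # should report >= 0.11
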
5 changes: 2 additions & 3 deletions azure-pipelines.yml
@@ -11,7 +11,6 @@ jobs:
   py35_np_atlas:
     DISTRIB: 'ubuntu'
     PYTHON_VERSION: '3.5'
-    SKLEARN_SITE_JOBLIB: '1'
     JOBLIB_VERSION: '0.11'
     SKLEARN_NO_OPENMP: 'True'
   # Linux + Python 3.5 build with OpenBLAS and without SITE_JOBLIB
@@ -23,10 +22,11 @@ jobs:
     SCIPY_VERSION: '0.17.0'
     CYTHON_VERSION: '*'
     PILLOW_VERSION: '4.0.0'
+    # later versions of joblib are not packaged in conda for Python 3.5
+    JOBLIB_VERSION: '0.12.3'
     COVERAGE: 'true'
   # Linux environment to test the latest available dependencies and MKL.
   # It runs tests requiring pandas and PyAMG.
-  # It also runs with the site joblib instead of the vendored copy of joblib.
   pylatest_conda:
     DISTRIB: 'conda'
     PYTHON_VERSION: '*'
@@ -41,7 +41,6 @@ jobs:
     COVERAGE: 'true'
     CHECK_PYTEST_SOFT_DEPENDENCY: 'true'
     TEST_DOCSTRINGS: 'true'
-    SKLEARN_SITE_JOBLIB: '1'
     CHECK_WARNINGS: 'true'
 
 - template: build_tools/azure/posix.yml
4 changes: 2 additions & 2 deletions build_tools/azure/install.cmd
@@ -11,11 +11,11 @@ IF "%PYTHON_ARCH%"=="64" (
     call deactivate
     @rem Clean up any left-over from a previous build
     conda remove --all -q -y -n %VIRTUALENV%
-    conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow
+    conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow joblib
 
     call activate %VIRTUALENV%
 ) else (
-    pip install numpy scipy cython pytest wheel pillow
+    pip install numpy scipy cython pytest wheel pillow joblib
 )
 if "%COVERAGE%" == "true" (
     pip install coverage codecov pytest-cov
6 changes: 1 addition & 5 deletions build_tools/azure/install.sh
@@ -27,7 +27,7 @@ make_conda() {
 if [[ "$DISTRIB" == "conda" ]]; then
     TO_INSTALL="python=$PYTHON_VERSION pip pytest pytest-cov \
                 numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \
-                cython=$CYTHON_VERSION"
+                cython=$CYTHON_VERSION joblib=$JOBLIB_VERSION"
 
     if [[ "$INSTALL_MKL" == "true" ]]; then
         TO_INSTALL="$TO_INSTALL mkl"
@@ -47,10 +47,6 @@ if [[ "$DISTRIB" == "conda" ]]; then
         TO_INSTALL="$TO_INSTALL pillow=$PILLOW_VERSION"
     fi
 
-    if [[ -n "$JOBLIB_VERSION" ]]; then
-        TO_INSTALL="$TO_INSTALL joblib=$JOBLIB_VERSION"
-    fi
-
     make_conda $TO_INSTALL
 
 elif [[ "$DISTRIB" == "ubuntu" ]]; then
1 change: 0 additions & 1 deletion build_tools/travis/install.sh
@@ -113,7 +113,6 @@ elif [[ "$DISTRIB" == "scipy-dev" ]]; then
     pip install --pre --upgrade --timeout=60 -f $dev_url numpy scipy pandas cython
     echo "Installing joblib master"
     pip install https://github.com/joblib/joblib/archive/master.zip
-    export SKLEARN_SITE_JOBLIB=1
     echo "Installing pillow master"
     pip install https://github.com/python-pillow/Pillow/archive/master.zip
     pip install pytest pytest-cov
10 changes: 10 additions & 0 deletions conftest.py
@@ -61,3 +61,13 @@ def pytest_collection_modifyitems(config, items):
     for item in items:
         if isinstance(item, DoctestItem):
             item.add_marker(skip_marker)
+
+
+def pytest_configure(config):
+    import sys
+    sys._is_pytest_session = True
+
+
+def pytest_unconfigure(config):
+    import sys
+    del sys._is_pytest_session
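
The flag installed here is what the rewritten sklearn/externals/joblib/__init__.py
later in this diff checks before emitting its DeprecationWarning, so the test
suite can exercise the legacy import path without drowning in warnings. A
standalone sketch of the same pattern (the function name is illustrative):

    import sys
    import warnings

    def warn_unless_under_pytest():
        # conftest.py sets sys._is_pytest_session for the duration of the
        # test session; outside pytest the attribute does not exist, so
        # regular users still see the deprecation warning.
        if not hasattr(sys, "_is_pytest_session"):
            warnings.warn("deprecated import path", DeprecationWarning)
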
5 changes: 5 additions & 0 deletions doc/modules/computing.rst
@@ -553,6 +553,11 @@ These environment variables should be set before importing scikit-learn.
    is supported. In addition, dumps from joblib.Memory might be incompatible,
    and you might lose some caches and have to redownload some datasets.
 
+   .. deprecated:: 0.21
+
+      As of version 0.21 this parameter has no effect: the vendored joblib
+      was removed and the site joblib is always used.
+
 :SKLEARN_ASSUME_FINITE:
 
     Sets the default value for the `assume_finite` argument of
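
With the vendored copy gone, the legacy path is a thin re-export of the
installed joblib, which is easy to confirm interactively (a sketch; the legacy
import emits a DeprecationWarning outside of pytest):

    import joblib
    from sklearn.externals import joblib as legacy_joblib

    # Both names now resolve to the same installed package.
    print(legacy_joblib.Parallel is joblib.Parallel)  # True
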
10 changes: 10 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -634,6 +634,16 @@ Multiple modules
    :issue:`13422` by :user:`Madhura Parikh <jdnc>` and
    :user:`Clément Doumouro <ClemDoum>`.
 
+
+Dependencies
+............
+
+- |Enhancement| Joblib is no longer vendored in scikit-learn and becomes a
+  dependency. The minimal supported version is joblib 0.11; however, using
+  version >= 0.13 is strongly recommended.
+  :issue:`13531` by :user:`Roman Yurchak <rth>`.
+
+
 Changes to estimator checks
 ---------------------------
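
For downstream code, the practical change is a one-line import migration (a
sketch; the legacy path keeps working until 0.23, but with a warning):

    # Before (deprecated in 0.21, scheduled for removal in 0.23):
    # from sklearn.externals.joblib import Parallel, delayed

    # After (requires: pip install "joblib>=0.11"):
    from joblib import Parallel, delayed

    results = Parallel(n_jobs=2)(delayed(abs)(-i) for i in range(5))
    print(results)  # [0, 1, 2, 3, 4]
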
4 changes: 3 additions & 1 deletion setup.py
@@ -57,6 +57,7 @@
 SCIPY_MIN_VERSION = '0.17.0'
 NUMPY_MIN_VERSION = '1.11.0'
 
+JOBLIB_MIN_VERSION = '0.11'
 
 # Optional setuptools features
 # We need to import setuptools early, if we want setuptools features,
@@ -226,7 +227,8 @@ def setup_package():
         cmdclass=cmdclass,
         install_requires=[
             'numpy>={}'.format(NUMPY_MIN_VERSION),
-            'scipy>={}'.format(SCIPY_MIN_VERSION)
+            'scipy>={}'.format(SCIPY_MIN_VERSION),
+            'joblib>={}'.format(JOBLIB_MIN_VERSION)
         ],
         **extra_setuptools_args)
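
Once a build carrying this metadata is installed, the declared floor is
visible at runtime; a sketch using pkg_resources (the usual tool on the
Python versions scikit-learn supported in 2019):

    # List the runtime requirements declared by the installed package.
    import pkg_resources

    dist = pkg_resources.get_distribution("scikit-learn")
    print([str(req) for req in dist.requires()])
    # expected to include 'joblib>=0.11' alongside the numpy and scipy pins
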
3 changes: 0 additions & 3 deletions sklearn/externals/README
@@ -1,9 +1,6 @@
 This directory contains bundled external dependencies that are updated
 every once in a while.
 
-Note to developers and advanced users: setting the SKLEARN_SITE_JOBLIB to
-a non null value will force scikit-learn to use the site joblib.
-
 Note for distribution packagers: if you want to remove the duplicated
 code and depend on a packaged version, we suggest that you simply do a
 symbolic link in this directory.
26 changes: 0 additions & 26 deletions sklearn/externals/copy_joblib.sh

This file was deleted.

140 changes: 11 additions & 129 deletions sklearn/externals/joblib/__init__.py
@@ -1,133 +1,15 @@
-"""Joblib is a set of tools to provide **lightweight pipelining in
-Python**. In particular:
-
-1. transparent disk-caching of functions and lazy re-evaluation
-   (memoize pattern)
-
-2. easy simple parallel computing
-
-Joblib is optimized to be **fast** and **robust** in particular on large
-data and has specific optimizations for `numpy` arrays. It is
-**BSD-licensed**.
-
-    ==================== ===============================================
-    **Documentation:**   https://joblib.readthedocs.io
-    **Download:**        http://pypi.python.org/pypi/joblib#downloads
-    **Source code:**     http://github.com/joblib/joblib
-    **Report issues:**   http://github.com/joblib/joblib/issues
-    ==================== ===============================================
-
-Vision
---------
-
-The vision is to provide tools to easily achieve better performance and
-reproducibility when working with long running jobs.
-
-* **Avoid computing twice the same thing**: code is rerun over and over,
-  for instance when prototyping computational-heavy jobs (as in
-  scientific development), but hand-crafted solutions to alleviate this
-  issue are error-prone and often lead to unreproducible results
-
-* **Persist to disk transparently**: persisting in an efficient way
-  arbitrary objects containing large data is hard. Using
-  joblib's caching mechanism avoids hand-written persistence and
-  implicitly links the file on disk to the execution context of
-  the original Python object. As a result, joblib's persistence is
-  good for resuming an application status or computational job, eg
-  after a crash.
-
-Joblib addresses these problems while **leaving your code and your flow
-control as unmodified as possible** (no framework, no new paradigms).
-
-Main features
-------------------
-
-1) **Transparent and fast disk-caching of output value:** a memoize or
-   make-like functionality for Python functions that works well for
-   arbitrary Python objects, including very large numpy arrays. Separate
-   persistence and flow-execution logic from domain logic or algorithmic
-   code by writing the operations as a set of steps with well-defined
-   inputs and outputs: Python functions. Joblib can save their
-   computation to disk and rerun it only if necessary::
-
-      >>> from sklearn.externals.joblib import Memory
-      >>> cachedir = 'your_cache_dir_goes_here'
-      >>> mem = Memory(cachedir)
-      >>> import numpy as np
-      >>> a = np.vander(np.arange(3)).astype(np.float)
-      >>> square = mem.cache(np.square)
-      >>> b = square(a)  # doctest: +ELLIPSIS
-      ________________________________________________________________________________
-      [Memory] Calling square...
-      square(array([[0., 0., 1.],
-             [1., 1., 1.],
-             [4., 2., 1.]]))
-      ___________________________________________________________square - 0...s, 0.0min
-
-      >>> c = square(a)
-      >>> # The above call did not trigger an evaluation
-
-2) **Embarrassingly parallel helper:** to make it easy to write readable
-   parallel code and debug it quickly::
-
-      >>> from sklearn.externals.joblib import Parallel, delayed
-      >>> from math import sqrt
-      >>> Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10))
-      [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
-
-3) **Fast compressed Persistence**: a replacement for pickle to work
-   efficiently on Python objects containing large data (
-   *joblib.dump* & *joblib.load* ).
-
-..
-    >>> import shutil ; shutil.rmtree(cachedir)
-
-"""
-
-# PEP0440 compatible formatted version, see:
-# https://www.python.org/dev/peps/pep-0440/
-#
-# Generic release markers:
-#   X.Y
-#   X.Y.Z   # For bugfix releases
-#
-# Admissible pre-release markers:
-#   X.YaN   # Alpha release
-#   X.YbN   # Beta release
-#   X.YrcN  # Release Candidate
-#   X.Y     # Final release
-#
-# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
-# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-#
-__version__ = '0.13.0'
-
-
-from .memory import Memory, MemorizedResult, register_store_backend
-from .logger import PrintTime
-from .logger import Logger
-from .hashing import hash
-from .numpy_pickle import dump
-from .numpy_pickle import load
-from .compressor import register_compressor
-from .parallel import Parallel
-from .parallel import delayed
-from .parallel import cpu_count
-from .parallel import register_parallel_backend
-from .parallel import parallel_backend
-from .parallel import effective_n_jobs
-
-from .externals.loky import wrap_non_picklable_objects
-
-
-__all__ = ['Memory', 'MemorizedResult', 'PrintTime', 'Logger', 'hash', 'dump',
-           'load', 'Parallel', 'delayed', 'cpu_count', 'effective_n_jobs',
-           'register_parallel_backend', 'parallel_backend',
-           'register_store_backend', 'register_compressor',
-           'wrap_non_picklable_objects']
+# Import necessary to preserve backward compatibility of pickles
+import sys
+import warnings
+
+from joblib import *
+
+
+msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be removed "
+       "in 0.23. Please import this functionality directly from joblib, "
+       "which can be installed with: pip install joblib. If this warning is "
+       "raised when loading pickled models, you may need to re-serialize "
+       "those models with scikit-learn 0.21+.")
+
+if not hasattr(sys, "_is_pytest_session"):
+    warnings.warn(msg, category=DeprecationWarning)
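
The tail of the warning message matters for persisted models: pickles written
with scikit-learn <= 0.20 can reference the sklearn.externals.joblib module
path, which disappears in 0.23. The fix the message suggests is a load/dump
round-trip (a sketch; the file names are illustrative):

    import joblib

    # Loading a pre-0.21 pickle may emit the DeprecationWarning above;
    # re-dumping makes the pickle reference joblib's real module path.
    model = joblib.load("model_pre021.pkl")
    joblib.dump(model, "model_021.pkl")
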
19 changes: 0 additions & 19 deletions sklearn/externals/joblib/_compat.py

This file was deleted.
