Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: remove isdir() check in copy_file(), about 10% of the run time was checking if the file to copy is a directory. #258

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7a6f42b
👹 Feed the hobgoblins (delint).
jaraco Apr 14, 2024
a996148
Move compatibility modules into compat package.
jaraco Apr 14, 2024
b164d66
Move compatibility module into compat package.
jaraco Apr 14, 2024
89522f9
Fix return type to match implementation.
jaraco Apr 14, 2024
c6b23d0
🧎‍♀️ Genuflect to the types.
jaraco Apr 14, 2024
e5268b5
Oops. Meant 2025.
jaraco Apr 14, 2024
6c1cb08
Migrated config to pyproject.toml using jaraco.develop.migrate-config…
jaraco Apr 18, 2024
851e71a
Merge https://github.com/jaraco/skeleton
jaraco Apr 21, 2024
7dcde5e
Extract _make_executable for TestSpawn.
jaraco Apr 20, 2024
041c42e
Move and reword comment for brevity and clarity.
jaraco Apr 20, 2024
b07b4ed
Remove C901 exclusion; code is now compliant.
jaraco Apr 21, 2024
ef8f235
Remove apparently unnecessary cast to list.
jaraco Apr 21, 2024
03f1d85
Use proper boolean literals.
jaraco Apr 21, 2024
e85efee
Replace Popen with check_call.
jaraco Apr 21, 2024
976e935
Extract function for _debug wrapper.
jaraco Apr 21, 2024
d6652a4
Extract function to inject macos version.
jaraco Apr 21, 2024
806b1ca
👹 Feed the hobgoblins (delint).
jaraco Apr 21, 2024
e5b06e1
Merge https://github.com/jaraco/skeleton
jaraco Apr 24, 2024
4549de1
Use mkstemp unconditionally. mktemp has been deprecated since Python …
jaraco May 29, 2024
294b206
Merge https://github.com/jaraco/skeleton
jaraco Jun 19, 2024
a37185d
Pin to pytest<8.1.
jaraco Jun 19, 2024
745640e
PERF: remove isdir() check in copy_file(), about 10% of the run time …
rmmancom May 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ python:
install:
- path: .
extra_requirements:
- docs
- doc

# required boilerplate readthedocs/readthedocs.org#10401
build:
Expand Down
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _save_cwd():

@pytest.fixture
def distutils_managed_tempdir(request):
from distutils.tests import py38compat as os_helper
from distutils.tests.compat import py38 as os_helper

self = request.instance
self.tempdirs = []
Expand Down
2 changes: 1 addition & 1 deletion distutils/_modified.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import os.path

from ._functools import splat
from .compat.py39 import zip_strict
from .errors import DistutilsFileError
from .py39compat import zip_strict


def _newer(source, target):
Expand Down
6 changes: 4 additions & 2 deletions distutils/command/bdist_rpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,13 @@ def run(self): # noqa: C901

source = sdist.get_archive_files()[0]
source_dir = rpm_dir['SOURCES']
self.copy_file(source, source_dir)
dest = os.path.join(source_dir, os.path.basename(source))
self.copy_file(source, dest)
Comment on lines +319 to +320
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm uneasy with this pattern being now duplicated 5 times across the codebase. I'd rather see something like:

Suggested change
dest = os.path.join(source_dir, os.path.basename(source))
self.copy_file(source, dest)
self.copy_file_to_dir(source, source_dir)

And then do some refactoring to make copy_file_to_dir perform the dest operation.


if self.icon:
if os.path.exists(self.icon):
self.copy_file(self.icon, source_dir)
dest = os.path.join(source_dir, os.path.basename(self.icon))
self.copy_file(self.icon, dest)
else:
raise DistutilsFileError("icon file '%s' does not exist" % self.icon)

Expand Down
6 changes: 4 additions & 2 deletions distutils/command/install_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def run(self):
"setup script did not provide a directory for "
f"'{f}' -- installing right in '{self.install_dir}'"
)
(out, _) = self.copy_file(f, self.install_dir)
dst = os.path.join(self.install_dir, os.path.basename(f))
(out, _) = self.copy_file(f, dst)
self.outfiles.append(out)
else:
# it's a tuple with path to install to and a list of files
Expand All @@ -74,7 +75,8 @@ def run(self):
# Copy files, adding them to the list of output files.
for data in f[1]:
data = convert_path(data)
(out, _) = self.copy_file(data, dir)
dst = os.path.join(dir, os.path.basename(data))
(out, _) = self.copy_file(data, dst)
self.outfiles.append(out)

def get_inputs(self):
Expand Down
5 changes: 4 additions & 1 deletion distutils/command/install_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Implements the Distutils 'install_headers' command, to install C/C++ header
files to the Python include directory."""

import os

from ..core import Command


Expand Down Expand Up @@ -34,7 +36,8 @@ def run(self):

self.mkpath(self.install_dir)
for header in headers:
(out, _) = self.copy_file(header, self.install_dir)
dst = os.path.join(self.install_dir, os.path.basename(header))
(out, _) = self.copy_file(header, dst)
self.outfiles.append(out)

def get_inputs(self):
Expand Down
4 changes: 2 additions & 2 deletions distutils/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from .py38 import removeprefix


def consolidate_linker_args(args: list[str]) -> str:
def consolidate_linker_args(args: list[str]) -> list[str] | str:
"""
Ensure the return value is a string for backward compatibility.

Retain until at least 2024-04-31. See pypa/distutils#246
Retain until at least 2025-04-31. See pypa/distutils#246
"""

if not all(arg.startswith('-Wl,') for arg in args):
Expand Down
10 changes: 10 additions & 0 deletions distutils/compat/py38.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,13 @@ def removesuffix(self, suffix):

def removeprefix(self, prefix):
return self.removeprefix(prefix)


def aix_platform(osname, version, release):
    """
    Return the platform tag to use on AIX.

    The value of ``_aix_support.aix_platform()`` is used when that module
    can be imported; if an ImportError is raised, the tag is composed
    from the arguments as ``"{osname}-{version}.{release}"``.
    """
    try:
        import _aix_support  # type: ignore

        tag = _aix_support.aix_platform()
    except ImportError:
        # No AIX support module available: build the tag by hand.
        tag = f"{osname}-{version}.{release}"
    return tag
File renamed without changes.
16 changes: 3 additions & 13 deletions distutils/file_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,8 @@ def copy_file( # noqa: C901
verbose=1,
dry_run=0,
):
"""Copy a file 'src' to 'dst'. If 'dst' is a directory, then 'src' is
copied there with the same name; otherwise, it must be a filename. (If
the file exists, it will be ruthlessly clobbered.) If 'preserve_mode'
"""Copy a file 'src' to 'dst'.
(If the file exists, it will be ruthlessly clobbered.) If 'preserve_mode'
Comment on lines +73 to +74
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can assume that distutils is the only consumer of this function. Maybe no one else is using it, but we can't simply remove it without risking breaking users. Let's do this instead:

  1. Rename copy_file to _copy_file_direct, implementing this new, limited interface.
  2. Create a new copy_file that implements the backward-compatible behavior (infer dst from dir), raises a DeprecationWarning, and then calls _copy_file_direct.
  3. Retain the logic of emitting only the dirname if the basenames match (I don't think that change is necessary to get the performance gain).
  4. Create a new _copy_file_to_dir that assumes the dst is a dir and performs the join and basename, calling _copy_file_direct.

What do you think?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For 3., the logic is broken as far as I can tell. We can't keep the syscall to isdir() to determine whether we're dealing with a directory, because that's what makes the function slow.
For context, this function is called for thousands of files; it's used by distutils and setuptools/pip when working with packages. (The profiling run was for python setup.py bdist_wheel.)

I think we can't have a replacement function be private with an underscore. It's a public API that is used in other places. I think linters will complain if we import a private _ function in other places (note that setuptools is vendoring this repo and might have more CI rules).

Searching on GitHub for distutils.file_util AND copy_file AND path:.py finds 2k files using this function. Going over the results, I see a few are passing a directory. That is problematic. :(
I don't think adding a warning would be helpful; distutils is removed in Python 3.12 and is already raising warnings/errors. If people aren't fixing errors caused by the module being removed, they're not going to care about yet another warning. ^^

Here is what I can think of:

  • make two functions copy_file_direct() and copy_file_to_dir().
  • fix code inside of distutils to use the appropriate function.
  • keep copy_file() logic to check whether the argument is a directory + call one of the 2 functions.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've started the refactoring... but unfortunately found out that copy_file() is wrapped inside the distutils.cmd::Command class and there are a dozen invocations through self.copy_file across the codebase.
If we don't want to fix copy_file() but would rather ship copy_file_direct() + copy_file_to_dir(), then we would have to expose both functions through the class and adjust a dozen invocations. It's not hard to do, but that's a fair amount of boilerplate and code to refactor.
I would like confirmation that we want to do that before embarking on it.

Realistically, distutils is removed from the interpreter as of Python 3.12; it's only available inside of setuptools, where it's vendored. I initially made the PR on setuptools, where it works and passes all tests; it provides a welcome performance improvement to package operations.

Is there really a risk that fixing copy_file() breaks other packages, i.e. packages that are not already broken because distutils is removed?

Copy link
Contributor

@abravalheri abravalheri Jun 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hyrum's law usually hits hard on distutils/setuptools...

If you want to define 2 other new functions (one for files and one for directories) it should be fine. But my opinion is that we should keep the old behaviour (maybe behind a deprecation warning) for a couple of years before dropping it (if dropping it is ever an option... we have to remember that some packages in the ecosystem haven't even moved to setuptools yet...)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, that sounds messy. Let's consider some other options.

is true (the default), the file's mode (type and permission bits, or
whatever is analogous on the current platform) is copied. If
'preserve_times' is true (the default), the last-modified and
Expand Down Expand Up @@ -109,12 +108,6 @@ def copy_file( # noqa: C901
"can't copy '%s': doesn't exist or not a regular file" % src
)

if os.path.isdir(dst):
dir = dst
dst = os.path.join(dst, os.path.basename(src))
else:
dir = os.path.dirname(dst)

if update and not newer(src, dst):
if verbose >= 1:
log.debug("not copying %s (output up-to-date)", src)
Expand All @@ -126,10 +119,7 @@ def copy_file( # noqa: C901
raise ValueError("invalid value '%s' for 'link' argument" % link)

if verbose >= 1:
if os.path.basename(dst) == os.path.basename(src):
log.info("%s %s -> %s", action, src, dir)
else:
log.info("%s %s -> %s", action, src, dst)
log.info("%s %s -> %s", action, src, dst)

if dry_run:
return (dst, 1)
Expand Down
8 changes: 0 additions & 8 deletions distutils/py38compat.py

This file was deleted.

63 changes: 36 additions & 27 deletions distutils/spawn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,42 @@
executable name.
"""

from __future__ import annotations

import os
import platform
import subprocess
import sys
from typing import Mapping

from ._log import log
from .debug import DEBUG
from .errors import DistutilsExecError


def spawn(cmd, search_path=1, verbose=0, dry_run=0, env=None): # noqa: C901
def _debug(cmd):
    """
    Choose how a subprocess command is rendered, based on DEBUG.

    When DEBUG is true the command is returned as given; otherwise only
    its first element is returned.
    """
    if DEBUG:
        return cmd
    return cmd[0]


def _inject_macos_ver(env: Mapping[str:str] | None) -> Mapping[str:str] | None:
if platform.system() != 'Darwin':
return env

from distutils.util import MACOSX_VERSION_VAR, get_macosx_target_ver

target_ver = get_macosx_target_ver()
update = {MACOSX_VERSION_VAR: target_ver} if target_ver else {}
return {**_resolve(env), **update}


def _resolve(env: Mapping[str:str] | None) -> Mapping[str:str]:
return os.environ if env is None else env


def spawn(cmd, search_path=True, verbose=False, dry_run=False, env=None):
"""Run another program, specified as a command list 'cmd', in a new process.

'cmd' is just the argument list for the new process, ie.
Expand All @@ -31,10 +57,6 @@ def spawn(cmd, search_path=1, verbose=0, dry_run=0, env=None): # noqa: C901
Raise DistutilsExecError if running the program fails in any way; just
return on success.
"""
# cmd is documented as a list, but just in case some code passes a tuple
# in, protect our %-formatting code against horrible death
cmd = list(cmd)

log.info(subprocess.list2cmdline(cmd))
if dry_run:
return
Expand All @@ -44,28 +66,16 @@ def spawn(cmd, search_path=1, verbose=0, dry_run=0, env=None): # noqa: C901
if executable is not None:
cmd[0] = executable

env = env if env is not None else dict(os.environ)

if sys.platform == 'darwin':
from distutils.util import MACOSX_VERSION_VAR, get_macosx_target_ver

macosx_target_ver = get_macosx_target_ver()
if macosx_target_ver:
env[MACOSX_VERSION_VAR] = macosx_target_ver

try:
proc = subprocess.Popen(cmd, env=env)
proc.wait()
exitcode = proc.returncode
subprocess.check_call(cmd, env=_inject_macos_ver(env))
except OSError as exc:
if not DEBUG:
cmd = cmd[0]
raise DistutilsExecError(f"command {cmd!r} failed: {exc.args[-1]}") from exc

if exitcode:
if not DEBUG:
cmd = cmd[0]
raise DistutilsExecError(f"command {cmd!r} failed with exit code {exitcode}")
raise DistutilsExecError(
f"command {_debug(cmd)!r} failed: {exc.args[-1]}"
) from exc
except subprocess.CalledProcessError as err:
raise DistutilsExecError(
f"command {_debug(cmd)!r} failed with exit code {err.returncode}"
) from err


def find_executable(executable, path=None):
Expand All @@ -83,14 +93,13 @@ def find_executable(executable, path=None):

if path is None:
path = os.environ.get('PATH', None)
# bpo-35755: Don't fall through if PATH is the empty string
if path is None:
try:
path = os.confstr("CS_PATH")
except (AttributeError, ValueError):
# os.confstr() or CS_PATH is not available
path = os.defpath
# bpo-35755: Don't use os.defpath if the PATH environment variable is
# set to an empty string

# PATH='' doesn't match, whereas PATH=':' looks in the current directory
if not path:
Expand Down
4 changes: 2 additions & 2 deletions distutils/sysconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import sys
import sysconfig

from . import py39compat
from ._functools import pass_none
from .compat import py39
from .errors import DistutilsPlatformError

IS_PYPY = '__pypy__' in sys.builtin_module_names
Expand Down Expand Up @@ -538,7 +538,7 @@ def get_config_vars(*args):
global _config_vars
if _config_vars is None:
_config_vars = sysconfig.get_config_vars().copy()
py39compat.add_ext_suffix(_config_vars)
py39.add_ext_suffix(_config_vars)

return [_config_vars.get(name) for name in args] if args else _config_vars

Expand Down
Empty file.
File renamed without changes.
2 changes: 1 addition & 1 deletion distutils/tests/test_archive_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import path
import pytest

from .py38compat import check_warnings
from .compat.py38 import check_warnings
from .unix_compat import UID_0_SUPPORT, grp, pwd, require_uid_0, require_unix_id


Expand Down
2 changes: 1 addition & 1 deletion distutils/tests/test_bdist_rpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import pytest

from .py38compat import requires_zlib
from .compat.py38 import requires_zlib

SETUP_PY = """\
from distutils.core import setup
Expand Down
2 changes: 1 addition & 1 deletion distutils/tests/test_build_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import path
import pytest

from . import py38compat as import_helper
from .compat import py38 as import_helper


@pytest.fixture()
Expand Down
3 changes: 2 additions & 1 deletion distutils/tests/test_dir_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ def test_copy_tree_verbosity(self, caplog):

mkpath(self.target, verbose=0)
a_file = path.Path(self.target) / 'ok.txt'
to_file = path.Path(self.target2) / 'ok.txt'
jaraco.path.build({'ok.txt': 'some content'}, self.target)

wanted = [f'copying {a_file} -> {self.target2}']
wanted = [f'copying {a_file} -> {to_file}']
copy_tree(self.target, self.target2, verbose=1)
assert caplog.messages == wanted

Expand Down
2 changes: 1 addition & 1 deletion distutils/tests/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest

from .py38compat import check_warnings
from .compat.py38 import check_warnings


class TestExtension:
Expand Down
2 changes: 1 addition & 1 deletion distutils/tests/test_filelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import jaraco.path
import pytest

from . import py38compat as os_helper
from .compat import py38 as os_helper

MANIFEST_IN = """\
include ok
Expand Down
2 changes: 1 addition & 1 deletion distutils/tests/test_sdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import pytest
from more_itertools import ilen

from .py38compat import check_warnings
from .compat.py38 import check_warnings
from .unix_compat import grp, pwd, require_uid_0, require_unix_id

SETUP_PY = """
Expand Down
22 changes: 13 additions & 9 deletions distutils/tests/test_spawn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import path
import pytest

from . import py38compat as os_helper
from .compat import py38 as os_helper


class TestSpawn(support.TempdirManager):
Expand Down Expand Up @@ -45,14 +45,9 @@ def test_spawn(self):
spawn([exe]) # should work without any error

def test_find_executable(self, tmp_path):
program_noeext = 'program'
# Give the temporary program an ".exe" suffix for all.
# It's needed on Windows and not harmful on other platforms.
program = program_noeext + ".exe"

program_path = tmp_path / program
program_path.write_text("", encoding='utf-8')
program_path.chmod(stat.S_IXUSR)
program_path = self._make_executable(tmp_path, '.exe')
program = program_path.name
program_noeext = program_path.with_suffix('').name
filename = str(program_path)
tmp_dir = path.Path(tmp_path)

Expand Down Expand Up @@ -121,6 +116,15 @@ def test_find_executable(self, tmp_path):
rv = find_executable(program)
assert rv == filename

@staticmethod
def _make_executable(tmp_path, ext):
# Give the temporary program a suffix regardless of platform.
# It's needed on Windows and not harmful on others.
program = tmp_path.joinpath('program').with_suffix(ext)
program.write_text("", encoding='utf-8')
program.chmod(stat.S_IXUSR)
return program

def test_spawn_missing_exe(self):
with pytest.raises(DistutilsExecError) as ctx:
spawn(['does-not-exist'])
Expand Down
Loading
Loading