From ddfbb9da5388d2f61f6c8802d4901c556ed1090a Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 19 Oct 2023 14:10:56 -0600 Subject: [PATCH] add fast file copy method --- lib/build.py | 7 +++-- lib/charliecloud.py | 4 --- lib/filesystem.py | 72 +++++++++++++++++++++++++++++++++++++++++++-- lib/image.py | 2 +- 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/lib/build.py b/lib/build.py index 6bff7f305..a541a37cd 100644 --- a/lib/build.py +++ b/lib/build.py @@ -767,7 +767,7 @@ def onerror(x): dst_path.rmtree() else: dst_path.unlink_() - ch.copy2(src_path, dst_path, follow_symlinks=False) + src_path.copy(dst_path) def copy_src_file(self, src, dst): """Copy file src to dst. src might be a symlink, but dst is a canonical @@ -789,8 +789,11 @@ def copy_src_file(self, src, dst): assert (not dst.is_symlink()) assert ( (dst.exists() and (dst.is_dir() or dst.is_file())) or (not dst.exists() and dst.parent.is_dir())) + if (dst.is_dir()): + dst = dst.parent // src.name + src = src.resolve() ch.DEBUG("copying named file: %s -> %s" % (src, dst)) - ch.copy2(src, dst, follow_symlinks=True) + src.copy(dst) def dest_realpath(self, unpack_path, dst): """Return the canonicalized version of path dst within (canonical) image diff --git a/lib/charliecloud.py b/lib/charliecloud.py index 47ef53ae8..987ef204f 100644 --- a/lib/charliecloud.py +++ b/lib/charliecloud.py @@ -609,10 +609,6 @@ def color_set(color, fp): if (fp.isatty()): print("\033[" + color, end="", flush=True, file=fp) -def copy2(src, dst, **kwargs): - "Wrapper for shutil.copy2() with error checking." - ossafe(shutil.copy2, "can’t copy: %s -> %s" % (src, dst), src, dst, **kwargs) - def dependencies_check(): """Check more dependencies. If any dependency problems found, here or above (e.g., lark module checked at import time), then complain and exit.""" diff --git a/lib/filesystem.py b/lib/filesystem.py index 5c4b2385d..af4c3f8a8 100644 --- a/lib/filesystem.py +++ b/lib/filesystem.py @@ -32,6 +32,16 @@ storage_lock = True +### Functions ### + +def copy(src, dst, follow_symlinks=False): + """Copy file src to dst. Wrapper function providing same signature as + shutil.copy2(). See Path.copy() for lots of gory details. Accepts + follow_symlinks, but the only valid value is False.""" + assert (not follow_symlinks) + src.copy(dst) + + ## Classes ## class Path(pathlib.PosixPath): @@ -187,9 +197,67 @@ def chmod_min(self, st=None): ch.ossafe(os.chmod, "can’t chmod: %s" % self, self, perms_new) return (st.st_mode | perms_new) + def copy(self, dst): + """Copy file myself to dst, including metadata, overwriting dst if it + exists. dst must be the actual destination path, i.e., it may not be + a directory. Does not follow symlinks. + + If (a) src is a regular file, (b) src and dst are on the same + filesystem, and (c) Python is version ≥3.8, then use + os.copy_file_range() [1,2], which at a minimum does an in-kernel data + transfer. If that filesystem also (d) supports copy-on-write [3], + then this is a very fast lazy reflink copy. + + [1]: https://docs.python.org/3/library/os.html#os.copy_file_range + [2]: https://man7.org/linux/man-pages/man2/copy_file_range.2.html + [3]: https://elixir.bootlin.com/linux/latest/A/ident/remap_file_range + """ + assert (not follow_symlinks) + src_st = self.stat_(follow_symlinks=False) + # dst is not a directory, so parent must be on the same filesystem. We + # *do* want to follow symlinks on the parent. + dst_dev = dst.parent.stat_(follow_symlinks=True).st_dev + if ( stat.S_ISREG(src_st.st_mode) + and src_st.st_dev == dst_dev + and hasattr(os, "copy_file_range")): + # Fast path. The same-filesystem restriction is because reliable + # copy_file_range(2) between filesystems seems quite new (maybe + # kernel 5.18?). + try: + if (dst.exists()): + # If dst is a symlink, we get OLOOP from os.open(). Delete it + # unconditionally though, for simplicity. + dst.unlink() + src_fd = os.open(self, os.O_RDONLY|os.O_NOFOLLOW) + dst_fd = os.open(dst, os.O_WRONLY|os.O_NOFOLLOW|os.O_CREAT) + # I’m not sure why we need to loop this -- there’s no explanation + # of *when* fewer bytes than requested would be copied -- but the + # man page example does. + remaining = src_st.st_size + while (remaining > 0): + copied = os.copy_file_range(src_fd, dst_fd, sys.maxsize) + if (copied == 0): + ch.FATAL("zero bytes copied: %s -> %s" % (self, dst)) + remaining -= copied + os.close(src_fd) + os.close(dst_fd) + except OSError as x: + FATAL("can’t copy (fast): %s -> %s: %s" % (self, dst, x.strerror)) + else: + # Slow path. + try: + shutil.copyfile(self, dst, follow_symlinks=False) + except OSError as x: + FATAL("can’t copy (slow): %s -> %s: %s" % (self, dst, x.strerror)) + try: + # Metadata. + shutil.copystat(self, dst, follow_symlinks=False) + except OSError as x: + FATAL("can’t copy metadata: %s -> %s" % (self, dst, x.strerror)) + def copytree(self, *args, **kwargs): "Wrapper for shutil.copytree() that exits on the first error." - shutil.copytree(str(self), copy_function=ch.copy2, *args, **kwargs) + shutil.copytree(str(self), copy_function=copy, *args, **kwargs) def disk_bytes(self): """Return the number of disk bytes consumed by path. Note this is @@ -435,7 +503,7 @@ def stat_(self, links): follow_symlinks kwarg is absent in pathlib for Python 3.6, which we want to retain compatibility with.""" return ch.ossafe(os.stat, "can’t stat: %s" % self, self, - follow_symlinks=links) + follow_symlinks=links) def strip(self, left=0, right=0): """Return a copy of myself with n leading components removed. E.g.: diff --git a/lib/image.py b/lib/image.py index e81987b75..4334ed7a5 100644 --- a/lib/image.py +++ b/lib/image.py @@ -396,7 +396,7 @@ def metadata_replace(self, config_json): else: # Copy pulled config file into the image so we still have it. path = self.metadata_path // "config.pulled.json" - ch.copy2(config_json, path) + config_json.copy(path) ch.VERBOSE("pulled config path: %s" % path) self.metadata_merge_from_config(path.json_from_file("config")) self.metadata_save()