From cef1a9876d99010dbb325009f71cb0aea87a19f3 Mon Sep 17 00:00:00 2001 From: "Owen W. Taylor" Date: Fri, 29 Sep 2023 12:09:04 -0400 Subject: [PATCH] When exporting, use hardlinks for duplicated files For ostree_repo_export_tree_to_archive(), and 'ostree export', when the exported tree contains multiple files with the same checksum, write an archive with hard links. Without this, importing a tree, then exporting it again breaks hardlinks. As an example of savings: this reduces the (compressed) size of the Fedora Flatpak Runtime image from 1345MiB to 712MiB. Resolves: #2925 --- src/libostree/ostree-repo-libarchive.c | 50 ++++++++++++++++++++------ tests/libtest.sh | 7 ++++ tests/test-export.sh | 10 +++++- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/libostree/ostree-repo-libarchive.c b/src/libostree/ostree-repo-libarchive.c index d0f46883c7..65a309335f 100644 --- a/src/libostree/ostree-repo-libarchive.c +++ b/src/libostree/ostree-repo-libarchive.c @@ -943,15 +943,10 @@ ostree_repo_write_archive_to_mtree_from_fd (OstreeRepo *self, int fd, OstreeMuta #ifdef HAVE_LIBARCHIVE -static gboolean -file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path, - GFileInfo *file_info, struct archive_entry *entry, GError **error) +static char * +file_to_pathstr (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path) { - gboolean ret = FALSE; g_autofree char *pathstr = g_file_get_relative_path (root, path); - g_autoptr (GVariant) xattrs = NULL; - time_t ts = (time_t)opts->timestamp_secs; - if (opts->path_prefix && opts->path_prefix[0]) { g_autofree char *old_pathstr = pathstr; @@ -964,6 +959,18 @@ file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, pathstr = g_strdup ("."); } + return g_steal_pointer (&pathstr); +} + +static gboolean +file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path, + GFileInfo *file_info, struct archive_entry *entry, GError 
**error) +{ + gboolean ret = FALSE; + g_autofree char *pathstr = file_to_pathstr (root, opts, path); + g_autoptr (GVariant) xattrs = NULL; + time_t ts = (time_t)opts->timestamp_secs; + archive_entry_update_pathname_utf8 (entry, pathstr); archive_entry_set_ctime (entry, ts, OSTREE_TIMESTAMP); archive_entry_set_mtime (entry, ts, OSTREE_TIMESTAMP); @@ -1021,7 +1028,8 @@ write_header_free_entry (struct archive *a, struct archive_entry **entryp, GErro static gboolean write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchiveOptions *opts, GFile *root, GFile *dir, struct archive *a, - GCancellable *cancellable, GError **error) + GHashTable *seen_checksums, GCancellable *cancellable, + GError **error) { gboolean ret = FALSE; g_autoptr (GFileInfo) dir_info = NULL; @@ -1057,8 +1065,8 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive /* First, handle directories recursively */ if (g_file_info_get_file_type (file_info) == G_FILE_TYPE_DIRECTORY) { - if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, cancellable, - error)) + if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, seen_checksums, + cancellable, error)) goto out; /* Go to the next entry */ @@ -1086,9 +1094,27 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive g_autoptr (GInputStream) file_in = NULL; g_autoptr (GFileInfo) regular_file_info = NULL; const char *checksum; + GFile *old_path; checksum = ostree_repo_file_get_checksum ((OstreeRepoFile *)path); + old_path = g_hash_table_lookup (seen_checksums, checksum); + if (old_path) + { + g_autofree char *old_pathstr = file_to_pathstr (root, opts, old_path); + + archive_entry_set_hardlink (entry, old_pathstr); + if (!write_header_free_entry (a, &entry, error)) + goto out; + + break; + } + else + { + /* The checksum is owned by path (an OstreeRepoFile) */ + g_hash_table_insert (seen_checksums, (char *)checksum, g_object_ref (path)); + } + if 
(!ostree_repo_load_file (self, checksum, &file_in, &regular_file_info, NULL, cancellable, error)) goto out; @@ -1168,9 +1194,11 @@ ostree_repo_export_tree_to_archive (OstreeRepo *self, OstreeRepoExportArchiveOpt #ifdef HAVE_LIBARCHIVE gboolean ret = FALSE; struct archive *a = archive; + g_autoptr (GHashTable) seen_checksums + = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_object_unref); if (!write_directory_to_libarchive_recurse (self, opts, (GFile *)root, (GFile *)root, a, - cancellable, error)) + seen_checksums, cancellable, error)) goto out; ret = TRUE; diff --git a/tests/libtest.sh b/tests/libtest.sh index fa93782703..d1c99eab8f 100755 --- a/tests/libtest.sh +++ b/tests/libtest.sh @@ -249,6 +249,13 @@ setup_test_repository () { mkdir baz/another/ echo x > baz/another/y + mkdir baz/sub1 + echo SAME_CONTENT > baz/sub1/duplicate_a + echo SAME_CONTENT > baz/sub1/duplicate_b + + mkdir baz/sub2 + echo SAME_CONTENT > baz/sub2/duplicate_c + # if we are running inside a container we cannot test # the overlayfs whiteout marker passthrough if ! test -n "${OSTREE_NO_WHITEOUTS:-}"; then diff --git a/tests/test-export.sh b/tests/test-export.sh index e490ae404e..6b8de94c4c 100755 --- a/tests/test-export.sh +++ b/tests/test-export.sh @@ -28,7 +28,7 @@ fi setup_test_repository "archive" -echo '1..5' +echo '1..6' $OSTREE checkout test2 test2-co $OSTREE commit --no-xattrs -b test2-noxattrs -s "test2 without xattrs" --tree=dir=test2-co @@ -81,3 +81,11 @@ assert_file_empty diff.txt rm test2.tar diff.txt t -rf echo 'ok export import' + +cd ${test_tmpdir} +${OSTREE} 'export' test2 -o test2.tar +tar tvf test2.tar > test2.manifest +assert_file_has_content test2.manifest 'baz/sub1/duplicate_b link to baz/sub1/duplicate_a' +assert_file_has_content test2.manifest 'baz/sub2/duplicate_c link to baz/sub1/duplicate_a' + +echo 'ok export hard links'