When exporting, use hardlinks for duplicated files

For ostree_repo_export_tree_to_archive() and 'ostree export': when the
exported tree contains multiple files with the same checksum, write the
first one as a regular file and the remaining ones as hard links to it.

Without this, importing a tree and then exporting it again breaks hard
links: each duplicated file is written to the archive as a full copy.

As an example of savings: this reduces the (compressed) size of the
Fedora Flatpak Runtime image from 1345MiB to 712MiB.

Resolves: #2925
This commit is contained in:
Owen W. Taylor 2023-09-29 12:09:04 -04:00
parent 8c25452c1e
commit 3b2fd6e9ff
5 changed files with 58 additions and 15 deletions

View File

@ -943,15 +943,10 @@ ostree_repo_write_archive_to_mtree_from_fd (OstreeRepo *self, int fd, OstreeMuta
#ifdef HAVE_LIBARCHIVE
static gboolean
file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path,
GFileInfo *file_info, struct archive_entry *entry, GError **error)
static char *
file_to_pathstr (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path)
{
gboolean ret = FALSE;
g_autofree char *pathstr = g_file_get_relative_path (root, path);
g_autoptr (GVariant) xattrs = NULL;
time_t ts = (time_t)opts->timestamp_secs;
if (opts->path_prefix && opts->path_prefix[0])
{
g_autofree char *old_pathstr = pathstr;
@ -964,6 +959,18 @@ file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts,
pathstr = g_strdup (".");
}
return g_steal_pointer (&pathstr);
}
static gboolean
file_to_archive_entry_common (GFile *root, OstreeRepoExportArchiveOptions *opts, GFile *path,
GFileInfo *file_info, struct archive_entry *entry, GError **error)
{
gboolean ret = FALSE;
g_autofree char *pathstr = file_to_pathstr (root, opts, path);
g_autoptr (GVariant) xattrs = NULL;
time_t ts = (time_t)opts->timestamp_secs;
archive_entry_update_pathname_utf8 (entry, pathstr);
archive_entry_set_ctime (entry, ts, OSTREE_TIMESTAMP);
archive_entry_set_mtime (entry, ts, OSTREE_TIMESTAMP);
@ -1021,7 +1028,8 @@ out:
static gboolean
write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchiveOptions *opts,
GFile *root, GFile *dir, struct archive *a,
GCancellable *cancellable, GError **error)
GHashTable *seen_checksums, GCancellable *cancellable,
GError **error)
{
gboolean ret = FALSE;
g_autoptr (GFileInfo) dir_info = NULL;
@ -1057,8 +1065,8 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive
/* First, handle directories recursively */
if (g_file_info_get_file_type (file_info) == G_FILE_TYPE_DIRECTORY)
{
if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, cancellable,
error))
if (!write_directory_to_libarchive_recurse (self, opts, root, path, a, seen_checksums,
cancellable, error))
goto out;
/* Go to the next entry */
@ -1086,9 +1094,27 @@ write_directory_to_libarchive_recurse (OstreeRepo *self, OstreeRepoExportArchive
g_autoptr (GInputStream) file_in = NULL;
g_autoptr (GFileInfo) regular_file_info = NULL;
const char *checksum;
GFile *old_path;
checksum = ostree_repo_file_get_checksum ((OstreeRepoFile *)path);
old_path = g_hash_table_lookup (seen_checksums, checksum);
if (old_path)
{
g_autofree char *old_pathstr = file_to_pathstr (root, opts, old_path);
archive_entry_set_hardlink (entry, old_pathstr);
if (!write_header_free_entry (a, &entry, error))
goto out;
break;
}
else
{
/* The checksum is owned by path (an OstreeRepoFile) */
g_hash_table_insert (seen_checksums, (char *)checksum, g_object_ref (path));
}
if (!ostree_repo_load_file (self, checksum, &file_in, &regular_file_info, NULL,
cancellable, error))
goto out;
@ -1168,9 +1194,11 @@ ostree_repo_export_tree_to_archive (OstreeRepo *self, OstreeRepoExportArchiveOpt
#ifdef HAVE_LIBARCHIVE
gboolean ret = FALSE;
struct archive *a = archive;
g_autoptr (GHashTable) seen_checksums
= g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_object_unref);
if (!write_directory_to_libarchive_recurse (self, opts, (GFile *)root, (GFile *)root, a,
cancellable, error))
seen_checksums, cancellable, error))
goto out;
ret = TRUE;

View File

@ -72,9 +72,9 @@ date > test-overlays/overlaid-file
$OSTREE commit ${COMMIT_ARGS} -b test-base --base test2 --owner-uid 42 --owner-gid 42 test-overlays/
$OSTREE ls -R test-base > ls.txt
if can_create_whiteout_devices; then
assert_streq "$(wc -l < ls.txt)" 17
assert_streq "$(wc -l < ls.txt)" 22
else
assert_streq "$(wc -l < ls.txt)" 14
assert_streq "$(wc -l < ls.txt)" 19
fi
assert_streq "$(grep '42.*42' ls.txt | wc -l)" 2

View File

@ -249,6 +249,13 @@ setup_test_repository () {
mkdir baz/another/
echo x > baz/another/y
mkdir baz/sub1
echo SAME_CONTENT > baz/sub1/duplicate_a
echo SAME_CONTENT > baz/sub1/duplicate_b
mkdir baz/sub2
echo SAME_CONTENT > baz/sub2/duplicate_c
# if we are running inside a container we cannot test
# the overlayfs whiteout marker passthrough
if ! test -n "${OSTREE_NO_WHITEOUTS:-}"; then

View File

@ -38,7 +38,7 @@ orig_composefs_digest=$($OSTREE show --print-hex --print-metadata-key ostree.com
$OSTREE commit ${COMMIT_ARGS} -b test-composefs2 --generate-composefs-metadata test2-co
new_composefs_digest=$($OSTREE show --print-hex --print-metadata-key ostree.composefs.digest.v0 test-composefs2)
assert_streq "${orig_composefs_digest}" "${new_composefs_digest}"
assert_streq "${new_composefs_digest}" "7a53698f5aa7af7e8034a10bd2fcc195e9df46781efd967a3fc83d32a1d3eda1"
assert_streq "${new_composefs_digest}" "be956966c70970ea23b1a8043bca58cfb0d011d490a35a7817b36d04c0210954"
tap_ok "composefs metadata"
tap_end

View File

@ -28,7 +28,7 @@ fi
setup_test_repository "archive"
echo '1..5'
echo '1..6'
$OSTREE checkout test2 test2-co
$OSTREE commit --no-xattrs -b test2-noxattrs -s "test2 without xattrs" --tree=dir=test2-co
@ -81,3 +81,11 @@ assert_file_empty diff.txt
rm test2.tar diff.txt t -rf
echo 'ok export import'
cd ${test_tmpdir}
${OSTREE} 'export' test2 -o test2.tar
tar tvf test2.tar > test2.manifest
assert_file_has_content test2.manifest 'baz/sub1/duplicate_b link to baz/sub1/duplicate_a'
assert_file_has_content test2.manifest 'baz/sub2/duplicate_c link to baz/sub1/duplicate_a'
echo 'ok export hard links'