1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-14 23:24:38 +03:00

Merge pull request #24908 from DaanDeMeyer/repart-minimize

repart: Add Minimize setting
This commit is contained in:
Daan De Meyer 2022-11-15 08:19:28 +01:00 committed by GitHub
commit 32a3f802f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 310 additions and 55 deletions

View File

@ -581,6 +581,17 @@
below. Defaults to <literal>%t</literal>. To disable split artifact generation for a partition, set
<varname>SplitName=</varname> to <literal>-</literal>.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>Minimize=</varname></term>
<listitem><para>Takes a boolean. Disabled by default. If enabled, the partition is created at least
as big as required for a minimally sized file system of the type specified by <varname>Format=</varname>,
taking into account the sources configured with <varname>CopyFiles=</varname>. This option can only be
enabled if <varname>Format=</varname> is set. Note that unless the file system is a read-only file
system, <command>systemd-repart</command> will have to populate the file system twice, so enabling
this option might slow down <command>systemd-repart</command> when populating large partitions.
</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -76,19 +76,19 @@
#include "utf8.h"
/* If not configured otherwise use a minimal partition size of 10M */
#define DEFAULT_MIN_SIZE (10*1024*1024)
#define DEFAULT_MIN_SIZE (10ULL*1024ULL*1024ULL)
/* Hard lower limit for new partition sizes */
#define HARD_MIN_SIZE 4096
#define HARD_MIN_SIZE 4096ULL
/* We know up front we're never going to put more than this in a verity sig partition. */
#define VERITY_SIG_SIZE (HARD_MIN_SIZE * 4)
#define VERITY_SIG_SIZE (HARD_MIN_SIZE*4ULL)
/* libfdisk takes off slightly more than 1M of the disk size when creating a GPT disk label */
#define GPT_METADATA_SIZE (1044*1024)
#define GPT_METADATA_SIZE (1044ULL*1024ULL)
/* LUKS2 takes off 16M of the partition size with its metadata by default */
#define LUKS2_METADATA_SIZE (16*1024*1024)
#define LUKS2_METADATA_SIZE (16ULL*1024ULL*1024ULL)
/* Note: When growing and placing new partitions we always align to 4K sector size. It's how newer hard disks
* are designed, and if everything is aligned to that performance is best. And for older hard disks with 512B
@ -168,6 +168,7 @@ struct Partition {
sd_id128_t current_uuid, new_uuid;
bool new_uuid_is_set;
char *current_label, *new_label;
sd_id128_t fs_uuid;
bool dropped;
bool factory_reset;
@ -191,6 +192,7 @@ struct Partition {
char *copy_blocks_path;
bool copy_blocks_auto;
const char *copy_blocks_root;
int copy_blocks_fd;
uint64_t copy_blocks_size;
@ -200,6 +202,7 @@ struct Partition {
EncryptMode encrypt;
VerityMode verity;
char *verity_match_key;
bool minimize;
uint64_t gpt_flags;
int no_auto;
@ -344,20 +347,18 @@ static void partition_foreignize(Partition *p) {
/* Reset several parameters set through definition file to make the partition foreign. */
p->new_label = mfree(p->new_label);
p->definition_path = mfree(p->definition_path);
p->drop_in_files = strv_free(p->drop_in_files);
p->copy_blocks_path = mfree(p->copy_blocks_path);
p->copy_blocks_fd = safe_close(p->copy_blocks_fd);
p->copy_blocks_root = NULL;
p->format = mfree(p->format);
p->copy_files = strv_free(p->copy_files);
p->make_directories = strv_free(p->make_directories);
p->verity_match_key = mfree(p->verity_match_key);
p->new_uuid = SD_ID128_NULL;
p->new_uuid_is_set = false;
p->priority = 0;
p->weight = 1000;
p->padding_weight = 0;
@ -1338,6 +1339,7 @@ static int config_parse_copy_blocks(
if (streq(rvalue, "auto")) {
partition->copy_blocks_path = mfree(partition->copy_blocks_path);
partition->copy_blocks_auto = true;
partition->copy_blocks_root = arg_root;
return 0;
}
@ -1354,6 +1356,7 @@ static int config_parse_copy_blocks(
free_and_replace(partition->copy_blocks_path, d);
partition->copy_blocks_auto = false;
partition->copy_blocks_root = arg_root;
return 0;
}
@ -1498,6 +1501,7 @@ static int partition_read_definition(Partition *p, const char *path, const char
{ "Partition", "NoAuto", config_parse_tristate, 0, &p->no_auto },
{ "Partition", "GrowFileSystem", config_parse_tristate, 0, &p->growfs },
{ "Partition", "SplitName", config_parse_string, 0, &p->split_name_format },
{ "Partition", "Minimize", config_parse_bool, 0, &p->minimize },
{}
};
int r;
@ -1551,6 +1555,10 @@ static int partition_read_definition(Partition *p, const char *path, const char
return log_oom();
}
if (p->minimize && !p->format)
return log_syntax(NULL, LOG_ERR, path, 1, SYNTHETIC_ERRNO(EINVAL),
"Minimize= can only be enabled if Format= is set");
if (p->verity != VERITY_OFF || p->encrypt != ENCRYPT_OFF) {
r = dlopen_cryptsetup();
if (r < 0)
@ -3200,7 +3208,7 @@ static int context_copy_blocks(Context *context) {
log_info("Copying in '%s' (%s) on block level into future partition %" PRIu64 ".",
p->copy_blocks_path, FORMAT_BYTES(p->copy_blocks_size), p->partno);
r = copy_bytes_full(p->copy_blocks_fd, target_fd, p->copy_blocks_size, 0, NULL, NULL, NULL, NULL);
r = copy_bytes(p->copy_blocks_fd, target_fd, p->copy_blocks_size, COPY_REFLINK);
if (r < 0)
return log_error_errno(r, "Failed to copy in data from '%s': %m", p->copy_blocks_path);
@ -3274,14 +3282,14 @@ static int do_copy_files(Partition *p, const char *root, const Set *denylist) {
sfd, ".",
pfd, fn,
UID_INVALID, GID_INVALID,
COPY_REFLINK|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS,
COPY_REFLINK|COPY_HOLES|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS,
denylist);
} else
r = copy_tree_at(
sfd, ".",
tfd, ".",
UID_INVALID, GID_INVALID,
COPY_REFLINK|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS,
COPY_REFLINK|COPY_HOLES|COPY_MERGE|COPY_REPLACE|COPY_SIGINT|COPY_HARDLINKS|COPY_ALL_XATTRS,
denylist);
if (r < 0)
return log_error_errno(r, "Failed to copy '%s' to '%s%s': %m", *source, strempty(arg_root), *target);
@ -3313,7 +3321,7 @@ static int do_copy_files(Partition *p, const char *root, const Set *denylist) {
if (tfd < 0)
return log_error_errno(errno, "Failed to create target file '%s': %m", *target);
r = copy_bytes(sfd, tfd, UINT64_MAX, COPY_REFLINK|COPY_SIGINT);
r = copy_bytes(sfd, tfd, UINT64_MAX, COPY_REFLINK|COPY_HOLES|COPY_SIGINT);
if (r < 0)
return log_error_errno(r, "Failed to copy '%s' to '%s%s': %m", *source, strempty(arg_root), *target);
@ -3349,17 +3357,6 @@ static int partition_populate_directory(Partition *p, const Set *denylist, char
assert(ret_root);
assert(ret_tmp_root);
/* When generating read-only filesystems, we need the source tree to be available when we generate
* the read-only filesystem. Because we might have multiple source trees, we build a temporary source
* tree beforehand where we merge all our inputs. We then use this merged source tree to create the
* read-only filesystem. */
if (!fstype_is_ro(p->format)) {
*ret_root = NULL;
*ret_tmp_root = NULL;
return 0;
}
/* If we only have a single directory that's meant to become the root directory of the filesystem,
* we can shortcut this function and just use that directory as the root directory instead. If we
* allocate a temporary directory, it's stored in "ret_tmp_root" to indicate it should be removed.
@ -3396,18 +3393,28 @@ static int partition_populate_directory(Partition *p, const Set *denylist, char
}
static int partition_populate_filesystem(Partition *p, const char *node, const Set *denylist) {
_cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
struct stat st;
int r;
assert(p);
assert(node);
if (fstype_is_ro(p->format))
return 0;
if (strv_isempty(p->copy_files) && strv_isempty(p->make_directories))
return 0;
log_info("Populating partition %" PRIu64 " with files.", p->partno);
if (stat(node, &st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", node);
if (!S_ISBLK(st.st_mode)) {
r = loop_device_make_by_path(node, O_RDWR, 0, LOCK_EX, &d);
if (r < 0)
return log_error_errno(r, "Failed to make loopback device of %s: %m", node);
node = d->node;
}
log_info("Populating %s filesystem with files.", p->format);
/* We copy in a child process, since we have to mount the fs for that, and we don't want that fs to
* appear in the host namespace. Hence we fork a child that has its own file system namespace and
@ -3444,7 +3451,7 @@ static int partition_populate_filesystem(Partition *p, const char *node, const S
_exit(EXIT_SUCCESS);
}
log_info("Successfully populated partition %" PRIu64 " with files.", p->partno);
log_info("Successfully populated %s filesystem with files.", p->format);
return 0;
}
@ -3507,7 +3514,6 @@ static int context_mkfs(Context *context) {
_cleanup_free_ char *encrypted = NULL, *root = NULL;
_cleanup_close_ int encrypted_dev_fd = -1;
const char *fsdev;
sd_id128_t fs_uuid;
if (p->dropped)
continue;
@ -3518,6 +3524,10 @@ static int context_mkfs(Context *context) {
if (!p->format)
continue;
/* Minimized partitions will use the copy blocks logic so let's make sure to skip those here. */
if (p->copy_blocks_fd >= 0)
continue;
assert(p->offset != UINT64_MAX);
assert(p->new_size != UINT64_MAX);
@ -3545,22 +3555,19 @@ static int context_mkfs(Context *context) {
log_info("Formatting future partition %" PRIu64 ".", p->partno);
/* Calculate the UUID for the file system as HMAC-SHA256 of the string "file-system-uuid",
* keyed off the partition UUID. */
r = derive_uuid(p->new_uuid, "file-system-uuid", &fs_uuid);
if (r < 0)
return r;
/* Ideally, we populate filesystems using our own code after creating the filesystem to
* ensure consistent handling of chattrs, xattrs and other similar things. However, when
* using read-only filesystems such as squashfs, we can't populate after creating the
* filesystem because it's read-only, so instead we create a temporary root to use as the
* source tree when generating the read-only filesystem. */
r = partition_populate_directory(p, denylist, &root, &tmp_root);
if (r < 0)
return r;
r = make_filesystem(fsdev, p->format, strempty(p->new_label), root ?: tmp_root, fs_uuid, arg_discard);
if (fstype_is_ro(p->format)) {
r = partition_populate_directory(p, denylist, &root, &tmp_root);
if (r < 0)
return r;
}
r = make_filesystem(fsdev, p->format, strempty(p->new_label), root ?: tmp_root, p->fs_uuid, arg_discard);
if (r < 0) {
encrypted_dev_fd = safe_close(encrypted_dev_fd);
(void) deactivate_luks(cd, encrypted);
@ -3575,11 +3582,13 @@ static int context_mkfs(Context *context) {
return log_error_errno(errno, "Failed to unlock LUKS device: %m");
/* Now, we can populate all the other filesystems that aren't read-only. */
r = partition_populate_filesystem(p, fsdev, denylist);
if (r < 0) {
encrypted_dev_fd = safe_close(encrypted_dev_fd);
(void) deactivate_luks(cd, encrypted);
return r;
if (!fstype_is_ro(p->format)) {
r = partition_populate_filesystem(p, fsdev, denylist);
if (r < 0) {
encrypted_dev_fd = safe_close(encrypted_dev_fd);
(void) deactivate_luks(cd, encrypted);
return r;
}
}
/* Note that we always sync explicitly here, since mkfs.fat doesn't do that on its own, and
@ -4060,6 +4069,12 @@ static int context_acquire_partition_uuids_and_labels(Context *context) {
p->new_uuid_is_set = true;
}
/* Calculate the UUID for the file system as HMAC-SHA256 of the string "file-system-uuid",
* keyed off the partition UUID. */
r = derive_uuid(p->new_uuid, "file-system-uuid", &p->fs_uuid);
if (r < 0)
return r;
if (!isempty(p->current_label)) {
/* never change initialized labels */
r = free_and_strdup_warn(&p->new_label, p->current_label);
@ -4401,7 +4416,7 @@ static int context_split(Context *context) {
if (lseek(fd, p->offset, SEEK_SET) < 0)
return log_error_errno(errno, "Failed to seek to partition offset: %m");
r = copy_bytes_full(fd, fdt, p->new_size, COPY_REFLINK|COPY_HOLES, NULL, NULL, NULL, NULL);
r = copy_bytes(fd, fdt, p->new_size, COPY_REFLINK|COPY_HOLES);
if (r < 0)
return log_error_errno(r, "Failed to copy to split partition %s: %m", fname);
}
@ -4884,7 +4899,6 @@ static int resolve_copy_blocks_auto(
static int context_open_copy_block_paths(
Context *context,
const char *root,
dev_t restrict_devno) {
int r;
@ -4906,7 +4920,7 @@ static int context_open_copy_block_paths(
if (p->copy_blocks_path) {
source_fd = chase_symlinks_and_open(p->copy_blocks_path, root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &opened);
source_fd = chase_symlinks_and_open(p->copy_blocks_path, p->copy_blocks_root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NONBLOCK, &opened);
if (source_fd < 0)
return log_error_errno(source_fd, "Failed to open '%s': %m", p->copy_blocks_path);
@ -4920,7 +4934,7 @@ static int context_open_copy_block_paths(
} else if (p->copy_blocks_auto) {
dev_t devno;
r = resolve_copy_blocks_auto(p->type_uuid, root, restrict_devno, &devno, &uuid);
r = resolve_copy_blocks_auto(p->type_uuid, p->copy_blocks_root, restrict_devno, &devno, &uuid);
if (r < 0)
return r;
@ -4989,6 +5003,174 @@ static int context_open_copy_block_paths(
return 0;
}
/* Determines how many bytes of the file referred to by 'fd' are backed by data, i.e. the sum of the lengths
 * of all data segments as reported by SEEK_DATA/SEEK_HOLE. Holes are not counted, hence for a sparse file
 * the result is smaller than st_size.
 *
 * The file offset of 'fd' is saved on entry and restored before a successful return. On success 0 is
 * returned and the byte count is stored in *ret. On failure a negative errno-style error is returned
 * (after logging it) and *ret is left untouched. */
static int fd_apparent_size(int fd, uint64_t *ret) {
        uint64_t size = 0;
        off_t initial;

        assert(fd >= 0);
        assert(ret);

        /* SEEK_DATA/SEEK_HOLE move the file offset, hence remember where we started. */
        initial = lseek(fd, 0, SEEK_CUR);
        if (initial < 0)
                return log_error_errno(errno, "Failed to get file offset: %m");

        for (off_t off = 0;;) {
                off_t r;

                /* Find the start of the next data segment at or after 'off'. */
                r = lseek(fd, off, SEEK_DATA);
                if (r < 0 && errno == ENXIO)
                        /* If errno == ENXIO, that means we've reached the final hole of the file and
                         * that hole isn't followed by more data. */
                        break;
                if (r < 0)
                        return log_error_errno(errno, "Failed to seek data in file from offset %"PRIi64": %m", (int64_t) off);

                off = r; /* Set the offset to the start of the data segment. */

                /* Find the end of the data segment by looking for the next hole. If we get ENXIO,
                 * we're at EOF. */
                r = lseek(fd, off, SEEK_HOLE);
                if (r < 0) {
                        if (errno == ENXIO)
                                break;
                        return log_error_errno(errno, "Failed to seek hole in file from offset %"PRIi64": %m", (int64_t) off);
                }

                /* [off, r) is one data segment; account for it and continue behind it. */
                size += (uint64_t) (r - off);
                off = r;
        }

        if (lseek(fd, initial, SEEK_SET) < 0)
                return log_error_errno(errno, "Failed to reset file offset: %m");

        *ret = size;
        return 0;
}
/* For every new (not yet existing, not dropped) partition that has both Format= and Minimize= set, builds
 * the file system in a temporary file below the directory returned by var_tmp_dir() and stores the path of
 * that file in p->copy_blocks_path.
 *
 * Read-only file systems are generated once from a populated source directory. All other file systems are
 * generated and populated twice: first in a large sparse file to measure how much data ends up in it, then
 * again in a file sized according to that measurement.
 *
 * Returns 0 on success, a negative errno-style error on failure. */
static int context_minimize(Context *context) {
        /* Size of the sparse scratch file used for the first (measuring) pass: 1 TiB. */
        const uint64_t scratch_size = 1024ULL * 1024ULL * 1024ULL * 1024ULL;
        _cleanup_set_free_ Set *denylist = NULL;
        const char *vt;
        int r;

        assert(context);

        r = make_copy_files_denylist(context, &denylist);
        if (r < 0)
                return r;

        r = var_tmp_dir(&vt);
        if (r < 0)
                return log_error_errno(r, "Could not determine temporary directory: %m");

        LIST_FOREACH(partitions, p, context->partitions) {
                _cleanup_(rm_rf_physical_and_freep) char *tmp_root = NULL;
                _cleanup_(unlink_and_freep) char *temp = NULL;
                _cleanup_free_ char *root = NULL;
                _cleanup_close_ int fd = -1;
                sd_id128_t fs_uuid;
                uint64_t fsz, fs_min;

                if (p->dropped)
                        continue;

                if (PARTITION_EXISTS(p)) /* Never format existing partitions */
                        continue;

                if (!p->format)
                        continue;

                if (!p->minimize)
                        continue;

                assert(!p->copy_blocks_path);

                r = tempfn_random_child(vt, "repart", &temp);
                if (r < 0)
                        return log_error_errno(r, "Failed to generate temporary file path: %m");

                if (!fstype_is_ro(p->format)) {
                        fd = open(temp, O_CREAT|O_EXCL|O_CLOEXEC|O_RDWR|O_NOCTTY, 0600);
                        if (fd < 0)
                                return log_error_errno(errno, "Failed to open temporary file %s: %m", temp);

                        /* This may seem huge but it will be created sparse so it doesn't take up any space
                         * on disk until written to. */
                        if (ftruncate(fd, scratch_size) < 0)
                                return log_error_errno(errno, "Failed to truncate temporary file to %s: %m",
                                                       FORMAT_BYTES(scratch_size));

                        /* We're going to populate this filesystem twice so use a random UUID the first time
                         * to avoid UUID conflicts. */
                        r = sd_id128_randomize(&fs_uuid);
                        if (r < 0)
                                return log_error_errno(r, "Failed to generate random file system UUID: %m");
                } else {
                        r = partition_populate_directory(p, denylist, &root, &tmp_root);
                        if (r < 0)
                                return r;

                        fs_uuid = p->fs_uuid;
                }

                r = make_filesystem(temp, p->format, strempty(p->new_label), root ?: tmp_root, fs_uuid,
                                    arg_discard);
                if (r < 0)
                        return r;

                /* Read-only filesystems are minimal from the first try because they create and size the
                 * loopback file for us. */
                if (fstype_is_ro(p->format)) {
                        /* From here on the partition refers to the file, hence disarm the automatic
                         * unlink. NOTE(review): nothing visible here removes the file again later on;
                         * confirm the partition teardown takes care of that. */
                        p->copy_blocks_path = TAKE_PTR(temp);
                        continue;
                }

                r = partition_populate_filesystem(p, temp, denylist);
                if (r < 0)
                        return r;

                /* Other filesystems need to be provided with a pre-sized loopback file and will adapt to
                 * fully occupy it. Because we gave the filesystem a 1T sparse file, we need to shrink the
                 * filesystem down to a reasonable size again to fit it in the disk image. While there are
                 * some filesystems that support shrinking, it doesn't always work properly (e.g. shrinking
                 * btrfs gives us a 2.0G filesystem regardless of what we put in it). Instead, let's populate
                 * the filesystem again, but this time, instead of providing the filesystem with a 1T sparse
                 * loopback file, let's size the loopback file based on the actual data used by the
                 * filesystem in the sparse file after the first attempt. This should be a good guess of the
                 * minimal amount of space needed in the filesystem to fit all the required data.
                 */
                r = fd_apparent_size(fd, &fsz);
                if (r < 0)
                        return r;

                /* Massage the size a bit because just going by actual data used in the sparse file isn't
                 * fool-proof: add 50% headroom, round up to the grain size and never go below the minimal
                 * size known for this file system type (UINT64_MAX means no minimum is known). */
                fsz = round_up_size(fsz + (fsz / 2), context->grain_size);
                fs_min = minimal_size_by_fs_name(p->format);
                if (fs_min != UINT64_MAX)
                        fsz = MAX(fs_min, fsz);

                /* Erase the previous filesystem first. */
                if (ftruncate(fd, 0) < 0)
                        return log_error_errno(errno, "Failed to erase temporary file: %m");

                if (ftruncate(fd, fsz) < 0)
                        return log_error_errno(errno, "Failed to truncate temporary file to %s: %m", FORMAT_BYTES(fsz));

                /* Second pass: this time use the final file system UUID derived for the partition. */
                r = make_filesystem(temp, p->format, strempty(p->new_label), root ?: tmp_root, p->fs_uuid,
                                    arg_discard);
                if (r < 0)
                        return r;

                r = partition_populate_filesystem(p, temp, denylist);
                if (r < 0)
                        return r;

                /* Hand the file over to the partition, see the note on the read-only branch above. */
                p->copy_blocks_path = TAKE_PTR(temp);
        }

        return 0;
}
static int help(void) {
_cleanup_free_ char *link = NULL;
int r;
@ -5949,10 +6131,18 @@ static int run(int argc, char *argv[]) {
if (r < 0)
return r;
/* Make sure each partition has a unique UUID and unique label */
r = context_acquire_partition_uuids_and_labels(context);
if (r < 0)
return r;
r = context_minimize(context);
if (r < 0)
return r;
/* Open all files to copy blocks from now, since we want to take their size into consideration */
r = context_open_copy_block_paths(
context,
arg_root,
loop_device ? loop_device->devno : /* if --image= is specified, only allow partitions on the loopback device */
arg_root && !arg_image ? 0 : /* if --root= is specified, don't accept any block device */
(dev_t) -1); /* if neither is specified, make no restrictions */
@ -6005,11 +6195,6 @@ static int run(int argc, char *argv[]) {
/* Now calculate where each new partition gets placed */
context_place_partitions(context);
/* Make sure each partition has a unique UUID and unique label */
r = context_acquire_partition_uuids_and_labels(context);
if (r < 0)
return r;
(void) context_dump(context, node, /*late=*/ false);
r = context_write_partition_table(context, node, from_scratch);

View File

@ -8,7 +8,7 @@
int resize_fs(int fd, uint64_t sz, uint64_t *ret_size);
#define BTRFS_MINIMAL_SIZE (256U*1024U*1024U)
#define XFS_MINIMAL_SIZE (14U*1024U*1024U)
#define XFS_MINIMAL_SIZE (16U*1024U*1024U)
#define EXT4_MINIMAL_SIZE (1024U*1024U)
uint64_t minimal_size_by_fs_magic(statfs_f_type_t magic);

View File

@ -3,6 +3,8 @@
set -e
TEST_DESCRIPTION="test systemd-repart"
IMAGE_NAME="repart"
TEST_FORCE_NEWIMAGE=1
# shellcheck source=test/test-functions
. "$TEST_BASE_DIR/test-functions"
@ -15,6 +17,7 @@ test_append_files() {
fi
instmods dm_verity =md
generate_module_dependencies
image_install -o /sbin/mksquashfs
fi
}

View File

@ -1346,6 +1346,9 @@ create_empty_image() {
root_size=$((4 * root_size))
data_size=$((2 * data_size))
fi
if [ "$IMAGE_NAME" = "repart" ]; then
root_size=$((root_size+=1000))
fi
echo "Setting up ${IMAGE_PUBLIC:?} (${root_size} MB)"
rm -f "${IMAGE_PRIVATE:?}" "$IMAGE_PUBLIC"

View File

@ -831,6 +831,58 @@ EOF
losetup -d "$loop"
}
# Checks that systemd-repart can build an image from partitions with Minimize=yes and that the
# resulting image can be dissected, mounted and unmounted again. One root partition definition is
# generated per file system type whose tooling is installed (ext4, vfat, squashfs).
# Uses $architecture and $seed, which are set outside of this function.
test_minimize() {
    local defs imgs output format

    if systemd-detect-virt --quiet --container; then
        echo "Skipping minimize test in container."
        return
    fi

    defs="$(mktemp --directory "/tmp/test-repart.XXXXXXXXXX")"
    imgs="$(mktemp --directory "/var/tmp/test-repart.XXXXXXXXXX")"
    # shellcheck disable=SC2064
    trap "rm -rf '$defs' '$imgs'" RETURN

    for format in ext4 vfat; do
        # Only test file systems whose mkfs tool is available.
        if ! command -v "mkfs.$format" >/dev/null; then
            continue
        fi

        cat >"$defs/root-$format.conf" <<EOF
[Partition]
Type=root-${architecture}
Format=${format}
CopyFiles=${defs}
Minimize=yes
EOF
    done

    # Only add the squashfs definition when mksquashfs is actually installed, otherwise
    # systemd-repart would fail to create the file system.
    if command -v mksquashfs >/dev/null; then
        cat >"$defs/root-squashfs.conf" <<EOF
[Partition]
Type=root-${architecture}
Format=squashfs
CopyFiles=${defs}
Minimize=yes
EOF
    fi

    output=$(systemd-repart --definitions="$defs" \
                            --seed="$seed" \
                            --dry-run=no \
                            --empty=create \
                            --size=auto \
                            --json=pretty \
                            "$imgs/zzz")

    # Check that we can dissect, mount and unmount a minimized image.

    systemd-dissect "$imgs/zzz"
    systemd-dissect "$imgs/zzz" -M "$imgs/mnt"
    systemd-dissect -U "$imgs/mnt"
}
test_sector() {
local defs imgs output loop
local start size ratio
@ -900,6 +952,7 @@ test_issue_24553
test_zero_uuid
test_verity
test_issue_24786
test_minimize
# Valid block sizes on the Linux block layer are >= 512 and <= PAGE_SIZE, and
# must be powers of 2. Which leaves exactly four different ones to test on