diff --git a/man/ostree.repo-config.xml b/man/ostree.repo-config.xml index 90ac9083..7a01fc01 100644 --- a/man/ostree.repo-config.xml +++ b/man/ostree.repo-config.xml @@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA. + + per-object-fsync + By default, OSTree will batch fsync() after + writing everything; however, this can cause latency spikes + for other processes which are also invoking fsync(). + Turn on this boolean to reduce potential latency spikes, + at the cost of slowing down OSTree updates. You most + likely want this on by default for "background" OS updates. + + + + min-free-space-percent diff --git a/src/libostree/ostree-repo-commit.c b/src/libostree/ostree-repo-commit.c index 727e853b..0c9de239 100644 --- a/src/libostree/ostree-repo-commit.c +++ b/src/libostree/ostree-repo-commit.c @@ -51,15 +51,33 @@ #define FICLONE _IOW(0x94, 9, int) #endif - -/* If fsync is enabled and we're in a txn, we write into a staging dir for - * commit, but we also allow direct writes into objects/ for e.g. hardlink - * imports. +/* Understanding ostree's fsync strategy + * + * A long time ago, ostree used to invoke fsync() on each object, + * then move it into the objects directory. However, it turned + * out to be a *lot* faster to write the objects into a separate "staging" + * directory (letting the filesystem handle writeback how it likes) + * and then only walk over each of the files, fsync(), then rename() + * into place. See also https://lwn.net/Articles/789024/ + * + * (We also support a "disable fsync entirely" mode, where you don't + * care about integrity; e.g. test suites using disposable VMs). + * + * This "delayed fsync" pattern though is much worse for other concurrent processes + * like databases because it forces a lot to go through the filesystem + * journal at once once we do the sync. So now we support a `per_object_fsync` + * option that again invokes `fsync()` directly. This also notably + * provides "backpressure", ensuring we aren't queuing up a huge amount + * of I/O at once. */ + +/* The directory where we place content */ static int commit_dest_dfd (OstreeRepo *self) { - if (self->in_transaction && !self->disable_fsync) + if (self->per_object_fsync) + return self->objects_dir_fd; + else if (self->in_transaction && !self->disable_fsync) return self->commit_stagedir.fd; else return self->objects_dir_fd; @@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self, /* Ensure that in case of a power cut, these files have the data we * want. See http://lwn.net/Articles/322823/ */ - if (!self->in_transaction && !self->disable_fsync) + if (!self->disable_fsync && self->per_object_fsync) { if (fsync (tmpf->fd) == -1) return glnx_throw_errno_prefix (error, "fsync"); @@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self, return TRUE; } +/* Synchronize the directories holding the objects */ +static gboolean +fsync_object_dirs (OstreeRepo *self, + GCancellable *cancellable, + GError **error) +{ + GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error); + g_auto(GLnxDirFdIterator) dfd_iter = { 0, }; + + if (self->disable_fsync) + return TRUE; /* No fsync? Nothing to do then. */ + + if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error)) + return FALSE; + while (TRUE) + { + struct dirent *dent; + if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error)) + return FALSE; + if (dent == NULL) + break; + if (dent->d_type != DT_DIR) + continue; + /* All object directories only have two character entries */ + if (strlen (dent->d_name) != 2) + continue; + + glnx_autofd int target_dir_fd = -1; + if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE, + &target_dir_fd, error)) + return FALSE; + /* This synchronizes the directory to ensure all the objects we wrote + * are there. We need to do this before removing the .commitpartial + * stamp (or have a ref point to the commit). + */ + if (fsync (target_dir_fd) == -1) + return glnx_throw_errno_prefix (error, "fsync"); + } + + /* In case we created any loose object subdirs, make sure they are on disk */ + if (fsync (self->objects_dir_fd) == -1) + return glnx_throw_errno_prefix (error, "fsync"); + + return TRUE; +} + /* Called for commit, to iterate over the "staging" directory and rename all the * objects into the primary objects/ location. Notably this is called only after * syncfs() has potentially been invoked to ensure that all objects have been @@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self, while (TRUE) { struct dirent *dent; - gboolean renamed_some_object = FALSE; - g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, }; - char loose_objpath[_OSTREE_LOOSE_PATH_MAX]; - if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error)) return FALSE; if (dent == NULL) @@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self, if (strlen (dent->d_name) != 2) continue; + g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, }; if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE, &child_dfd_iter, error)) return FALSE; + char loose_objpath[_OSTREE_LOOSE_PATH_MAX]; loose_objpath[0] = dent->d_name[0]; loose_objpath[1] = dent->d_name[1]; loose_objpath[2] = '/'; @@ -1899,35 +1961,7 @@ rename_pending_loose_objects (OstreeRepo *self, if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3, self->objects_dir_fd, loose_objpath, error)) return FALSE; - - renamed_some_object = TRUE; } - - if (renamed_some_object && !self->disable_fsync) - { - /* Ensure that in the case of a power cut all the directory metadata that - we want has reached the disk. In particular, we want this before we - update the refs to point to these objects. */ - glnx_autofd int target_dir_fd = -1; - - loose_objpath[2] = 0; - - if (!glnx_opendirat (self->objects_dir_fd, - loose_objpath, FALSE, - &target_dir_fd, - error)) - return FALSE; - - if (fsync (target_dir_fd) == -1) - return glnx_throw_errno_prefix (error, "fsync"); - } - } - - /* In case we created any loose object subdirs, make sure they are on disk */ - if (!self->disable_fsync) - { - if (fsync (self->objects_dir_fd) == -1) - return glnx_throw_errno_prefix (error, "fsync"); } return TRUE; @@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self, if (!rename_pending_loose_objects (self, cancellable, error)) return FALSE; + if (!fsync_object_dirs (self, cancellable, error)) + return FALSE; + g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path)); if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error)) return FALSE; diff --git a/src/libostree/ostree-repo-private.h b/src/libostree/ostree-repo-private.h index a744c069..8c1f5071 100644 --- a/src/libostree/ostree-repo-private.h +++ b/src/libostree/ostree-repo-private.h @@ -38,13 +38,17 @@ G_BEGIN_DECLS #define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8 #define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2 -/* In most cases, writing to disk should be much faster than - * fetching from the network, so we shouldn't actually hit - * this. But if using pipelining and e.g. pulling over LAN - * (or writing to slow media), we can have a runaway - * situation towards EMFILE. +/* We want some parallelism with disk writes, but we also + * want to avoid starting tens or hundreds of threads + * (via GTask) all writing to disk. Eventually we may + * use io_uring which handles backpressure correctly. + * Also, in "immediate fsync" mode, this helps provide + * much more backpressure, helping our I/O patterns + * be nicer for any concurrent processes, such as etcd + * or other databases. + * https://github.com/openshift/machine-config-operator/issues/1897 * */ -#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16 +#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3 /* Well-known keys for the additional metadata field in a summary file. */ #define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified" @@ -147,6 +151,7 @@ struct OstreeRepo { GError *writable_error; gboolean in_transaction; gboolean disable_fsync; + gboolean per_object_fsync; gboolean disable_xattrs; guint zlib_compression_level; GHashTable *loose_object_devino_hash; diff --git a/src/libostree/ostree-repo-pull.c b/src/libostree/ostree-repo-pull.c index 6d35be26..5fdbeabc 100644 --- a/src/libostree/ostree-repo-pull.c +++ b/src/libostree/ostree-repo-pull.c @@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data, * * disable-sign-verify (b): Disable signapi verification of commits * * disable-sign-verify-summary (b): Disable signapi verification of the summary * * depth (i): How far in the history to traverse; default is 0, -1 means infinite + * * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync * * disable-static-deltas (b): Do not use static deltas * * require-static-deltas (b): Require static deltas * * override-commit-ids (as): Array of specific commit IDs to fetch for refs @@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self, g_autoptr(GVariantIter) collection_refs_iter = NULL; g_autofree char **override_commit_ids = NULL; g_autoptr(GSource) update_timeout = NULL; + gboolean opt_per_object_fsync = FALSE; gboolean opt_gpg_verify_set = FALSE; gboolean opt_gpg_verify_summary_set = FALSE; gboolean opt_collection_refs_set = FALSE; @@ -3386,6 +3388,7 @@ ostree_repo_pull_with_options (OstreeRepo *self, (void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas); (void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids); (void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run); + (void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync); (void) g_variant_lookup (options, "override-url", "&s", &url_override); (void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction); (void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers); @@ -3454,6 +3457,10 @@ ostree_repo_pull_with_options (OstreeRepo *self, pull_data->main_context = g_main_context_ref_thread_default (); pull_data->flags = flags; + /* TODO: Avoid mutating the repo object */ + if (opt_per_object_fsync) + self->per_object_fsync = TRUE; + if (!opt_n_network_retries_set) pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES; diff --git a/src/libostree/ostree-repo.c b/src/libostree/ostree-repo.c index 82dd286a..95eb0efc 100644 --- a/src/libostree/ostree-repo.c +++ b/src/libostree/ostree-repo.c @@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self, ostree_repo_set_disable_fsync (self, TRUE); } + if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync", + FALSE, &self->per_object_fsync, error)) + return FALSE; + /* See https://github.com/ostreedev/ostree/issues/758 */ if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs", FALSE, &self->disable_xattrs, error)) diff --git a/src/ostree/ot-builtin-pull-local.c b/src/ostree/ot-builtin-pull-local.c index c42d38d7..43f4f255 100644 --- a/src/ostree/ot-builtin-pull-local.c +++ b/src/ostree/ot-builtin-pull-local.c @@ -34,6 +34,7 @@ static char *opt_remote; static gboolean opt_commit_only; static gboolean opt_disable_fsync; +static gboolean opt_per_object_fsync; static gboolean opt_untrusted; static gboolean opt_bareuseronly_files; static gboolean opt_require_static_deltas; @@ -50,6 +51,7 @@ static GOptionEntry options[] = { { "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL }, { "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" }, { "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL }, + { "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL }, { "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL }, { "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL }, { "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL }, @@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc g_variant_new_variant (g_variant_new_boolean (TRUE))); g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary", g_variant_new_variant (g_variant_new_boolean (TRUE))); + if (opt_per_object_fsync) + g_variant_builder_add (&builder, "{s@v}", "per-object-fsync", + g_variant_new_variant (g_variant_new_boolean (TRUE))); if (console.is_tty) progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console); diff --git a/src/ostree/ot-builtin-pull.c b/src/ostree/ot-builtin-pull.c index 1625ab47..e69d62e3 100644 --- a/src/ostree/ot-builtin-pull.c +++ b/src/ostree/ot-builtin-pull.c @@ -29,6 +29,7 @@ #include "otutil.h" static gboolean opt_disable_fsync; +static gboolean opt_per_object_fsync; static gboolean opt_mirror; static gboolean opt_commit_only; static gboolean opt_dry_run; @@ -58,6 +59,7 @@ static GOptionEntry options[] = { { "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL }, { "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL }, { "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL }, + { "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL }, { "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL }, { "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL }, { "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL }, @@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation, if (opt_localcache_repos) g_variant_builder_add (&builder, "{s@v}", "localcache-repos", g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1))); - + if (opt_per_object_fsync) + g_variant_builder_add (&builder, "{s@v}", "per-object-fsync", + g_variant_new_variant (g_variant_new_boolean (TRUE))); if (opt_http_headers) { GVariantBuilder hdr_builder; diff --git a/tests/pull-test.sh b/tests/pull-test.sh index 07851a2c..082ed315 100644 --- a/tests/pull-test.sh +++ b/tests/pull-test.sh @@ -55,10 +55,10 @@ function verify_initial_contents() { } if has_gpgme; then - echo "1..34" + echo "1..35" else # 3 tests needs GPG support - echo "1..31" + echo "1..32" fi # Try both syntaxes @@ -74,6 +74,16 @@ cd ${test_tmpdir} verify_initial_contents echo "ok pull contents" +# And a test with incremental fsync +repo_init --no-sign-verify +${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt +assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched" +${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt +assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched" +${CMD_PREFIX} ostree --repo=repo fsck +verify_initial_contents +echo "ok pull --per-object-fsync" + cd ${test_tmpdir} mkdir mirrorrepo ostree_repo_init mirrorrepo --mode=archive