pull: Add --per-object-fsync

This is the opposite of
https://github.com/ostreedev/ostree/issues/1184

Motivated by OpenShift seeing etcd performance issues during
OS updates: https://github.com/openshift/machine-config-operator/issues/1897

Basically, if we switch to invoking `fsync()` as we go, it makes
ostree performance worse (in my tests, 31s to write 2G versus 7s if we
delay sync) but it avoids *huge* outliers in `fsync()` time for etcd.
This commit is contained in:
Colin Walters 2020-07-16 21:13:36 +00:00
parent 4752dd02cf
commit a615d35762
8 changed files with 131 additions and 47 deletions

View File

@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA.
</listitem>
</varlistentry>
<varlistentry>
<term><varname>per-object-fsync</varname></term>
<listitem><para>By default, OSTree will batch fsync() after
writing everything; however, this can cause latency spikes
for other processes which are also invoking fsync().
Turn on this boolean to reduce potential latency spikes,
at the cost of slowing down OSTree updates. You most
likely want this on by default for "background" OS updates.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>min-free-space-percent</varname></term>
<listitem>

View File

@ -51,15 +51,33 @@
#define FICLONE _IOW(0x94, 9, int)
#endif
/* If fsync is enabled and we're in a txn, we write into a staging dir for
* commit, but we also allow direct writes into objects/ for e.g. hardlink
* imports.
/* Understanding ostree's fsync strategy
*
* A long time ago, ostree used to invoke fsync() on each object,
* then move it into the objects directory. However, it turned
* out to be a *lot* faster to write the objects into a separate "staging"
* directory (letting the filesystem handle writeback how it likes)
* and then only walk over each of the files, fsync(), then rename()
* into place. See also https://lwn.net/Articles/789024/
*
* (We also support a "disable fsync entirely" mode, where you don't
* care about integrity; e.g. test suites using disposable VMs).
*
* This "delayed fsync" pattern though is much worse for other concurrent processes
* like databases because it forces a lot to go through the filesystem
* journal at once once we do the sync. So now we support a `per_object_fsync`
* option that again invokes `fsync()` directly. This also notably
* provides "backpressure", ensuring we aren't queuing up a huge amount
* of I/O at once.
*/
/* The directory where we place content */
static int
commit_dest_dfd (OstreeRepo *self)
{
if (self->in_transaction && !self->disable_fsync)
if (self->per_object_fsync)
return self->objects_dir_fd;
else if (self->in_transaction && !self->disable_fsync)
return self->commit_stagedir.fd;
else
return self->objects_dir_fd;
@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self,
/* Ensure that in case of a power cut, these files have the data we
* want. See http://lwn.net/Articles/322823/
*/
if (!self->in_transaction && !self->disable_fsync)
if (!self->disable_fsync && self->per_object_fsync)
{
if (fsync (tmpf->fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self,
return TRUE;
}
/* Synchronize the directories holding the objects */
static gboolean
fsync_object_dirs (OstreeRepo *self,
GCancellable *cancellable,
GError **error)
{
GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error);
g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
if (self->disable_fsync)
return TRUE; /* No fsync? Nothing to do then. */
if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error))
return FALSE;
while (TRUE)
{
struct dirent *dent;
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
break;
if (dent->d_type != DT_DIR)
continue;
/* All object directories only have two character entries */
if (strlen (dent->d_name) != 2)
continue;
glnx_autofd int target_dir_fd = -1;
if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE,
&target_dir_fd, error))
return FALSE;
/* This synchronizes the directory to ensure all the objects we wrote
* are there. We need to do this before removing the .commitpartial
* stamp (or have a ref point to the commit).
*/
if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}
/* In case we created any loose object subdirs, make sure they are on disk */
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
return TRUE;
}
/* Called for commit, to iterate over the "staging" directory and rename all the
* objects into the primary objects/ location. Notably this is called only after
* syncfs() has potentially been invoked to ensure that all objects have been
@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self,
while (TRUE)
{
struct dirent *dent;
gboolean renamed_some_object = FALSE;
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self,
if (strlen (dent->d_name) != 2)
continue;
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE,
&child_dfd_iter, error))
return FALSE;
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
loose_objpath[0] = dent->d_name[0];
loose_objpath[1] = dent->d_name[1];
loose_objpath[2] = '/';
@ -1899,35 +1961,7 @@ rename_pending_loose_objects (OstreeRepo *self,
if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3,
self->objects_dir_fd, loose_objpath, error))
return FALSE;
renamed_some_object = TRUE;
}
if (renamed_some_object && !self->disable_fsync)
{
/* Ensure that in the case of a power cut all the directory metadata that
we want has reached the disk. In particular, we want this before we
update the refs to point to these objects. */
glnx_autofd int target_dir_fd = -1;
loose_objpath[2] = 0;
if (!glnx_opendirat (self->objects_dir_fd,
loose_objpath, FALSE,
&target_dir_fd,
error))
return FALSE;
if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}
}
/* In case we created any loose object subdirs, make sure they are on disk */
if (!self->disable_fsync)
{
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}
return TRUE;
@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self,
if (!rename_pending_loose_objects (self, cancellable, error))
return FALSE;
if (!fsync_object_dirs (self, cancellable, error))
return FALSE;
g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path));
if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error))
return FALSE;

View File

@ -38,13 +38,17 @@ G_BEGIN_DECLS
#define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8
#define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2
/* In most cases, writing to disk should be much faster than
* fetching from the network, so we shouldn't actually hit
* this. But if using pipelining and e.g. pulling over LAN
* (or writing to slow media), we can have a runaway
* situation towards EMFILE.
/* We want some parallelism with disk writes, but we also
* want to avoid starting tens or hundreds of threads
* (via GTask) all writing to disk. Eventually we may
* use io_uring which handles backpressure correctly.
* Also, in "immediate fsync" mode, this helps provide
* much more backpressure, helping our I/O patterns
* be nicer for any concurrent processes, such as etcd
* or other databases.
* https://github.com/openshift/machine-config-operator/issues/1897
* */
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3
/* Well-known keys for the additional metadata field in a summary file. */
#define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified"
@ -147,6 +151,7 @@ struct OstreeRepo {
GError *writable_error;
gboolean in_transaction;
gboolean disable_fsync;
gboolean per_object_fsync;
gboolean disable_xattrs;
guint zlib_compression_level;
GHashTable *loose_object_devino_hash;

View File

@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data,
* * disable-sign-verify (b): Disable signapi verification of commits
* * disable-sign-verify-summary (b): Disable signapi verification of the summary
* * depth (i): How far in the history to traverse; default is 0, -1 means infinite
* * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync
* * disable-static-deltas (b): Do not use static deltas
* * require-static-deltas (b): Require static deltas
* * override-commit-ids (as): Array of specific commit IDs to fetch for refs
@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
g_autoptr(GVariantIter) collection_refs_iter = NULL;
g_autofree char **override_commit_ids = NULL;
g_autoptr(GSource) update_timeout = NULL;
gboolean opt_per_object_fsync = FALSE;
gboolean opt_gpg_verify_set = FALSE;
gboolean opt_gpg_verify_summary_set = FALSE;
gboolean opt_collection_refs_set = FALSE;
@ -3385,6 +3387,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
(void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas);
(void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids);
(void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run);
(void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync);
(void) g_variant_lookup (options, "override-url", "&s", &url_override);
(void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction);
(void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers);
@ -3453,6 +3456,10 @@ ostree_repo_pull_with_options (OstreeRepo *self,
pull_data->main_context = g_main_context_ref_thread_default ();
pull_data->flags = flags;
/* TODO: Avoid mutating the repo object */
if (opt_per_object_fsync)
self->per_object_fsync = TRUE;
if (!opt_n_network_retries_set)
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;

View File

@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self,
ostree_repo_set_disable_fsync (self, TRUE);
}
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync",
FALSE, &self->per_object_fsync, error))
return FALSE;
/* See https://github.com/ostreedev/ostree/issues/758 */
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs",
FALSE, &self->disable_xattrs, error))

View File

@ -34,6 +34,7 @@
static char *opt_remote;
static gboolean opt_commit_only;
static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_untrusted;
static gboolean opt_bareuseronly_files;
static gboolean opt_require_static_deltas;
@ -50,6 +51,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL },
{ "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc
g_variant_new_variant (g_variant_new_boolean (TRUE)));
g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (console.is_tty)
progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console);

View File

@ -29,6 +29,7 @@
#include "otutil.h"
static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_mirror;
static gboolean opt_commit_only;
static gboolean opt_dry_run;
@ -58,6 +59,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
{ "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL },
@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation,
if (opt_localcache_repos)
g_variant_builder_add (&builder, "{s@v}", "localcache-repos",
g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1)));
if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_http_headers)
{
GVariantBuilder hdr_builder;

View File

@ -55,10 +55,10 @@ function verify_initial_contents() {
}
if has_gpgme; then
echo "1..34"
echo "1..35"
else
# 3 tests needs GPG support
echo "1..31"
echo "1..32"
fi
# Try both syntaxes
@ -74,6 +74,16 @@ cd ${test_tmpdir}
verify_initial_contents
echo "ok pull contents"
# And a test with incremental fsync
repo_init --no-sign-verify
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt
assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt
assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo fsck
verify_initial_contents
echo "ok pull --per-object-fsync"
cd ${test_tmpdir}
mkdir mirrorrepo
ostree_repo_init mirrorrepo --mode=archive