mirror of
https://github.com/ostreedev/ostree.git
synced 2024-12-22 17:35:55 +03:00
pull: Add --per-object-fsync
This is the opposite of https://github.com/ostreedev/ostree/issues/1184 Motivated by OpenShift seeing etcd performance issues during OS updates: https://github.com/openshift/machine-config-operator/issues/1897 Basically, if we switch to invoking `fsync()` as we go, it makes ostree performance worse (in my tests, 31s to write 2G versus 7s if we delay sync) but it avoids *huge* outliers in `fsync()` time for etcd.
This commit is contained in:
parent
4752dd02cf
commit
a615d35762
@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA.
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>per-object-fsync</varname></term>
|
||||
<listitem><para>By default, OSTree will batch fsync() after
|
||||
writing everything; however, this can cause latency spikes
|
||||
for other processes which are also invoking fsync().
|
||||
Turn on this boolean to reduce potential latency spikes,
|
||||
at the cost of slowing down OSTree updates. You most
|
||||
likely want this on by default for "background" OS updates.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>min-free-space-percent</varname></term>
|
||||
<listitem>
|
||||
|
@ -51,15 +51,33 @@
|
||||
#define FICLONE _IOW(0x94, 9, int)
|
||||
#endif
|
||||
|
||||
|
||||
/* If fsync is enabled and we're in a txn, we write into a staging dir for
|
||||
* commit, but we also allow direct writes into objects/ for e.g. hardlink
|
||||
* imports.
|
||||
/* Understanding ostree's fsync strategy
|
||||
*
|
||||
* A long time ago, ostree used to invoke fsync() on each object,
|
||||
* then move it into the objects directory. However, it turned
|
||||
* out to be a *lot* faster to write the objects into a separate "staging"
|
||||
* directory (letting the filesystem handle writeback how it likes)
|
||||
* and then only walk over each of the files, fsync(), then rename()
|
||||
* into place. See also https://lwn.net/Articles/789024/
|
||||
*
|
||||
* (We also support a "disable fsync entirely" mode, where you don't
|
||||
* care about integrity; e.g. test suites using disposable VMs).
|
||||
*
|
||||
* This "delayed fsync" pattern though is much worse for other concurrent processes
|
||||
* like databases because it forces a lot to go through the filesystem
|
||||
* journal at once once we do the sync. So now we support a `per_object_fsync`
|
||||
* option that again invokes `fsync()` directly. This also notably
|
||||
* provides "backpressure", ensuring we aren't queuing up a huge amount
|
||||
* of I/O at once.
|
||||
*/
|
||||
|
||||
/* The directory where we place content */
|
||||
static int
|
||||
commit_dest_dfd (OstreeRepo *self)
|
||||
{
|
||||
if (self->in_transaction && !self->disable_fsync)
|
||||
if (self->per_object_fsync)
|
||||
return self->objects_dir_fd;
|
||||
else if (self->in_transaction && !self->disable_fsync)
|
||||
return self->commit_stagedir.fd;
|
||||
else
|
||||
return self->objects_dir_fd;
|
||||
@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self,
|
||||
/* Ensure that in case of a power cut, these files have the data we
|
||||
* want. See http://lwn.net/Articles/322823/
|
||||
*/
|
||||
if (!self->in_transaction && !self->disable_fsync)
|
||||
if (!self->disable_fsync && self->per_object_fsync)
|
||||
{
|
||||
if (fsync (tmpf->fd) == -1)
|
||||
return glnx_throw_errno_prefix (error, "fsync");
|
||||
@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self,
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* Synchronize the directories holding the objects */
|
||||
static gboolean
|
||||
fsync_object_dirs (OstreeRepo *self,
|
||||
GCancellable *cancellable,
|
||||
GError **error)
|
||||
{
|
||||
GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error);
|
||||
g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
|
||||
|
||||
if (self->disable_fsync)
|
||||
return TRUE; /* No fsync? Nothing to do then. */
|
||||
|
||||
if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error))
|
||||
return FALSE;
|
||||
while (TRUE)
|
||||
{
|
||||
struct dirent *dent;
|
||||
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
|
||||
return FALSE;
|
||||
if (dent == NULL)
|
||||
break;
|
||||
if (dent->d_type != DT_DIR)
|
||||
continue;
|
||||
/* All object directories only have two character entries */
|
||||
if (strlen (dent->d_name) != 2)
|
||||
continue;
|
||||
|
||||
glnx_autofd int target_dir_fd = -1;
|
||||
if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE,
|
||||
&target_dir_fd, error))
|
||||
return FALSE;
|
||||
/* This synchronizes the directory to ensure all the objects we wrote
|
||||
* are there. We need to do this before removing the .commitpartial
|
||||
* stamp (or have a ref point to the commit).
|
||||
*/
|
||||
if (fsync (target_dir_fd) == -1)
|
||||
return glnx_throw_errno_prefix (error, "fsync");
|
||||
}
|
||||
|
||||
/* In case we created any loose object subdirs, make sure they are on disk */
|
||||
if (fsync (self->objects_dir_fd) == -1)
|
||||
return glnx_throw_errno_prefix (error, "fsync");
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* Called for commit, to iterate over the "staging" directory and rename all the
|
||||
* objects into the primary objects/ location. Notably this is called only after
|
||||
* syncfs() has potentially been invoked to ensure that all objects have been
|
||||
@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self,
|
||||
while (TRUE)
|
||||
{
|
||||
struct dirent *dent;
|
||||
gboolean renamed_some_object = FALSE;
|
||||
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
|
||||
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
|
||||
|
||||
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
|
||||
return FALSE;
|
||||
if (dent == NULL)
|
||||
@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self,
|
||||
if (strlen (dent->d_name) != 2)
|
||||
continue;
|
||||
|
||||
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
|
||||
if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE,
|
||||
&child_dfd_iter, error))
|
||||
return FALSE;
|
||||
|
||||
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
|
||||
loose_objpath[0] = dent->d_name[0];
|
||||
loose_objpath[1] = dent->d_name[1];
|
||||
loose_objpath[2] = '/';
|
||||
@ -1899,35 +1961,7 @@ rename_pending_loose_objects (OstreeRepo *self,
|
||||
if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3,
|
||||
self->objects_dir_fd, loose_objpath, error))
|
||||
return FALSE;
|
||||
|
||||
renamed_some_object = TRUE;
|
||||
}
|
||||
|
||||
if (renamed_some_object && !self->disable_fsync)
|
||||
{
|
||||
/* Ensure that in the case of a power cut all the directory metadata that
|
||||
we want has reached the disk. In particular, we want this before we
|
||||
update the refs to point to these objects. */
|
||||
glnx_autofd int target_dir_fd = -1;
|
||||
|
||||
loose_objpath[2] = 0;
|
||||
|
||||
if (!glnx_opendirat (self->objects_dir_fd,
|
||||
loose_objpath, FALSE,
|
||||
&target_dir_fd,
|
||||
error))
|
||||
return FALSE;
|
||||
|
||||
if (fsync (target_dir_fd) == -1)
|
||||
return glnx_throw_errno_prefix (error, "fsync");
|
||||
}
|
||||
}
|
||||
|
||||
/* In case we created any loose object subdirs, make sure they are on disk */
|
||||
if (!self->disable_fsync)
|
||||
{
|
||||
if (fsync (self->objects_dir_fd) == -1)
|
||||
return glnx_throw_errno_prefix (error, "fsync");
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self,
|
||||
if (!rename_pending_loose_objects (self, cancellable, error))
|
||||
return FALSE;
|
||||
|
||||
if (!fsync_object_dirs (self, cancellable, error))
|
||||
return FALSE;
|
||||
|
||||
g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path));
|
||||
if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error))
|
||||
return FALSE;
|
||||
|
@ -38,13 +38,17 @@ G_BEGIN_DECLS
|
||||
#define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8
|
||||
#define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2
|
||||
|
||||
/* In most cases, writing to disk should be much faster than
|
||||
* fetching from the network, so we shouldn't actually hit
|
||||
* this. But if using pipelining and e.g. pulling over LAN
|
||||
* (or writing to slow media), we can have a runaway
|
||||
* situation towards EMFILE.
|
||||
/* We want some parallelism with disk writes, but we also
|
||||
* want to avoid starting tens or hundreds of threads
|
||||
* (via GTask) all writing to disk. Eventually we may
|
||||
* use io_uring which handles backpressure correctly.
|
||||
* Also, in "immediate fsync" mode, this helps provide
|
||||
* much more backpressure, helping our I/O patterns
|
||||
* be nicer for any concurrent processes, such as etcd
|
||||
* or other databases.
|
||||
* https://github.com/openshift/machine-config-operator/issues/1897
|
||||
* */
|
||||
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16
|
||||
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3
|
||||
|
||||
/* Well-known keys for the additional metadata field in a summary file. */
|
||||
#define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified"
|
||||
@ -147,6 +151,7 @@ struct OstreeRepo {
|
||||
GError *writable_error;
|
||||
gboolean in_transaction;
|
||||
gboolean disable_fsync;
|
||||
gboolean per_object_fsync;
|
||||
gboolean disable_xattrs;
|
||||
guint zlib_compression_level;
|
||||
GHashTable *loose_object_devino_hash;
|
||||
|
@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data,
|
||||
* * disable-sign-verify (b): Disable signapi verification of commits
|
||||
* * disable-sign-verify-summary (b): Disable signapi verification of the summary
|
||||
* * depth (i): How far in the history to traverse; default is 0, -1 means infinite
|
||||
* * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync
|
||||
* * disable-static-deltas (b): Do not use static deltas
|
||||
* * require-static-deltas (b): Require static deltas
|
||||
* * override-commit-ids (as): Array of specific commit IDs to fetch for refs
|
||||
@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
|
||||
g_autoptr(GVariantIter) collection_refs_iter = NULL;
|
||||
g_autofree char **override_commit_ids = NULL;
|
||||
g_autoptr(GSource) update_timeout = NULL;
|
||||
gboolean opt_per_object_fsync = FALSE;
|
||||
gboolean opt_gpg_verify_set = FALSE;
|
||||
gboolean opt_gpg_verify_summary_set = FALSE;
|
||||
gboolean opt_collection_refs_set = FALSE;
|
||||
@ -3385,6 +3387,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
|
||||
(void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas);
|
||||
(void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids);
|
||||
(void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run);
|
||||
(void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync);
|
||||
(void) g_variant_lookup (options, "override-url", "&s", &url_override);
|
||||
(void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction);
|
||||
(void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers);
|
||||
@ -3453,6 +3456,10 @@ ostree_repo_pull_with_options (OstreeRepo *self,
|
||||
pull_data->main_context = g_main_context_ref_thread_default ();
|
||||
pull_data->flags = flags;
|
||||
|
||||
/* TODO: Avoid mutating the repo object */
|
||||
if (opt_per_object_fsync)
|
||||
self->per_object_fsync = TRUE;
|
||||
|
||||
if (!opt_n_network_retries_set)
|
||||
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;
|
||||
|
||||
|
@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self,
|
||||
ostree_repo_set_disable_fsync (self, TRUE);
|
||||
}
|
||||
|
||||
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync",
|
||||
FALSE, &self->per_object_fsync, error))
|
||||
return FALSE;
|
||||
|
||||
/* See https://github.com/ostreedev/ostree/issues/758 */
|
||||
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs",
|
||||
FALSE, &self->disable_xattrs, error))
|
||||
|
@ -34,6 +34,7 @@
|
||||
static char *opt_remote;
|
||||
static gboolean opt_commit_only;
|
||||
static gboolean opt_disable_fsync;
|
||||
static gboolean opt_per_object_fsync;
|
||||
static gboolean opt_untrusted;
|
||||
static gboolean opt_bareuseronly_files;
|
||||
static gboolean opt_require_static_deltas;
|
||||
@ -50,6 +51,7 @@ static GOptionEntry options[] = {
|
||||
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
|
||||
{ "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" },
|
||||
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
|
||||
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
|
||||
{ "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL },
|
||||
{ "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL },
|
||||
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
|
||||
@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc
|
||||
g_variant_new_variant (g_variant_new_boolean (TRUE)));
|
||||
g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary",
|
||||
g_variant_new_variant (g_variant_new_boolean (TRUE)));
|
||||
if (opt_per_object_fsync)
|
||||
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
|
||||
g_variant_new_variant (g_variant_new_boolean (TRUE)));
|
||||
|
||||
if (console.is_tty)
|
||||
progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console);
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "otutil.h"
|
||||
|
||||
static gboolean opt_disable_fsync;
|
||||
static gboolean opt_per_object_fsync;
|
||||
static gboolean opt_mirror;
|
||||
static gboolean opt_commit_only;
|
||||
static gboolean opt_dry_run;
|
||||
@ -58,6 +59,7 @@ static GOptionEntry options[] = {
|
||||
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
|
||||
{ "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL },
|
||||
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
|
||||
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
|
||||
{ "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL },
|
||||
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
|
||||
{ "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL },
|
||||
@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation,
|
||||
if (opt_localcache_repos)
|
||||
g_variant_builder_add (&builder, "{s@v}", "localcache-repos",
|
||||
g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1)));
|
||||
|
||||
if (opt_per_object_fsync)
|
||||
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
|
||||
g_variant_new_variant (g_variant_new_boolean (TRUE)));
|
||||
if (opt_http_headers)
|
||||
{
|
||||
GVariantBuilder hdr_builder;
|
||||
|
@ -55,10 +55,10 @@ function verify_initial_contents() {
|
||||
}
|
||||
|
||||
if has_gpgme; then
|
||||
echo "1..34"
|
||||
echo "1..35"
|
||||
else
|
||||
# 3 tests needs GPG support
|
||||
echo "1..31"
|
||||
echo "1..32"
|
||||
fi
|
||||
|
||||
# Try both syntaxes
|
||||
@ -74,6 +74,16 @@ cd ${test_tmpdir}
|
||||
verify_initial_contents
|
||||
echo "ok pull contents"
|
||||
|
||||
# And a test with incremental fsync
|
||||
repo_init --no-sign-verify
|
||||
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt
|
||||
assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched"
|
||||
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt
|
||||
assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched"
|
||||
${CMD_PREFIX} ostree --repo=repo fsck
|
||||
verify_initial_contents
|
||||
echo "ok pull --per-object-fsync"
|
||||
|
||||
cd ${test_tmpdir}
|
||||
mkdir mirrorrepo
|
||||
ostree_repo_init mirrorrepo --mode=archive
|
||||
|
Loading…
Reference in New Issue
Block a user