diff --git a/man/ostree.repo-config.xml b/man/ostree.repo-config.xml
index 90ac9083..7a01fc01 100644
--- a/man/ostree.repo-config.xml
+++ b/man/ostree.repo-config.xml
@@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA.
+
+ per-object-fsync
+ By default, OSTree will batch fsync() after
+ writing everything; however, this can cause latency spikes
+ for other processes which are also invoking fsync().
+ Turn on this boolean to reduce potential latency spikes,
+ at the cost of slowing down OSTree updates. You most
+ likely want this on by default for "background" OS updates.
+
+
+
+
min-free-space-percent
diff --git a/src/libostree/ostree-repo-commit.c b/src/libostree/ostree-repo-commit.c
index 727e853b..0c9de239 100644
--- a/src/libostree/ostree-repo-commit.c
+++ b/src/libostree/ostree-repo-commit.c
@@ -51,15 +51,33 @@
#define FICLONE _IOW(0x94, 9, int)
#endif
-
-/* If fsync is enabled and we're in a txn, we write into a staging dir for
- * commit, but we also allow direct writes into objects/ for e.g. hardlink
- * imports.
+/* Understanding ostree's fsync strategy
+ *
+ * A long time ago, ostree used to invoke fsync() on each object,
+ * then move it into the objects directory. However, it turned
+ * out to be a *lot* faster to write the objects into a separate "staging"
+ * directory (letting the filesystem handle writeback how it likes)
+ * and then only walk over each of the files, fsync(), then rename()
+ * into place. See also https://lwn.net/Articles/789024/
+ *
+ * (We also support a "disable fsync entirely" mode, where you don't
+ * care about integrity; e.g. test suites using disposable VMs).
+ *
+ * This "delayed fsync" pattern though is much worse for other concurrent processes
+ * like databases because it forces a lot to go through the filesystem
+ * journal at once once we do the sync. So now we support a `per_object_fsync`
+ * option that again invokes `fsync()` directly. This also notably
+ * provides "backpressure", ensuring we aren't queuing up a huge amount
+ * of I/O at once.
*/
+
+/* The directory where we place content */
static int
commit_dest_dfd (OstreeRepo *self)
{
- if (self->in_transaction && !self->disable_fsync)
+ if (self->per_object_fsync)
+ return self->objects_dir_fd;
+ else if (self->in_transaction && !self->disable_fsync)
return self->commit_stagedir.fd;
else
return self->objects_dir_fd;
@@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self,
/* Ensure that in case of a power cut, these files have the data we
* want. See http://lwn.net/Articles/322823/
*/
- if (!self->in_transaction && !self->disable_fsync)
+ if (!self->disable_fsync && self->per_object_fsync)
{
if (fsync (tmpf->fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
@@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self,
return TRUE;
}
+/* Synchronize the directories holding the objects */
+static gboolean
+fsync_object_dirs (OstreeRepo *self,
+ GCancellable *cancellable,
+ GError **error)
+{
+ GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error);
+ g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
+
+ if (self->disable_fsync)
+ return TRUE; /* No fsync? Nothing to do then. */
+
+ if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error))
+ return FALSE;
+ while (TRUE)
+ {
+ struct dirent *dent;
+ if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
+ return FALSE;
+ if (dent == NULL)
+ break;
+ if (dent->d_type != DT_DIR)
+ continue;
+ /* All object directories only have two character entries */
+ if (strlen (dent->d_name) != 2)
+ continue;
+
+ glnx_autofd int target_dir_fd = -1;
+ if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE,
+ &target_dir_fd, error))
+ return FALSE;
+ /* This synchronizes the directory to ensure all the objects we wrote
+ * are there. We need to do this before removing the .commitpartial
+ * stamp (or have a ref point to the commit).
+ */
+ if (fsync (target_dir_fd) == -1)
+ return glnx_throw_errno_prefix (error, "fsync");
+ }
+
+ /* In case we created any loose object subdirs, make sure they are on disk */
+ if (fsync (self->objects_dir_fd) == -1)
+ return glnx_throw_errno_prefix (error, "fsync");
+
+ return TRUE;
+}
+
/* Called for commit, to iterate over the "staging" directory and rename all the
* objects into the primary objects/ location. Notably this is called only after
* syncfs() has potentially been invoked to ensure that all objects have been
@@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self,
while (TRUE)
{
struct dirent *dent;
- gboolean renamed_some_object = FALSE;
- g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
- char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
-
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
@@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self,
if (strlen (dent->d_name) != 2)
continue;
+ g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE,
&child_dfd_iter, error))
return FALSE;
+ char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
loose_objpath[0] = dent->d_name[0];
loose_objpath[1] = dent->d_name[1];
loose_objpath[2] = '/';
@@ -1899,35 +1961,7 @@ rename_pending_loose_objects (OstreeRepo *self,
if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3,
self->objects_dir_fd, loose_objpath, error))
return FALSE;
-
- renamed_some_object = TRUE;
}
-
- if (renamed_some_object && !self->disable_fsync)
- {
- /* Ensure that in the case of a power cut all the directory metadata that
- we want has reached the disk. In particular, we want this before we
- update the refs to point to these objects. */
- glnx_autofd int target_dir_fd = -1;
-
- loose_objpath[2] = 0;
-
- if (!glnx_opendirat (self->objects_dir_fd,
- loose_objpath, FALSE,
- &target_dir_fd,
- error))
- return FALSE;
-
- if (fsync (target_dir_fd) == -1)
- return glnx_throw_errno_prefix (error, "fsync");
- }
- }
-
- /* In case we created any loose object subdirs, make sure they are on disk */
- if (!self->disable_fsync)
- {
- if (fsync (self->objects_dir_fd) == -1)
- return glnx_throw_errno_prefix (error, "fsync");
}
return TRUE;
@@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self,
if (!rename_pending_loose_objects (self, cancellable, error))
return FALSE;
+ if (!fsync_object_dirs (self, cancellable, error))
+ return FALSE;
+
g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path));
if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error))
return FALSE;
diff --git a/src/libostree/ostree-repo-private.h b/src/libostree/ostree-repo-private.h
index a744c069..8c1f5071 100644
--- a/src/libostree/ostree-repo-private.h
+++ b/src/libostree/ostree-repo-private.h
@@ -38,13 +38,17 @@ G_BEGIN_DECLS
#define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8
#define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2
-/* In most cases, writing to disk should be much faster than
- * fetching from the network, so we shouldn't actually hit
- * this. But if using pipelining and e.g. pulling over LAN
- * (or writing to slow media), we can have a runaway
- * situation towards EMFILE.
+/* We want some parallelism with disk writes, but we also
+ * want to avoid starting tens or hundreds of threads
+ * (via GTask) all writing to disk. Eventually we may
+ * use io_uring which handles backpressure correctly.
+ * Also, in "immediate fsync" mode, this helps provide
+ * much more backpressure, helping our I/O patterns
+ * be nicer for any concurrent processes, such as etcd
+ * or other databases.
+ * https://github.com/openshift/machine-config-operator/issues/1897
* */
-#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16
+#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3
/* Well-known keys for the additional metadata field in a summary file. */
#define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified"
@@ -147,6 +151,7 @@ struct OstreeRepo {
GError *writable_error;
gboolean in_transaction;
gboolean disable_fsync;
+ gboolean per_object_fsync;
gboolean disable_xattrs;
guint zlib_compression_level;
GHashTable *loose_object_devino_hash;
diff --git a/src/libostree/ostree-repo-pull.c b/src/libostree/ostree-repo-pull.c
index 6d35be26..5fdbeabc 100644
--- a/src/libostree/ostree-repo-pull.c
+++ b/src/libostree/ostree-repo-pull.c
@@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data,
* * disable-sign-verify (b): Disable signapi verification of commits
* * disable-sign-verify-summary (b): Disable signapi verification of the summary
* * depth (i): How far in the history to traverse; default is 0, -1 means infinite
+ * * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync
* * disable-static-deltas (b): Do not use static deltas
* * require-static-deltas (b): Require static deltas
* * override-commit-ids (as): Array of specific commit IDs to fetch for refs
@@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
g_autoptr(GVariantIter) collection_refs_iter = NULL;
g_autofree char **override_commit_ids = NULL;
g_autoptr(GSource) update_timeout = NULL;
+ gboolean opt_per_object_fsync = FALSE;
gboolean opt_gpg_verify_set = FALSE;
gboolean opt_gpg_verify_summary_set = FALSE;
gboolean opt_collection_refs_set = FALSE;
@@ -3386,6 +3388,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
(void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas);
(void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids);
(void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run);
+ (void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync);
(void) g_variant_lookup (options, "override-url", "&s", &url_override);
(void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction);
(void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers);
@@ -3454,6 +3457,10 @@ ostree_repo_pull_with_options (OstreeRepo *self,
pull_data->main_context = g_main_context_ref_thread_default ();
pull_data->flags = flags;
+ /* TODO: Avoid mutating the repo object */
+ if (opt_per_object_fsync)
+ self->per_object_fsync = TRUE;
+
if (!opt_n_network_retries_set)
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;
diff --git a/src/libostree/ostree-repo.c b/src/libostree/ostree-repo.c
index 82dd286a..95eb0efc 100644
--- a/src/libostree/ostree-repo.c
+++ b/src/libostree/ostree-repo.c
@@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self,
ostree_repo_set_disable_fsync (self, TRUE);
}
+ if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync",
+ FALSE, &self->per_object_fsync, error))
+ return FALSE;
+
/* See https://github.com/ostreedev/ostree/issues/758 */
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs",
FALSE, &self->disable_xattrs, error))
diff --git a/src/ostree/ot-builtin-pull-local.c b/src/ostree/ot-builtin-pull-local.c
index c42d38d7..43f4f255 100644
--- a/src/ostree/ot-builtin-pull-local.c
+++ b/src/ostree/ot-builtin-pull-local.c
@@ -34,6 +34,7 @@
static char *opt_remote;
static gboolean opt_commit_only;
static gboolean opt_disable_fsync;
+static gboolean opt_per_object_fsync;
static gboolean opt_untrusted;
static gboolean opt_bareuseronly_files;
static gboolean opt_require_static_deltas;
@@ -50,6 +51,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
+ { "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL },
{ "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
@@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc
g_variant_new_variant (g_variant_new_boolean (TRUE)));
g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
+ if (opt_per_object_fsync)
+ g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
+ g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (console.is_tty)
progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console);
diff --git a/src/ostree/ot-builtin-pull.c b/src/ostree/ot-builtin-pull.c
index 1625ab47..e69d62e3 100644
--- a/src/ostree/ot-builtin-pull.c
+++ b/src/ostree/ot-builtin-pull.c
@@ -29,6 +29,7 @@
#include "otutil.h"
static gboolean opt_disable_fsync;
+static gboolean opt_per_object_fsync;
static gboolean opt_mirror;
static gboolean opt_commit_only;
static gboolean opt_dry_run;
@@ -58,6 +59,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
+ { "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
{ "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL },
@@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation,
if (opt_localcache_repos)
g_variant_builder_add (&builder, "{s@v}", "localcache-repos",
g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1)));
-
+ if (opt_per_object_fsync)
+ g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
+ g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_http_headers)
{
GVariantBuilder hdr_builder;
diff --git a/tests/pull-test.sh b/tests/pull-test.sh
index 07851a2c..082ed315 100644
--- a/tests/pull-test.sh
+++ b/tests/pull-test.sh
@@ -55,10 +55,10 @@ function verify_initial_contents() {
}
if has_gpgme; then
- echo "1..34"
+ echo "1..35"
else
# 3 tests needs GPG support
- echo "1..31"
+ echo "1..32"
fi
# Try both syntaxes
@@ -74,6 +74,16 @@ cd ${test_tmpdir}
verify_initial_contents
echo "ok pull contents"
+# And a test with incremental fsync
+repo_init --no-sign-verify
+${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt
+assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched"
+${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt
+assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched"
+${CMD_PREFIX} ostree --repo=repo fsck
+verify_initial_contents
+echo "ok pull --per-object-fsync"
+
cd ${test_tmpdir}
mkdir mirrorrepo
ostree_repo_init mirrorrepo --mode=archive