e383e158ed
When logging an inode, if we need to copy items from subvolume leaves
to the log tree, we clone each subvolume leaf and then use that clone to
copy items to the log tree. This is required to avoid possible deadlocks
as stated in commit 796787c978 ("btrfs: do not modify log tree while
holding a leaf from fs tree locked").
The cloning requires allocating an extent buffer (struct extent_buffer)
and then allocating pages (folios) to attach to the extent buffer. This
may be slow in case we are under memory pressure, and since we are doing
the cloning while holding a read lock on a subvolume leaf, it means we
can be blocking other operations on that leaf for significant periods of
time, which can increase latency on operations like creating other files,
renaming files, etc. Similarly because we're under a log transaction, we
may also cause extra delay on other tasks doing an fsync, because syncing
the log requires waiting for tasks that joined a log transaction to exit
the transaction.
So to improve this, for any inode logging operation that needs to copy
items from a subvolume leaf ("full sync" or "copy everything" bit set
in the inode), preallocate a dummy extent buffer before locking any
extent buffer from the subvolume tree, and even before joining a log
transaction, add it to the log context and then use it when we need to
copy items from a subvolume leaf to the log tree. This avoids making
other operations get extra latency when waiting to lock a subvolume
leaf that is used during inode logging and we are under heavy memory
pressure.
The following test script with bonnie++ was used to test this:
$ cat test.sh
#!/bin/bash
DEV=/dev/sdh
MNT=/mnt/sdh
MOUNT_OPTIONS="-o ssd"
MEMTOTAL_BYTES=`free -b | grep Mem: | awk '{ print $2 }'`
NR_DIRECTORIES=20
NR_FILES=20480
DATASET_SIZE=$((MEMTOTAL_BYTES * 2 / 1048576))
DIRECTORY_SIZE=$((MEMTOTAL_BYTES * 2 / NR_FILES))
NR_FILES=$((NR_FILES / 1024))
echo "performance" | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
umount $DEV &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
bonnie++ -u root -d $MNT \
-n $NR_FILES:$DIRECTORY_SIZE:$DIRECTORY_SIZE:$NR_DIRECTORIES \
-r 0 -s $DATASET_SIZE -b
umount $MNT
The results of this test on an 8G VM running a non-debug kernel (Debian's
default kernel config) were the following.
Before this change:
Version 2.00a ------Sequential Output------ --Sequential Input- --Random-
-Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Name:Size etc /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP
debian0 7501M 376k 99 1.4g 96 117m 14 1510k 99 2.5g 95 +++++ +++
Latency 35068us 24976us 2944ms 30725us 71770us 26152us
Version 2.00a ------Sequential Create------ --------Random Create--------
debian0 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
files:max:min /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP
20:384100:384100/20 20480 32 20480 58 20480 48 20480 39 20480 56 20480 61
Latency 411ms 11914us 119ms 617ms 10296us 110ms
After this change:
Version 2.00a ------Sequential Output------ --Sequential Input- --Random-
-Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Name:Size etc /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP
debian0 7501M 375k 99 1.4g 97 117m 14 1546k 99 2.3g 98 +++++ +++
Latency 35975us 20945us 2144ms 10297us 2217us 6004us
Version 2.00a ------Sequential Create------ --------Random Create--------
debian0 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
files:max:min /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP /sec %CP
20:384100:384100/20 20480 35 20480 58 20480 48 20480 40 20480 57 20480 59
Latency 320ms 11237us 77779us 518ms 6470us 86389us
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
136 lines
4.5 KiB
C
136 lines
4.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2008 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_TREE_LOG_H
|
|
#define BTRFS_TREE_LOG_H
|
|
|
|
#include "messages.h"
|
|
#include "ctree.h"
|
|
#include "transaction.h"
|
|
|
|
/*
 * Return value of btrfs_log_dentry_safe() meaning that nothing has to be
 * logged at all.
 */
#define BTRFS_NO_LOG_SYNC 256

/*
 * The tree log can't be used for whatever reason, so a transaction commit
 * has to be forced instead.
 *
 * The value is negative because there are functions throughout the logging
 * code that need to return an error (< 0), false (0) or true (1). Any
 * negative value will do, as it causes the log to be marked for a full sync.
 */
#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1))
|
|
|
|
struct btrfs_log_ctx {
|
|
int log_ret;
|
|
int log_transid;
|
|
bool log_new_dentries;
|
|
bool logging_new_name;
|
|
bool logging_new_delayed_dentries;
|
|
/* Indicate if the inode being logged was logged before. */
|
|
bool logged_before;
|
|
struct inode *inode;
|
|
struct list_head list;
|
|
/* Only used for fast fsyncs. */
|
|
struct list_head ordered_extents;
|
|
struct list_head conflict_inodes;
|
|
int num_conflict_inodes;
|
|
bool logging_conflict_inodes;
|
|
/*
|
|
* Used for fsyncs that need to copy items from the subvolume tree to
|
|
* the log tree (full sync flag set or copy everything flag set) to
|
|
* avoid allocating a temporary extent buffer while holding a lock on
|
|
* an extent buffer of the subvolume tree and under the log transaction.
|
|
* Also helps to avoid allocating and freeing a temporary extent buffer
|
|
* in case we need to process multiple leaves from the subvolume tree.
|
|
*/
|
|
struct extent_buffer *scratch_eb;
|
|
};
|
|
|
|
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
|
|
struct inode *inode)
|
|
{
|
|
ctx->log_ret = 0;
|
|
ctx->log_transid = 0;
|
|
ctx->log_new_dentries = false;
|
|
ctx->logging_new_name = false;
|
|
ctx->logging_new_delayed_dentries = false;
|
|
ctx->logged_before = false;
|
|
ctx->inode = inode;
|
|
INIT_LIST_HEAD(&ctx->list);
|
|
INIT_LIST_HEAD(&ctx->ordered_extents);
|
|
INIT_LIST_HEAD(&ctx->conflict_inodes);
|
|
ctx->num_conflict_inodes = 0;
|
|
ctx->logging_conflict_inodes = false;
|
|
ctx->scratch_eb = NULL;
|
|
}
|
|
|
|
static inline void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
|
|
{
|
|
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
|
|
|
|
if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
|
|
!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
|
|
return;
|
|
|
|
/*
|
|
* Don't care about allocation failure. This is just for optimization,
|
|
* if we fail to allocate here, we will try again later if needed.
|
|
*/
|
|
ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
|
|
}
|
|
|
|
static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
|
|
{
|
|
struct btrfs_ordered_extent *ordered;
|
|
struct btrfs_ordered_extent *tmp;
|
|
|
|
ASSERT(inode_is_locked(ctx->inode));
|
|
|
|
list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
|
|
list_del_init(&ordered->log_list);
|
|
btrfs_put_ordered_extent(ordered);
|
|
}
|
|
}
|
|
|
|
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
|
|
{
|
|
WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid);
|
|
}
|
|
|
|
static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans)
|
|
{
|
|
return READ_ONCE(trans->fs_info->last_trans_log_full_commit) ==
|
|
trans->transid;
|
|
}
|
|
|
|
int btrfs_sync_log(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root, struct btrfs_log_ctx *ctx);
|
|
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
|
|
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
|
|
struct btrfs_fs_info *fs_info);
|
|
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
|
|
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
|
|
struct dentry *dentry,
|
|
struct btrfs_log_ctx *ctx);
|
|
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root,
|
|
const struct fscrypt_str *name,
|
|
struct btrfs_inode *dir, u64 index);
|
|
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root,
|
|
const struct fscrypt_str *name,
|
|
struct btrfs_inode *inode, u64 dirid);
|
|
void btrfs_end_log_trans(struct btrfs_root *root);
|
|
void btrfs_pin_log_trans(struct btrfs_root *root);
|
|
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *dir, struct btrfs_inode *inode,
|
|
bool for_rename);
|
|
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *dir);
|
|
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
|
|
struct dentry *old_dentry, struct btrfs_inode *old_dir,
|
|
u64 old_dir_index, struct dentry *parent);
|
|
|
|
#endif
|