d0c2f4fa55
Often an fsync needs to fall back to a transaction commit for several reasons (to ensure consistency after a power failure, because a new block group was allocated, or because a temporary error such as ENOMEM or ENOSPC happened). In that case the log is marked as needing a full commit and any concurrent tasks attempting to log inodes or commit the log will also fall back to the transaction commit. When this happens they all wait for the task that first started the transaction commit to finish it. However, they wait until the full transaction commit happens, which is not needed: they only need to wait for the superblocks to be persisted, not for the unpinning of all the extents pinned during the transaction's lifetime. Even for short-lived transactions the number of pinned extents can be a few thousand and unpinning them can take a significant amount of time - for dbench workloads I have observed up to 4~5 milliseconds of time spent unpinning extents in the worst cases, and the number of pinned extents was between 2 and 3 thousand. So allow fsync tasks to skip waiting for the unpinning of extents when they call btrfs_commit_transaction() and they were not the task that started the transaction commit (that one has to do it; the alternative would be to offload the transaction commit to another task so that it could avoid waiting for the extent unpinning, or to offload the extent unpinning to another task).
This patch is part of a patchset composed of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit After applying the entire patchset, dbench shows improvements with respect to throughput and latency. The script used to measure it is the following: $ cat dbench-test.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-m single -d single" echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor umount $DEV &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 300 64 umount $MNT The test was run on a physical machine with 12 cores (Intel Core i7), 64G of RAM, using an NVMe device and a non-debug kernel configuration (Debian's default configuration).
Before applying patchset, 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9627107 0.153 61.938 Close 7072076 0.001 3.175 Rename 407633 1.222 44.439 Unlink 1943895 0.658 44.440 Deltree 256 17.339 110.891 Mkdir 128 0.003 0.009 Qpathinfo 8725406 0.064 17.850 Qfileinfo 1529516 0.001 2.188 Qfsinfo 1599884 0.002 1.457 Sfileinfo 784200 0.005 3.562 Find 3373513 0.411 30.312 WriteX 4802132 0.053 29.054 ReadX 15089959 0.002 5.801 LockX 31344 0.002 0.425 UnlockX 31344 0.001 0.173 Flush 674724 5.952 341.830 Throughput 1008.02 MB/sec 32 clients 32 procs max_latency=341.833 ms After applying patchset, 32 clients: After patchset, with 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9931568 0.111 25.597 Close 7295730 0.001 2.171 Rename 420549 0.982 49.714 Unlink 2005366 0.497 39.015 Deltree 256 11.149 89.242 Mkdir 128 0.002 0.014 Qpathinfo 9001863 0.049 20.761 Qfileinfo 1577730 0.001 2.546 Qfsinfo 1650508 0.002 3.531 Sfileinfo 809031 0.005 5.846 Find 3480259 0.309 23.977 WriteX 4952505 0.043 41.283 ReadX 15568127 0.002 5.476 LockX 32338 0.002 0.978 UnlockX 32338 0.001 2.032 Flush 696017 7.485 228.835 Throughput 1049.91 MB/sec 32 clients 32 procs max_latency=228.847 ms --> +4.1% throughput, -39.6% max latency Before applying patchset, 64 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 8956748 0.342 108.312 Close 6579660 0.001 3.823 Rename 379209 2.396 81.897 Unlink 1808625 1.108 131.148 Deltree 256 25.632 172.176 Mkdir 128 0.003 0.018 Qpathinfo 8117615 0.131 55.916 Qfileinfo 1423495 0.001 2.635 Qfsinfo 1488496 0.002 5.412 Sfileinfo 729472 0.007 8.643 Find 3138598 0.855 78.321 WriteX 4470783 0.102 79.442 ReadX 14038139 0.002 7.578 LockX 29158 0.002 0.844 UnlockX 29158 0.001 0.567 Flush 627746 14.168 506.151 Throughput 924.738 MB/sec 64 clients 64 procs max_latency=506.154 ms After applying patchset, 64 clients: Operation Count AvgLat MaxLat 
---------------------------------------- NTCreateX 9069003 0.303 43.193 Close 6662328 0.001 3.888 Rename 383976 2.194 46.418 Unlink 1831080 1.022 43.873 Deltree 256 24.037 155.763 Mkdir 128 0.002 0.005 Qpathinfo 8219173 0.137 30.233 Qfileinfo 1441203 0.001 3.204 Qfsinfo 1507092 0.002 4.055 Sfileinfo 738775 0.006 5.431 Find 3177874 0.936 38.170 WriteX 4526152 0.084 39.518 ReadX 14213562 0.002 24.760 LockX 29522 0.002 1.221 UnlockX 29522 0.001 0.694 Flush 635652 14.358 422.039 Throughput 990.13 MB/sec 64 clients 64 procs max_latency=422.043 ms --> +6.8% throughput, -18.1% max latency Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
239 lines
7.6 KiB
C
239 lines
7.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_TRANSACTION_H
|
|
#define BTRFS_TRANSACTION_H
|
|
|
|
#include <linux/refcount.h>
|
|
#include "btrfs_inode.h"
|
|
#include "delayed-ref.h"
|
|
#include "ctree.h"
|
|
|
|
/*
 * States of a transaction, listed in ascending numeric order.
 *
 * TRANS_STATE_MAX is not a state; being the last enumerator, its value
 * equals the number of states.
 *
 * NOTE(review): TRANS_STATE_SUPER_COMMITTED presumably marks the point where
 * the super blocks have been persisted but pinned extents have not yet been
 * unpinned, so that fsync tasks can stop waiting early - confirm against the
 * commit path in transaction.c.
 */
enum btrfs_trans_state {
	TRANS_STATE_RUNNING,
	TRANS_STATE_COMMIT_START,
	TRANS_STATE_COMMIT_DOING,
	TRANS_STATE_UNBLOCKED,
	TRANS_STATE_SUPER_COMMITTED,
	TRANS_STATE_COMPLETED,
	TRANS_STATE_MAX,
};
|
|
|
|
/*
 * NOTE(review): these look like bit numbers for the "flags" member of
 * struct btrfs_transaction (used with set_bit()/test_bit() style helpers) -
 * confirm against the users in transaction.c.
 */
#define BTRFS_TRANS_HAVE_FREE_BGS	0
#define BTRFS_TRANS_DIRTY_BG_RUN	1
#define BTRFS_TRANS_CACHE_ENOSPC	2
|
|
|
|
struct btrfs_transaction {
|
|
u64 transid;
|
|
/*
|
|
* total external writers(USERSPACE/START/ATTACH) in this
|
|
* transaction, it must be zero before the transaction is
|
|
* being committed
|
|
*/
|
|
atomic_t num_extwriters;
|
|
/*
|
|
* total writers in this transaction, it must be zero before the
|
|
* transaction can end
|
|
*/
|
|
atomic_t num_writers;
|
|
refcount_t use_count;
|
|
|
|
unsigned long flags;
|
|
|
|
/* Be protected by fs_info->trans_lock when we want to change it. */
|
|
enum btrfs_trans_state state;
|
|
int aborted;
|
|
struct list_head list;
|
|
struct extent_io_tree dirty_pages;
|
|
time64_t start_time;
|
|
wait_queue_head_t writer_wait;
|
|
wait_queue_head_t commit_wait;
|
|
struct list_head pending_snapshots;
|
|
struct list_head dev_update_list;
|
|
struct list_head switch_commits;
|
|
struct list_head dirty_bgs;
|
|
|
|
/*
|
|
* There is no explicit lock which protects io_bgs, rather its
|
|
* consistency is implied by the fact that all the sites which modify
|
|
* it do so under some form of transaction critical section, namely:
|
|
*
|
|
* - btrfs_start_dirty_block_groups - This function can only ever be
|
|
* run by one of the transaction committers. Refer to
|
|
* BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
|
|
*
|
|
* - btrfs_write_dirty_blockgroups - this is called by
|
|
* commit_cowonly_roots from transaction critical section
|
|
* (TRANS_STATE_COMMIT_DOING)
|
|
*
|
|
* - btrfs_cleanup_dirty_bgs - called on transaction abort
|
|
*/
|
|
struct list_head io_bgs;
|
|
struct list_head dropped_roots;
|
|
struct extent_io_tree pinned_extents;
|
|
|
|
/*
|
|
* we need to make sure block group deletion doesn't race with
|
|
* free space cache writeout. This mutex keeps them from stomping
|
|
* on each other
|
|
*/
|
|
struct mutex cache_write_mutex;
|
|
spinlock_t dirty_bgs_lock;
|
|
/* Protected by spin lock fs_info->unused_bgs_lock. */
|
|
struct list_head deleted_bgs;
|
|
spinlock_t dropped_roots_lock;
|
|
struct btrfs_delayed_ref_root delayed_refs;
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
/*
|
|
* Number of ordered extents the transaction must wait for before
|
|
* committing. These are ordered extents started by a fast fsync.
|
|
*/
|
|
atomic_t pending_ordered;
|
|
wait_queue_head_t pending_wait;
|
|
};
|
|
|
|
/*
 * Single-bit building blocks for the transaction handle "type" values below.
 * Bit 0 is a modifier; bits 9-14 each name one way of obtaining a handle.
 */
#define __TRANS_FREEZABLE	(1U << 0)

#define __TRANS_START		(1U << 9)
#define __TRANS_ATTACH		(1U << 10)
#define __TRANS_JOIN		(1U << 11)
#define __TRANS_JOIN_NOLOCK	(1U << 12)
#define __TRANS_DUMMY		(1U << 13)
#define __TRANS_JOIN_NOSTART	(1U << 14)

/* Composed handle types: START and JOIN additionally carry FREEZABLE. */
#define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH		(__TRANS_ATTACH)
#define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
#define TRANS_JOIN_NOSTART	(__TRANS_JOIN_NOSTART)

/* Mask of the handle kinds that count as external writers. */
#define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)

/*
 * Sentinel pointer value (never a valid address).
 * NOTE(review): presumably stored in place of a real transaction handle
 * while a send operation is running - confirm against its users.
 */
#define BTRFS_SEND_TRANS_STUB	((void *)1)
|
|
|
|
struct btrfs_trans_handle {
|
|
u64 transid;
|
|
u64 bytes_reserved;
|
|
u64 chunk_bytes_reserved;
|
|
unsigned long delayed_ref_updates;
|
|
struct btrfs_transaction *transaction;
|
|
struct btrfs_block_rsv *block_rsv;
|
|
struct btrfs_block_rsv *orig_rsv;
|
|
refcount_t use_count;
|
|
unsigned int type;
|
|
/*
|
|
* Error code of transaction abort, set outside of locks and must use
|
|
* the READ_ONCE/WRITE_ONCE access
|
|
*/
|
|
short aborted;
|
|
bool adding_csums;
|
|
bool allocating_chunk;
|
|
bool can_flush_pending_bgs;
|
|
bool reloc_reserved;
|
|
bool dirty;
|
|
bool in_fsync;
|
|
struct btrfs_root *root;
|
|
struct btrfs_fs_info *fs_info;
|
|
struct list_head new_bgs;
|
|
};
|
|
|
|
/*
 * The abort status can be changed between calls and is not protected by locks.
 * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
 * set to a non-zero value it does not change, so the macro should be used in
 * checks but is not necessary for further reads of the value.
 *
 * Evaluates to non-zero when (trans)->aborted is non-zero; the read goes
 * through READ_ONCE() and the result is hinted as unlikely.
 */
#define TRANS_ABORTED(trans)		(unlikely(READ_ONCE((trans)->aborted)))
|
|
|
|
struct btrfs_pending_snapshot {
|
|
struct dentry *dentry;
|
|
struct inode *dir;
|
|
struct btrfs_root *root;
|
|
struct btrfs_root_item *root_item;
|
|
struct btrfs_root *snap;
|
|
struct btrfs_qgroup_inherit *inherit;
|
|
struct btrfs_path *path;
|
|
/* block reservation for the operation */
|
|
struct btrfs_block_rsv block_rsv;
|
|
/* extra metadata reservation for relocation */
|
|
int error;
|
|
/* Preallocated anonymous block device number */
|
|
dev_t anon_dev;
|
|
bool readonly;
|
|
struct list_head list;
|
|
};
|
|
|
|
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *inode)
|
|
{
|
|
spin_lock(&inode->lock);
|
|
inode->last_trans = trans->transaction->transid;
|
|
inode->last_sub_trans = inode->root->log_transid;
|
|
inode->last_log_commit = inode->root->last_log_commit;
|
|
spin_unlock(&inode->lock);
|
|
}
|
|
|
|
/*
|
|
* Make qgroup codes to skip given qgroupid, means the old/new_roots for
|
|
* qgroup won't contain the qgroupid in it.
|
|
*/
|
|
static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
|
|
u64 qgroupid)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = qgroupid;
|
|
}
|
|
|
|
static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(!delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = 0;
|
|
}
|
|
|
|
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
|
|
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
|
|
struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
|
|
struct btrfs_root *root);
|
|
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
|
|
|
|
void btrfs_add_dead_root(struct btrfs_root *root);
|
|
int btrfs_defrag_root(struct btrfs_root *root);
|
|
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
|
|
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
|
|
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
|
|
int wait_for_unblock);
|
|
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
|
|
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
|
|
void btrfs_throttle(struct btrfs_fs_info *fs_info);
|
|
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
|
|
struct extent_io_tree *dirty_pages, int mark);
|
|
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
|
|
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
|
|
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
|
|
void btrfs_put_transaction(struct btrfs_transaction *transaction);
|
|
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
|
|
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
|
|
|
|
#endif
|