30b80f3ce0
When logging a directory we start by flushing all its delayed items. That results in adding dir index items to the subvolume btree, for new dentries, and removing dir index items from the subvolume btree for any dentries that were deleted. This makes it straightforward to log a directory simply by iterating over all the modified subvolume btree leaves, especially when we used to log both dir index keys and dir item keys (before commit 339d035424
("btrfs: only copy dir index keys when logging a directory")) and when we used to copy old dir index entries for leaves modified in the current transaction (before commit 732d591a5d
("btrfs: stop copying old dir items when logging a directory")). From an efficiency point of view this has a couple of drawbacks: 1) Adds extra latency, due to copying delayed items to the subvolume btree and deleting dir index items from the btree. Further if there are other tasks accessing the btree, which is common (syscalls like creat, mkdir, rename, link, unlink, truncate, reflinks, etc, finishing an ordered extent, etc), lock contention can cause further delays, both to the task logging a directory and to the other tasks accessing the btree; 2) More time spent overall flushing delayed items, if after logging the directory further changes are done to the directory in the same transaction. For example, if we add 10 dentries to a directory, fsync it, add more 10 dentries, fsync it again, then add more 10 dentries and fsync it again, then we end up inserting 3 batches of 10 items to the subvolume btree. With the changes from this patch, we flush all the delayed items to the btree only once - a single batch of 30 items, and outside the logging code (transaction commit or when delayed items are flushed asynchronously). This change simply skips the flushing of delayed items every time we log a directory. Instead we copy the delayed insertion items directly to the log tree and delete delayed deletion items directly from the log tree. Therefore avoiding changing first the subvolume btree and then scanning it for new items to copy from it to the log tree and detecting deletions by observing gaps in consecutive dir index keys in subvolume btree leaves. Running the following tests on a non-debug kernel (Debian's default kernel config), on a box with a NVMe device, a 12 cores Intel CPU and 64G of ram, produced the results below. The results compare a branch without this patch and all the other patches it depends on versus the same branch with the patchset applied. 
The patchset is comprised of the following patches: btrfs: don't drop dir index range items when logging a directory btrfs: remove the root argument from log_new_dir_dentries() btrfs: update stale comment for log_new_dir_dentries() btrfs: free list element sooner at log_new_dir_dentries() btrfs: avoid memory allocation at log_new_dir_dentries() for common case btrfs: remove root argument from btrfs_delayed_item_reserve_metadata() btrfs: store index number instead of key in struct btrfs_delayed_item btrfs: remove unused logic when looking up delayed items btrfs: shrink the size of struct btrfs_delayed_item btrfs: search for last logged dir index if it's not cached in the inode btrfs: move need_log_inode() to above log_conflicting_inodes() btrfs: move log_new_dir_dentries() above btrfs_log_inode() btrfs: log conflicting inodes without holding log mutex of the initial inode btrfs: skip logging parent dir when conflicting inode is not a dir btrfs: use delayed items when logging a directory Custom test script for testing time spent at btrfs_log_inode(): #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 # Total number of files to create in the test directory. NUM_FILES=10000 # Fsync after creating or renaming N files. FSYNC_AFTER=100 umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT TEST_DIR=$MNT/testdir mkdir $TEST_DIR echo "Creating files..." for ((i = 1; i <= $NUM_FILES; i++)); do echo -n > $TEST_DIR/file_$i if (( ($i % $FSYNC_AFTER) == 0 )); then xfs_io -c "fsync" $TEST_DIR fi done sync echo "Renaming files..." 
for ((i = 1; i <= $NUM_FILES; i++)); do mv $TEST_DIR/file_$i $TEST_DIR/file_$i.renamed if (( ($i % $FSYNC_AFTER) == 0 )); then xfs_io -c "fsync" $TEST_DIR fi done umount $MNT And using the following bpftrace script to capture the total time that is spent at btrfs_log_inode(): #!/usr/bin/bpftrace k:btrfs_log_inode { @start_log_inode[tid] = nsecs; } kr:btrfs_log_inode /@start_log_inode[tid]/ { $dur = (nsecs - @start_log_inode[tid]) / 1000; @btrfs_log_inode_total_time = sum($dur); delete(@start_log_inode[tid]); } END { clear(@start_log_inode); } Result before applying patchset: @btrfs_log_inode_total_time: 622642 Result after applying patchset: @btrfs_log_inode_total_time: 354134 (-43.1% time spent) The following dbench script was also used for testing: #!/bin/bash NUM_JOBS=$(nproc --all) DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-O no-holes -R free-space-tree" echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor umount $DEV &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT --skip-cleanup -t 120 -S $NUM_JOBS umount $MNT Before patchset: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 3322265 0.034 21.032 Close 2440562 0.002 0.994 Rename 140664 1.150 269.633 Unlink 670796 1.093 269.678 Deltree 96 5.481 15.510 Mkdir 48 0.004 0.052 Qpathinfo 3010924 0.014 8.127 Qfileinfo 528055 0.001 0.518 Qfsinfo 552113 0.003 0.372 Sfileinfo 270575 0.005 0.688 Find 1164176 0.052 13.931 WriteX 1658537 0.019 5.918 ReadX 5207412 0.003 1.034 LockX 10818 0.003 0.079 UnlockX 10818 0.002 0.313 Flush 232811 1.027 269.735 Throughput 869.867 MB/sec (sync dirs) 12 clients 12 procs max_latency=269.741 ms After patchset: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 4152738 0.029 20.863 Close 3050770 0.002 1.119 Rename 175829 0.871 211.741 Unlink 838447 0.845 211.724 Deltree 120 4.798 14.162 Mkdir 60 0.003 0.005 Qpathinfo 3763807 0.011 
4.673 Qfileinfo 660111 0.001 0.400 Qfsinfo 690141 0.003 0.429 Sfileinfo 338260 0.005 0.725 Find 1455273 0.046 6.787 WriteX 2073307 0.017 5.690 ReadX 6509193 0.003 1.171 LockX 13522 0.003 0.077 UnlockX 13522 0.002 0.125 Flush 291044 0.811 211.631 Throughput 1089.27 MB/sec (sync dirs) 12 clients 12 procs max_latency=211.750 ms (+25.2% throughput, -21.5% max latency) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
177 lines
5.3 KiB
C
177 lines
5.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2011 Fujitsu. All rights reserved.
|
|
* Written by Miao Xie <miaox@cn.fujitsu.com>
|
|
*/
|
|
|
|
#ifndef BTRFS_DELAYED_INODE_H
|
|
#define BTRFS_DELAYED_INODE_H
|
|
|
|
#include <linux/rbtree.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/list.h>
|
|
#include <linux/wait.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/refcount.h>
|
|
#include "ctree.h"
|
|
|
|
/* Distinguishes delayed insertion items from delayed deletion items. */
enum btrfs_delayed_item_type {
	BTRFS_DELAYED_INSERTION_ITEM,
	BTRFS_DELAYED_DELETION_ITEM
};
|
|
|
|
struct btrfs_delayed_root {
|
|
spinlock_t lock;
|
|
struct list_head node_list;
|
|
/*
|
|
* Used for delayed nodes which is waiting to be dealt with by the
|
|
* worker. If the delayed node is inserted into the work queue, we
|
|
* drop it from this list.
|
|
*/
|
|
struct list_head prepare_list;
|
|
atomic_t items; /* for delayed items */
|
|
atomic_t items_seq; /* for delayed items */
|
|
int nodes; /* for delayed nodes */
|
|
wait_queue_head_t wait;
|
|
};
|
|
|
|
/*
 * NOTE(review): these look like bit numbers for btrfs_delayed_node::flags -
 * confirm against the users in the .c file.
 */
#define BTRFS_DELAYED_NODE_IN_LIST	0
#define BTRFS_DELAYED_NODE_INODE_DIRTY	1
#define BTRFS_DELAYED_NODE_DEL_IREF	2
|
|
|
|
struct btrfs_delayed_node {
|
|
u64 inode_id;
|
|
u64 bytes_reserved;
|
|
struct btrfs_root *root;
|
|
/* Used to add the node into the delayed root's node list. */
|
|
struct list_head n_list;
|
|
/*
|
|
* Used to add the node into the prepare list, the nodes in this list
|
|
* is waiting to be dealt with by the async worker.
|
|
*/
|
|
struct list_head p_list;
|
|
struct rb_root_cached ins_root;
|
|
struct rb_root_cached del_root;
|
|
struct mutex mutex;
|
|
struct btrfs_inode_item inode_item;
|
|
refcount_t refs;
|
|
u64 index_cnt;
|
|
unsigned long flags;
|
|
int count;
|
|
/*
|
|
* The size of the next batch of dir index items to insert (if this
|
|
* node is from a directory inode). Protected by @mutex.
|
|
*/
|
|
u32 curr_index_batch_size;
|
|
/*
|
|
* Number of leaves reserved for inserting dir index items (if this
|
|
* node belongs to a directory inode). This may be larger then the
|
|
* actual number of leaves we end up using. Protected by @mutex.
|
|
*/
|
|
u32 index_item_leaves;
|
|
};
|
|
|
|
struct btrfs_delayed_item {
|
|
struct rb_node rb_node;
|
|
/* Offset value of the corresponding dir index key. */
|
|
u64 index;
|
|
struct list_head tree_list; /* used for batch insert/delete items */
|
|
struct list_head readdir_list; /* used for readdir items */
|
|
/*
|
|
* Used when logging a directory.
|
|
* Insertions and deletions to this list are protected by the parent
|
|
* delayed node's mutex.
|
|
*/
|
|
struct list_head log_list;
|
|
u64 bytes_reserved;
|
|
struct btrfs_delayed_node *delayed_node;
|
|
refcount_t refs;
|
|
enum btrfs_delayed_item_type type:8;
|
|
/*
|
|
* Track if this delayed item was already logged.
|
|
* Protected by the mutex of the parent delayed inode.
|
|
*/
|
|
bool logged;
|
|
/* The maximum leaf size is 64K, so u16 is more than enough. */
|
|
u16 data_len;
|
|
char data[];
|
|
};
|
|
|
|
static inline void btrfs_init_delayed_root(
|
|
struct btrfs_delayed_root *delayed_root)
|
|
{
|
|
atomic_set(&delayed_root->items, 0);
|
|
atomic_set(&delayed_root->items_seq, 0);
|
|
delayed_root->nodes = 0;
|
|
spin_lock_init(&delayed_root->lock);
|
|
init_waitqueue_head(&delayed_root->wait);
|
|
INIT_LIST_HEAD(&delayed_root->node_list);
|
|
INIT_LIST_HEAD(&delayed_root->prepare_list);
|
|
}
|
|
|
|
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
|
|
const char *name, int name_len,
|
|
struct btrfs_inode *dir,
|
|
struct btrfs_disk_key *disk_key, u8 type,
|
|
u64 index);
|
|
|
|
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *dir, u64 index);
|
|
|
|
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
|
|
|
|
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
|
|
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);
|
|
|
|
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
|
|
|
|
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *inode);
|
|
/* Used for evicting the inode. */
|
|
void btrfs_remove_delayed_node(struct btrfs_inode *inode);
|
|
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode);
|
|
int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
|
|
|
|
|
|
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root,
|
|
struct btrfs_inode *inode);
|
|
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
|
|
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
|
|
|
|
/* Used when dropping a dead root. */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);

/* Used when cleaning up the transaction. */
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
|
|
|
|
/* Used for readdir() */
|
|
bool btrfs_readdir_get_delayed_items(struct inode *inode,
|
|
struct list_head *ins_list,
|
|
struct list_head *del_list);
|
|
void btrfs_readdir_put_delayed_items(struct inode *inode,
|
|
struct list_head *ins_list,
|
|
struct list_head *del_list);
|
|
int btrfs_should_delete_dir_index(struct list_head *del_list,
|
|
u64 index);
|
|
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
|
|
struct list_head *ins_list);
|
|
|
|
/*
 * Used during directory logging.
 * NOTE(review): presumably the items are linked into @ins_list and @del_list
 * through btrfs_delayed_item::log_list - confirm against the .c file.
 */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list);
void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list);
|
|
|
|
/* for init */
|
|
int __init btrfs_delayed_inode_init(void);
|
|
void __cold btrfs_delayed_inode_exit(void);
|
|
|
|
/* for debugging */
|
|
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
|
|
|
|
#endif
|