xfs: prepare repair for bulk loading [v28.3]
Before we start merging the online repair functions, let's improve the
bulk loading code a bit.  First, we need to fix a misinteraction between
the AIL and the btree bulkloader wherein the delwri at the end of the
bulk load fails to queue a buffer for writeback if it happens to be on
the AIL list.

Second, we introduce a defer ops barrier object so that the process of
reaping blocks after a repair cannot queue more than two extents per EFI
log item.  This increases our exposure to leaking blocks if the system
goes down during a reap, but also should prevent transaction overflows,
which result in the system going down.

Third, we change the bulkloader itself to copy multiple records into a
block if possible, and add some debugging knobs so that developers can
control the slack factors, just like they can do for xfs_repair.

This has been running on the djcloud for months with no problems.
Enjoy!

Signed-off-by: Darrick J. Wong <djwong@kernel.org>

-----BEGIN PGP SIGNATURE-----

iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZXzKAwAKCRBKO3ySh0YR
pkcvAP0SEt4VLGrWQJqlWZ5e4sWnqDqVPyT/CQMvG86Qm9VcYwEAzbE0/DaA7uN0
DnceMdho49kTo6FC7+z/lQyGKbl89As=
=8FAT
-----END PGP SIGNATURE-----

Merge tag 'repair-prep-for-bulk-loading-6.8_2023-12-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.8-mergeB

xfs: prepare repair for bulk loading

Before we start merging the online repair functions, let's improve the
bulk loading code a bit.  First, we need to fix a misinteraction between
the AIL and the btree bulkloader wherein the delwri at the end of the
bulk load fails to queue a buffer for writeback if it happens to be on
the AIL list.  Second, we introduce a defer ops barrier object so that
the process of reaping blocks after a repair cannot queue more than two
extents per EFI log item.  This increases our exposure to leaking blocks
if the system goes down during a reap, but also should prevent
transaction overflows, which result in the system going down.  Third, we
change the bulkloader itself to copy multiple records into a block if
possible, and add some debugging knobs so that developers can control
the slack factors, just like they can do for xfs_repair.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'repair-prep-for-bulk-loading-6.8_2023-12-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: constrain dirty buffers while formatting a staged btree
  xfs: move btree bulkload record initialization to ->get_record implementations
  xfs: add debug knobs to control btree bulk load slack factors
  xfs: read leaf blocks when computing keys for bulkloading into node blocks
  xfs: set XBF_DONE on newly formatted btree block that are ready for writing
  xfs: force all buffers to be written during btree bulk load
commit 5e60ca3fad
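To see how the pieces in this branch are meant to fit together, here is a
minimal caller sketch.  It is not part of this series and the
xrep_example_* names are invented for illustration: a repair function
fills in the new xfs_btree_bload fields, including the debug slack knobs
and the dirty-buffer cap, and then runs the usual compute-geometry/bload
sequence.

/*
 * Illustrative sketch only -- the xrep_example_* identifiers do not exist
 * in this series; real callers live in fs/xfs/scrub/.
 */
STATIC int
xrep_example_build_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records,
	void			*priv)
{
	int			error;

	bbl->get_records = xrep_example_get_records;	/* formats records */
	bbl->claim_block = xrep_example_claim_block;	/* hands out new blocks */
	bbl->leaf_slack = xfs_globals.bload_leaf_slack;	/* new debug knobs */
	bbl->node_slack = xfs_globals.bload_node_slack;
	bbl->max_dirty = 128;	/* flush the delwri list every 128 new blocks */

	error = xfs_btree_bload_compute_geometry(cur, bbl, nr_records);
	if (error)
		return error;

	/* Format all the new blocks, flushing dirty buffers as we go. */
	return xfs_btree_bload(cur, bbl, priv);
}

In the actual repair code the slack values are picked up via
xrep_newbt_estimate_slack() and max_dirty via xrep_newbt_init_ag(), both
shown in the hunks below, rather than being set by hand like this.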
@@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block(
  * Read in the buffer at the given ptr and return the buffer and
  * the block pointer within the buffer.
  */
-STATIC int
+int
 xfs_btree_read_buf_block(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_ptr	*ptr,
@@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
 int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
 		const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
 		struct xfs_buf **bpp);
+int xfs_btree_read_buf_block(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *ptr, int flags,
+		struct xfs_btree_block **block, struct xfs_buf **bpp);
 void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
 		struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
 		int lr);
@@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot(
 /*
  * Put a btree block that we're loading onto the ordered list and release it.
  * The btree blocks will be written to disk when bulk loading is finished.
+ * If we reach the dirty buffer threshold, flush them to disk before
+ * continuing.
  */
-static void
+static int
 xfs_btree_bload_drop_buf(
-	struct list_head	*buffers_list,
-	struct xfs_buf		**bpp)
+	struct xfs_btree_bload	*bbl,
+	struct list_head	*buffers_list,
+	struct xfs_buf		**bpp)
 {
-	if (*bpp == NULL)
-		return;
+	struct xfs_buf		*bp = *bpp;
+	int			error;

-	if (!xfs_buf_delwri_queue(*bpp, buffers_list))
-		ASSERT(0);
+	if (!bp)
+		return 0;

-	xfs_buf_relse(*bpp);
+	/*
+	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
+	 * xfs_buf_read will not pointlessly reread the contents from the disk.
+	 */
+	bp->b_flags |= XBF_DONE;
+
+	xfs_buf_delwri_queue_here(bp, buffers_list);
+	xfs_buf_relse(bp);
 	*bpp = NULL;
+	bbl->nr_dirty++;
+
+	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
+		return 0;
+
+	error = xfs_buf_delwri_submit(buffers_list);
+	if (error)
+		return error;
+
+	bbl->nr_dirty = 0;
+	return 0;
 }

 /*
@@ -418,7 +439,10 @@ xfs_btree_bload_prep_block(
 	 */
 	if (*blockp)
 		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
-	xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
+	if (ret)
+		return ret;

 	/* Initialize the new btree block. */
 	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
@@ -436,22 +460,19 @@ STATIC int
 xfs_btree_bload_leaf(
 	struct xfs_btree_cur		*cur,
 	unsigned int			recs_this_block,
-	xfs_btree_bload_get_record_fn	get_record,
+	xfs_btree_bload_get_records_fn	get_records,
 	struct xfs_btree_block		*block,
 	void				*priv)
 {
-	unsigned int			j;
+	unsigned int			j = 1;
 	int				ret;

 	/* Fill the leaf block with records. */
-	for (j = 1; j <= recs_this_block; j++) {
-		union xfs_btree_rec	*block_rec;
-
-		ret = get_record(cur, priv);
-		if (ret)
+	while (j <= recs_this_block) {
+		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
+		if (ret < 0)
 			return ret;
-		block_rec = xfs_btree_rec_addr(cur, j, block);
-		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+		j += ret;
 	}

 	return 0;
@@ -485,7 +506,12 @@ xfs_btree_bload_node(

 		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

-		ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+		/*
+		 * Read the lower-level block in case the buffer for it has
+		 * been reclaimed.  LRU refs will be set on the block, which is
+		 * desirable if the new btree commits.
+		 */
+		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
 				&child_bp);
 		if (ret)
 			return ret;
@@ -764,6 +790,7 @@ xfs_btree_bload(
 	cur->bc_nlevels = bbl->btree_height;
 	xfs_btree_set_ptr_null(cur, &child_ptr);
 	xfs_btree_set_ptr_null(cur, &ptr);
+	bbl->nr_dirty = 0;

 	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 			&avg_per_block, &blocks, &blocks_with_extra);
@@ -789,7 +816,7 @@ xfs_btree_bload(
 		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
 				nr_this_block);

-		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
 				block, priv);
 		if (ret)
 			goto out;
@@ -802,7 +829,10 @@ xfs_btree_bload(
 			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
 	}
 	total_blocks += blocks;
-	xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+	if (ret)
+		goto out;

 	/* Populate the internal btree nodes. */
 	for (level = 1; level < cur->bc_nlevels; level++) {
@@ -844,7 +874,11 @@ xfs_btree_bload(
 			xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
 		}
 		total_blocks += blocks;
-		xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+		if (ret)
+			goto out;
+
 		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
 	}

@@ -47,7 +47,9 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
 		int whichfork, const struct xfs_btree_ops *ops);

 /* Bulk loading of staged btrees. */
-typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
+		unsigned int idx, struct xfs_btree_block *block,
+		unsigned int nr_wanted, void *priv);
 typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
 		union xfs_btree_ptr *ptr, void *priv);
 typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
@@ -55,11 +57,14 @@ typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,

 struct xfs_btree_bload {
 	/*
-	 * This function will be called nr_records times to load records into
-	 * the btree.  The function does this by setting the cursor's bc_rec
-	 * field in in-core format.  Records must be returned in sort order.
+	 * This function will be called to load @nr_wanted records into the
+	 * btree.  The implementation does this by setting the cursor's bc_rec
+	 * field in in-core format and using init_rec_from_cur to set the
+	 * records in the btree block.  Records must be returned in sort order.
+	 * The function must return the number of records loaded or the usual
+	 * negative errno.
 	 */
-	xfs_btree_bload_get_record_fn	get_record;
+	xfs_btree_bload_get_records_fn	get_records;

 	/*
 	 * This function will be called nr_blocks times to obtain a pointer
@@ -107,6 +112,16 @@ struct xfs_btree_bload {
 	 * height of the new btree.
 	 */
 	unsigned int			btree_height;
+
+	/*
+	 * Flush the new btree block buffer list to disk after this many blocks
+	 * have been formatted.  Zero prohibits writing any buffers until all
+	 * blocks have been formatted.
+	 */
+	uint16_t			max_dirty;
+
+	/* Number of dirty buffers. */
+	uint16_t			nr_dirty;
 };

 int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
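The updated comment above spells out the new ->get_records contract.  For
illustration only, a hypothetical implementation might look like the
following sketch; my_rec_source and my_next_record are invented stand-ins
for whatever staged record source a repair caller maintains, and are not
part of this series.

/*
 * Hypothetical ->get_records implementation: pull up to @nr_wanted staged
 * records and format them directly into slots of the new leaf block.
 */
STATIC int
my_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct my_rec_source	*src = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		/* Pull the next staged record into the cursor's bc_rec. */
		error = my_next_record(src, &cur->bc_rec);
		if (error)
			return error;

		/* Format it into slot @idx of the block being built. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

Returning the number of records loaded lets the bulk loader advance its
slot index by that amount, which is exactly what the reworked
xfs_btree_bload_leaf() loop above expects.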
@@ -32,6 +32,7 @@
  * btree bulk loading code calculates for us.  However, there are some
  * exceptions to this rule:
  *
+ * (0) If someone turned one of the debug knobs.
  * (1) If this is a per-AG btree and the AG has less than 10% space free.
  * (2) If this is an inode btree and the FS has less than 10% space free.
@@ -47,9 +48,13 @@ xrep_newbt_estimate_slack(
 	uint64_t		free;
 	uint64_t		sz;

-	/* Let the btree code compute the default slack values. */
-	bload->leaf_slack = -1;
-	bload->node_slack = -1;
+	/*
+	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+	 * unless someone has set them otherwise, so we just pull the values
+	 * here.
+	 */
+	bload->leaf_slack = xfs_globals.bload_leaf_slack;
+	bload->node_slack = xfs_globals.bload_node_slack;

 	if (sc->ops->type == ST_PERAG) {
 		free = sc->sa.pag->pagf_freeblks;
@@ -89,6 +94,7 @@ xrep_newbt_init_ag(
 	xnr->alloc_hint = alloc_hint;
 	xnr->resv = resv;
 	INIT_LIST_HEAD(&xnr->resv_list);
+	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
 	xrep_newbt_estimate_slack(xnr);
 }

@@ -2049,6 +2049,14 @@ error_free:
 	return NULL;
 }

+static inline void
+xfs_buf_list_del(
+	struct xfs_buf		*bp)
+{
+	list_del_init(&bp->b_list);
+	wake_up_var(&bp->b_list);
+}
+
 /*
  * Cancel a delayed write list.
  *
@@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel(

 		xfs_buf_lock(bp);
 		bp->b_flags &= ~_XBF_DELWRI_Q;
-		list_del_init(&bp->b_list);
+		xfs_buf_list_del(bp);
 		xfs_buf_relse(bp);
 	}
 }
@@ -2119,6 +2127,34 @@ xfs_buf_delwri_queue(
 	return true;
 }

+/*
+ * Queue a buffer to this delwri list as part of a data integrity operation.
+ * If the buffer is on any other delwri list, we'll wait for that to clear
+ * so that the caller can submit the buffer for IO and wait for the result.
+ * Callers must ensure the buffer is not already on the list.
+ */
+void
+xfs_buf_delwri_queue_here(
+	struct xfs_buf		*bp,
+	struct list_head	*buffer_list)
+{
+	/*
+	 * We need this buffer to end up on the /caller's/ delwri list, not any
+	 * old list.  This can happen if the buffer is marked stale (which
+	 * clears DELWRI_Q) after the AIL queues the buffer to its list but
+	 * before the AIL has a chance to submit the list.
+	 */
+	while (!list_empty(&bp->b_list)) {
+		xfs_buf_unlock(bp);
+		wait_var_event(&bp->b_list, list_empty(&bp->b_list));
+		xfs_buf_lock(bp);
+	}
+
+	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+	xfs_buf_delwri_queue(bp, buffer_list);
+}
+
 /*
  * Compare function is more complex than it needs to be because
  * the return value is only 32 bits and we are doing comparisons
@@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers(
 		 * reference and remove it from the list here.
 		 */
 		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
-			list_del_init(&bp->b_list);
+			xfs_buf_list_del(bp);
 			xfs_buf_relse(bp);
 			continue;
 		}
@@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers(
 			list_move_tail(&bp->b_list, wait_list);
 		} else {
 			bp->b_flags |= XBF_ASYNC;
-			list_del_init(&bp->b_list);
+			xfs_buf_list_del(bp);
 		}
 		__xfs_buf_submit(bp, false);
 	}
@@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit(
 	while (!list_empty(&wait_list)) {
 		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

-		list_del_init(&bp->b_list);
+		xfs_buf_list_del(bp);

 		/*
 		 * Wait on the locked buffer, check for errors and unlock and
@@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp);
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_cancel(struct list_head *);
 extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
 extern int xfs_buf_delwri_submit(struct list_head *);
 extern int xfs_buf_delwri_submit_nowait(struct list_head *);
 extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
@@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = {
 	.pwork_threads		= -1,		/* automatic thread detection */
 	.larp			= false,	/* log attribute replay */
 #endif
+
+	/*
+	 * Leave this many record slots empty when bulk loading btrees.  By
+	 * default we load new btree leaf blocks 75% full.
+	 */
+	.bload_leaf_slack	= -1,
+
+	/*
+	 * Leave this many key/ptr slots empty when bulk loading btrees.  By
+	 * default we load new btree node blocks 75% full.
+	 */
+	.bload_node_slack	= -1,
 };
@@ -85,6 +85,8 @@ struct xfs_globals {
 	int	pwork_threads;		/* parallel workqueue threads */
 	bool	larp;			/* log attribute replay */
 #endif
+	int	bload_leaf_slack;	/* btree bulk load leaf slack */
+	int	bload_node_slack;	/* btree bulk load node slack */
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
@@ -262,6 +262,58 @@ larp_show(
 XFS_SYSFS_ATTR_RW(larp);
 #endif /* DEBUG */

+STATIC ssize_t
+bload_leaf_slack_store(
+	struct kobject		*kobject,
+	const char		*buf,
+	size_t			count)
+{
+	int			ret;
+	int			val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_leaf_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+	struct kobject		*kobject,
+	char			*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+	struct kobject		*kobject,
+	const char		*buf,
+	size_t			count)
+{
+	int			ret;
+	int			val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	xfs_globals.bload_node_slack = val;
+	return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+	struct kobject		*kobject,
+	char			*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
@@ -271,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(pwork_threads),
 	ATTR_LIST(larp),
 #endif
+	ATTR_LIST(bload_leaf_slack),
+	ATTR_LIST(bload_node_slack),
 	NULL,
 };
 ATTRIBUTE_GROUPS(xfs_dbg);
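A brief usage note: the two new attributes join the existing xfs_dbg
group, so, assuming the debug kobject is exposed in its usual place, a
developer can tune the slack factors by writing a small integer to
/sys/fs/xfs/debug/bload_leaf_slack or /sys/fs/xfs/debug/bload_node_slack.
Writing -1 falls back to the defaults computed by the bulk loader,
matching the initial values set in xfs_globals above, just as xfs_repair
lets its users override the corresponding slack factors.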