From 0a683794ace283984ae95ea6796f37b5f3afc446 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Jun 2021 13:40:02 +1000 Subject: [PATCH 001/102] xfs: split up xfs_buf_allocate_memory Based on a patch from Christoph Hellwig. This splits out the heap allocation and page allocation portions of the buffer memory allocation into two separate helper functions. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 130 +++++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 592800c8852f..2e35d344a69b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -347,65 +347,55 @@ xfs_buf_free( kmem_cache_free(xfs_buf_zone, bp); } -/* - * Allocates all the pages for buffer in question and builds it's page list. - */ -STATIC int -xfs_buf_allocate_memory( - struct xfs_buf *bp, - uint flags) +static int +xfs_buf_alloc_kmem( + struct xfs_buf *bp, + size_t size, + xfs_buf_flags_t flags) { - size_t size; - size_t nbytes, offset; - gfp_t gfp_mask = xb_to_gfp(flags); - unsigned short page_count, i; - xfs_off_t start, end; - int error; - xfs_km_flags_t kmflag_mask = 0; + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); + xfs_km_flags_t kmflag_mask = KM_NOFS; - /* - * assure zeroed buffer for non-read cases. - */ - if (!(flags & XBF_READ)) { + /* Assure zeroed buffer for non-read cases. 
*/ + if (!(flags & XBF_READ)) kmflag_mask |= KM_ZERO; + + bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask); + if (!bp->b_addr) + return -ENOMEM; + + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + return -ENOMEM; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = kmem_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= _XBF_KMEM; + return 0; +} + +static int +xfs_buf_alloc_pages( + struct xfs_buf *bp, + uint page_count, + xfs_buf_flags_t flags) +{ + gfp_t gfp_mask = xb_to_gfp(flags); + size_t size; + size_t offset; + size_t nbytes; + int i; + int error; + + /* Assure zeroed buffer for non-read cases. */ + if (!(flags & XBF_READ)) gfp_mask |= __GFP_ZERO; - } - /* - * for buffers that are contained within a single page, just allocate - * the memory from the heap - there's no need for the complexity of - * page arrays to keep allocation down to order 0. 
- */ - size = BBTOB(bp->b_length); - if (size < PAGE_SIZE) { - int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, - KM_NOFS | kmflag_mask); - if (!bp->b_addr) { - /* low memory - use alloc_page loop instead */ - goto use_alloc_page; - } - - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ - kmem_free(bp->b_addr); - bp->b_addr = NULL; - goto use_alloc_page; - } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; - bp->b_flags |= _XBF_KMEM; - return 0; - } - -use_alloc_page: - start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) - >> PAGE_SHIFT; - page_count = end - start; error = _xfs_buf_get_pages(bp, page_count); if (unlikely(error)) return error; @@ -458,6 +448,38 @@ out_free_pages: return error; } + +/* + * Allocates all the pages for buffer in question and builds it's page list. + */ +static int +xfs_buf_allocate_memory( + struct xfs_buf *bp, + uint flags) +{ + size_t size; + xfs_off_t start, end; + int error; + + /* + * For buffers that fit entirely within a single page, first attempt to + * allocate the memory from the heap to minimise memory usage. If we + * can't get heap memory for these small buffers, we fall back to using + * the page allocator. + */ + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + error = xfs_buf_alloc_kmem(bp, size, flags); + if (!error) + return 0; + } + + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; + return xfs_buf_alloc_pages(bp, end - start, flags); +} + /* * Map buffer into kernel address-space if necessary. 
*/ From 07b5c5add42a0afccf79401b12d78043ed6b8240 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Jun 2021 13:40:35 +1000 Subject: [PATCH 002/102] xfs: use xfs_buf_alloc_pages for uncached buffers Use the newly factored out page allocation code. This adds automatic buffer zeroing for non-read uncached buffers. This also allows us to greatly simplify the error handling in xfs_buf_get_uncached(). Because xfs_buf_alloc_pages() cleans up partial allocation failure, we can just call xfs_buf_free() in all error cases now to clean up after failures. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 1 - fs/xfs/xfs_buf.c | 27 ++++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index c68a36688474..be0087825ae0 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -43,7 +43,6 @@ xfs_get_aghdr_buf( if (error) return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2e35d344a69b..b1610115d401 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -973,7 +973,7 @@ xfs_buf_get_uncached( struct xfs_buf **bpp) { unsigned long page_count; - int error, i; + int error; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); @@ -982,41 +982,26 @@ xfs_buf_get_uncached( /* flags might contain irrelevant bits, pass only what we care about */ error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); if (error) - goto fail; + return error; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count); + error = xfs_buf_alloc_pages(bp, page_count, flags); if (error) goto fail_free_buf; - for (i = 0; i < page_count; i++) { - bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); - if (!bp->b_pages[i]) { - error = -ENOMEM; - goto fail_free_mem; - } - } - bp->b_flags |= 
_XBF_PAGES; - error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages", __func__); - goto fail_free_mem; + goto fail_free_buf; } trace_xfs_buf_get_uncached(bp, _RET_IP_); *bpp = bp; return 0; - fail_free_mem: - while (--i >= 0) - __free_page(bp->b_pages[i]); - _xfs_buf_free_pages(bp); - fail_free_buf: - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_zone, bp); - fail: +fail_free_buf: + xfs_buf_free(bp); return error; } From c9fa563072e13337713a441cf30171feb4e96e6d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Jun 2021 13:40:36 +1000 Subject: [PATCH 003/102] xfs: use alloc_pages_bulk_array() for buffers Because it's more efficient than allocating pages one at a time in a loop. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 62 +++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b1610115d401..2749bc0bf726 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -386,10 +386,7 @@ xfs_buf_alloc_pages( xfs_buf_flags_t flags) { gfp_t gfp_mask = xb_to_gfp(flags); - size_t size; - size_t offset; - size_t nbytes; - int i; + long filled = 0; int error; /* Assure zeroed buffer for non-read cases. */ @@ -400,50 +397,39 @@ xfs_buf_alloc_pages( if (unlikely(error)) return error; - offset = bp->b_offset; bp->b_flags |= _XBF_PAGES; - for (i = 0; i < bp->b_page_count; i++) { - struct page *page; - uint retries = 0; -retry: - page = alloc_page(gfp_mask); - if (unlikely(page == NULL)) { - if (flags & XBF_READ_AHEAD) { - bp->b_page_count = i; - error = -ENOMEM; - goto out_free_pages; - } + /* + * Bulk filling of pages can take multiple calls. Not filling the entire + * array is not an allocation failure, so don't back off if we get at + * least one extra page. + */ + for (;;) { + long last = filled; - /* - * This could deadlock. 
- * - * But until all the XFS lowlevel code is revamped to - * handle buffer allocation failures we can't do much. - */ - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", - current->comm, current->pid, - __func__, gfp_mask); - - XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; + filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, + bp->b_pages); + if (filled == bp->b_page_count) { + XFS_STATS_INC(bp->b_mount, xb_page_found); + break; } - XFS_STATS_INC(bp->b_mount, xb_page_found); + if (filled != last) + continue; - nbytes = min_t(size_t, size, PAGE_SIZE - offset); - size -= nbytes; - bp->b_pages[i] = page; - offset = 0; + if (flags & XBF_READ_AHEAD) { + error = -ENOMEM; + goto out_free_pages; + } + + XFS_STATS_INC(bp->b_mount, xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ / 50); } return 0; out_free_pages: - for (i = 0; i < bp->b_page_count; i++) - __free_page(bp->b_pages[i]); + while (--filled >= 0) + __free_page(bp->b_pages[filled]); bp->b_flags &= ~_XBF_PAGES; return error; } From 02c5117386884e06b6e78b72288f1e0af4320dc1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Jun 2021 13:40:36 +1000 Subject: [PATCH 004/102] xfs: merge _xfs_buf_get_pages() Only called from one place now, so merge it into xfs_buf_alloc_pages(). Because page array allocation is dependent on bp->b_pages being null, always ensure that when the pages array is freed we always set bp->b_pages to null. Also convert the page array to use kmalloc() rather than kmem_alloc() so we can use the gfp flags we've already calculated for the allocation context instead of hard coding KM_NOFS semantics. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 48 ++++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2749bc0bf726..a6fcd829c1ea 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -272,31 +272,6 @@ _xfs_buf_alloc( return 0; } -/* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. - */ -STATIC int -_xfs_buf_get_pages( - struct xfs_buf *bp, - int page_count) -{ - /* Make sure that we have a page list */ - if (bp->b_pages == NULL) { - bp->b_page_count = page_count; - if (page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, KM_NOFS); - if (bp->b_pages == NULL) - return -ENOMEM; - } - memset(bp->b_pages, 0, sizeof(struct page *) * page_count); - } - return 0; -} - /* * Frees b_pages if it was allocated. */ @@ -304,10 +279,9 @@ STATIC void _xfs_buf_free_pages( struct xfs_buf *bp) { - if (bp->b_pages != bp->b_page_array) { + if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); - bp->b_pages = NULL; - } + bp->b_pages = NULL; } /* @@ -389,16 +363,22 @@ xfs_buf_alloc_pages( long filled = 0; int error; + /* Make sure that we have a page list */ + bp->b_page_count = page_count; + if (bp->b_page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, + gfp_mask); + if (!bp->b_pages) + return -ENOMEM; + } + bp->b_flags |= _XBF_PAGES; + /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) gfp_mask |= __GFP_ZERO; - error = _xfs_buf_get_pages(bp, page_count); - if (unlikely(error)) - return error; - - bp->b_flags |= _XBF_PAGES; - /* * Bulk filling of pages can take multiple calls. 
Not filling the entire * array is not an allocation failure, so don't back off if we get at From e7d236a6fe5102092c463112124cf52e4d71885e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 1 Jun 2021 13:40:36 +1000 Subject: [PATCH 005/102] xfs: move page freeing into _xfs_buf_free_pages() Rather than open coding it just before we call _xfs_buf_free_pages(). Also, rename the function to xfs_buf_free_pages() as the leading underscore has no useful meaning. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 61 ++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a6fcd829c1ea..d02edb683cfd 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -272,25 +272,30 @@ _xfs_buf_alloc( return 0; } -/* - * Frees b_pages if it was allocated. - */ -STATIC void -_xfs_buf_free_pages( +static void +xfs_buf_free_pages( struct xfs_buf *bp) { + uint i; + + ASSERT(bp->b_flags & _XBF_PAGES); + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + if (bp->b_pages[i]) + __free_page(bp->b_pages[i]); + } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += bp->b_page_count; + if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); bp->b_pages = NULL; + bp->b_flags &= ~_XBF_PAGES; } -/* - * Releases the specified buffer. - * - * The modification state of any associated pages is left unchanged. 
- * The buffer must not be on any hash - use xfs_buf_rele instead for - * hashed and refcounted buffers - */ static void xfs_buf_free( struct xfs_buf *bp) @@ -299,24 +304,11 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) { - uint i; - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr - bp->b_offset, - bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - struct page *page = bp->b_pages[i]; - - __free_page(page); - } - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += - bp->b_page_count; - } else if (bp->b_flags & _XBF_KMEM) + if (bp->b_flags & _XBF_PAGES) + xfs_buf_free_pages(bp); + else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); - _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); kmem_cache_free(xfs_buf_zone, bp); } @@ -361,7 +353,6 @@ xfs_buf_alloc_pages( { gfp_t gfp_mask = xb_to_gfp(flags); long filled = 0; - int error; /* Make sure that we have a page list */ bp->b_page_count = page_count; @@ -398,20 +389,14 @@ xfs_buf_alloc_pages( continue; if (flags & XBF_READ_AHEAD) { - error = -ENOMEM; - goto out_free_pages; + xfs_buf_free_pages(bp); + return -ENOMEM; } XFS_STATS_INC(bp->b_mount, xb_page_retries); congestion_wait(BLK_RW_ASYNC, HZ / 50); } return 0; - -out_free_pages: - while (--filled >= 0) - __free_page(bp->b_pages[filled]); - bp->b_flags &= ~_XBF_PAGES; - return error; } From 4126c06e25b38842a254b2de6ffc3019a7b2f0ca Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 18 Mar 2021 17:21:18 -0700 Subject: [PATCH 006/102] xfs: Reverse apply 72b97ea40d Originally we added this patch to help modularize the attr code in preparation for delayed attributes and the state machine it requires. However, later reviews found that this slightly alters the transaction handling as the helper function is ambiguous as to whether the transaction is dirty or clean. This may cause a dirty transaction to be included in the next roll, where previously it had not. 
To preserve the existing code flow, we reverse apply this commit. Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 96146f425e50..190b46dc0d26 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1214,24 +1214,6 @@ int xfs_attr_node_removename_setup( return 0; } -STATIC int -xfs_attr_node_remove_rmt( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error = 0; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - - /* - * Refill the state structure with buffers, the prior calls released our - * buffers. - */ - return xfs_attr_refillstate(state); -} - /* * Remove a name from a B-tree attribute list. * @@ -1260,7 +1242,15 @@ xfs_attr_node_removename( * overflow the maximum size of a transaction and/or hit a deadlock. */ if (args->rmtblkno > 0) { - error = xfs_attr_node_remove_rmt(args, state); + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); if (error) goto out; } From a8490f699f6ec88843879b92cbb21953dab379ee Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 18 Mar 2021 17:25:59 -0700 Subject: [PATCH 007/102] xfs: Add xfs_attr_node_remove_name This patch pulls a new helper function xfs_attr_node_remove_name out of xfs_attr_node_remove_step. This helps to modularize xfs_attr_node_remove_step which will help make the delayed attribute code easier to follow Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_attr.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 190b46dc0d26..8a08d5b8c433 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1214,6 +1214,25 @@ int xfs_attr_node_removename_setup( return 0; } +STATIC int +xfs_attr_node_remove_name( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *blk; + int retval; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + return retval; +} + /* * Remove a name from a B-tree attribute list. * @@ -1226,7 +1245,6 @@ xfs_attr_node_removename( struct xfs_da_args *args) { struct xfs_da_state *state; - struct xfs_da_state_blk *blk; int retval, error; struct xfs_inode *dp = args->dp; @@ -1254,14 +1272,7 @@ xfs_attr_node_removename( if (error) goto out; } - - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + retval = xfs_attr_node_remove_name(args, state); /* * Check to see if the tree needs to be collapsed. From 6286514b63e12d7bedc67e46aa1aeff9ed8378ce Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 18 Feb 2021 01:09:18 -0700 Subject: [PATCH 008/102] xfs: Refactor xfs_attr_set_shortform This patch is actually the combination of patches from the previous version (v18). Initially patch 3 hoisted xfs_attr_set_shortform, and the next added the helper xfs_attr_set_fmt. xfs_attr_set_fmt is similar to the old xfs_attr_set_shortform. It returns 0 when the attr has been set and no further action is needed. 
It returns -EAGAIN when shortform has been transformed to leaf, and the calling function should proceed to set the attr in leaf form. Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 42 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 8a08d5b8c433..0ec1547a00f8 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -236,16 +236,11 @@ xfs_attr_is_shortform( ip->i_afp->if_nextents == 0); } -/* - * Attempts to set an attr in shortform, or converts short form to leaf form if - * there is not enough room. If the attr is set, the transaction is committed - * and set to NULL. - */ STATIC int -xfs_attr_set_shortform( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) +xfs_attr_set_fmt( + struct xfs_da_args *args) { + struct xfs_buf *leaf_bp = NULL; struct xfs_inode *dp = args->dp; int error, error2 = 0; @@ -258,29 +253,29 @@ xfs_attr_set_shortform( args->trans = NULL; return error ? error : error2; } + /* * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); + error = xfs_attr_shortform_to_leaf(args, &leaf_bp); if (error) return error; /* * Prevent the leaf buffer from being unlocked so that a concurrent AIL * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. Once we're done rolling the transaction we - * can release the hold and add the attr to the leaf. + * with the write verifier. 
*/ - xfs_trans_bhold(args->trans, *leaf_bp); + xfs_trans_bhold(args->trans, leaf_bp); error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, *leaf_bp); + xfs_trans_bhold_release(args->trans, leaf_bp); if (error) { - xfs_trans_brelse(args->trans, *leaf_bp); + xfs_trans_brelse(args->trans, leaf_bp); return error; } - return 0; + return -EAGAIN; } /* @@ -291,8 +286,7 @@ xfs_attr_set_args( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - struct xfs_buf *leaf_bp = NULL; - int error = 0; + int error; /* * If the attribute list is already in leaf format, jump straight to @@ -301,15 +295,8 @@ xfs_attr_set_args( * again. */ if (xfs_attr_is_shortform(dp)) { - - /* - * If the attr was successfully set in shortform, the - * transaction is committed and set to NULL. Otherwise, is it - * converted from shortform to leaf, and the transaction is - * retained. - */ - error = xfs_attr_set_shortform(args, &leaf_bp); - if (error || !args->trans) + error = xfs_attr_set_fmt(args); + if (error != -EAGAIN) return error; } @@ -344,8 +331,7 @@ xfs_attr_set_args( return error; } - error = xfs_attr_node_addname(args); - return error; + return xfs_attr_node_addname(args); } /* From f0f7c502c728d0c6947219739631bad101f8737b Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 18 Feb 2021 01:24:24 -0700 Subject: [PATCH 009/102] xfs: Separate xfs_attr_node_addname and xfs_attr_node_addname_clear_incomplete This patch separates xfs_attr_node_addname into two functions. This will help to make it easier to hoist parts of xfs_attr_node_addname that need state management Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_attr.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0ec1547a00f8..ad44d779650d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -54,6 +54,7 @@ STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_da_args *args); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); @@ -1073,6 +1074,28 @@ restart: return error; } + error = xfs_attr_node_addname_clear_incomplete(args); + if (error) + goto out; + retval = 0; +out: + if (state) + xfs_da_state_free(state); + if (error) + return error; + return retval; +} + + +STATIC int +xfs_attr_node_addname_clear_incomplete( + struct xfs_da_args *args) +{ + struct xfs_da_state *state = NULL; + struct xfs_da_state_blk *blk; + int retval = 0; + int error = 0; + /* * Re-find the "old" attribute entry after any split ops. The INCOMPLETE * flag means that we will find the "old" attr, not the "new" one. From 6ca5a4a1f52952790a40099b79b5631d91163ba4 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 12 Apr 2021 14:15:31 -0700 Subject: [PATCH 010/102] xfs: Add helper xfs_attr_node_addname_find_attr This patch separates the first half of xfs_attr_node_addname into a helper function xfs_attr_node_addname_find_attr. It also replaces the restart goto with an EAGAIN return code driven by a loop in the calling function. This looks odd now, but will clean up nicely once we introduce the state machine. It will also enable hoisting the last state out of xfs_attr_node_addname without having to plumb in a "done" parameter to know if we need to move to the next state or not. 
Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 87 +++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ad44d779650d..5f56b059dfc7 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -52,7 +52,10 @@ STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); * Internal routines when attribute list is more than one block. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); -STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname(struct xfs_da_args *args, + struct xfs_da_state *state); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_da_args *args, + struct xfs_da_state **state); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_da_args *args); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, @@ -287,6 +290,7 @@ xfs_attr_set_args( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; + struct xfs_da_state *state; int error; /* @@ -332,7 +336,14 @@ xfs_attr_set_args( return error; } - return xfs_attr_node_addname(args); + do { + error = xfs_attr_node_addname_find_attr(args, &state); + if (error) + return error; + error = xfs_attr_node_addname(args, state); + } while (error == -EAGAIN); + + return error; } /* @@ -896,48 +907,26 @@ xfs_attr_node_hasname( * External routines when attribute list size > geo->blksize *========================================================================*/ -/* - * Add a name to a Btree-format attribute list. - * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). 
- * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - */ STATIC int -xfs_attr_node_addname( - struct xfs_da_args *args) +xfs_attr_node_addname_find_attr( + struct xfs_da_args *args, + struct xfs_da_state **state) { - struct xfs_da_state *state; - struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - int retval, error; + int retval; - trace_xfs_attr_node_addname(args); - - /* - * Fill in bucket of arguments/results/context to carry around. - */ - dp = args->dp; -restart: /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = 0; - retval = xfs_attr_node_hasname(args, &state); + retval = xfs_attr_node_hasname(args, state); if (retval != -ENOATTR && retval != -EEXIST) - goto out; + goto error; - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out; + goto error; if (retval == -EEXIST) { if (args->attr_flags & XATTR_CREATE) - goto out; + goto error; trace_xfs_attr_node_replace(args); @@ -955,6 +944,38 @@ restart: args->rmtvaluelen = 0; } + return 0; +error: + if (*state) + xfs_da_state_free(*state); + return retval; +} + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. 
+ */ +STATIC int +xfs_attr_node_addname( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *blk; + struct xfs_inode *dp; + int retval, error; + + trace_xfs_attr_node_addname(args); + + dp = args->dp; + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == -ENOSPC) { if (state->path.active == 1) { @@ -980,7 +1001,7 @@ restart: if (error) goto out; - goto restart; + return -EAGAIN; } /* From 5d954cc09f6baed80458ea02ec092031608ea3fe Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 26 Apr 2021 16:50:26 -0700 Subject: [PATCH 011/102] xfs: Hoist xfs_attr_node_addname This patch hoists the later half of xfs_attr_node_addname into the calling function. We do this because it is this area that will need the most state management, and we want to keep such code in the same scope as much as possible Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 159 ++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 84 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 5f56b059dfc7..b35c7424152c 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -52,6 +52,7 @@ STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); * Internal routines when attribute list is more than one block. 
*/ STATIC int xfs_attr_node_get(xfs_da_args_t *args); +STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); STATIC int xfs_attr_node_addname(struct xfs_da_args *args, struct xfs_da_state *state); STATIC int xfs_attr_node_addname_find_attr(struct xfs_da_args *args, @@ -290,8 +291,8 @@ xfs_attr_set_args( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - struct xfs_da_state *state; - int error; + struct xfs_da_state *state = NULL; + int error = 0; /* * If the attribute list is already in leaf format, jump straight to @@ -342,7 +343,75 @@ xfs_attr_set_args( return error; error = xfs_attr_node_addname(args, state); } while (error == -EAGAIN); + if (error) + return error; + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + goto out; + } + + /* + * If this is an atomic rename operation, we must "flip" the incomplete + * flags on the "new" and "old" attribute/value pairs so that one + * disappears and one appears atomically. Then we must remove the "old" + * attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the "old" attr + * and clear the incomplete flag on the "new" attr. 
+ */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in series + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing a "remote" value + * (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + error = xfs_attr_node_addname_clear_incomplete(args); +out: return error; } @@ -968,7 +1037,7 @@ xfs_attr_node_addname( { struct xfs_da_state_blk *blk; struct xfs_inode *dp; - int retval, error; + int error; trace_xfs_attr_node_addname(args); @@ -976,8 +1045,8 @@ xfs_attr_node_addname( blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_add(blk->bp, state->args); - if (retval == -ENOSPC) { + error = xfs_attr3_leaf_add(blk->bp, state->args); + if (error == -ENOSPC) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had @@ -1023,88 +1092,10 @@ xfs_attr_node_addname( xfs_da3_fixhashpath(state, &state->path); } - /* - * Kill the state structure, we're done with it and need to - * allow the buffers to come back later. - */ - xfs_da_state_free(state); - state = NULL; - - /* - * Commit the leaf addition or btree split and start the next - * trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. 
- */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } - - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - retval = error; - goto out; - } - - /* - * If this is an atomic rename operation, we must "flip" the incomplete - * flags on the "new" and "old" attribute/value pairs so that one - * disappears and one appears atomically. Then we must remove the "old" - * attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the "old" attr - * and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - /* - * Commit the flag value change and start the next trans in series - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - goto out; - - /* - * Dismantle the "old" attribute/value pair by removing a "remote" value - * (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - - if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - error = xfs_attr_node_addname_clear_incomplete(args); - if (error) - goto out; - retval = 0; out: if (state) xfs_da_state_free(state); - if (error) - return error; - return retval; + return error; } From 83c6e70789ff371c4eebc54f2c8d979305a1bae8 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 19 Apr 2021 12:55:26 -0700 Subject: [PATCH 012/102] xfs: Hoist xfs_attr_leaf_addname This patch hoists xfs_attr_leaf_addname into the calling function. The goal being to get all the code that will require state management into the same scope. This isn't particularly aesthetic right away, but it is a preliminary step to merging in the state machine code. Signed-off-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_attr.c | 209 ++++++++++++++++++--------------------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 96 insertions(+), 114 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index b35c7424152c..4bbf34cc44e3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -44,9 +44,9 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); * Internal routines when attribute list is one block. */ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); /* * Internal routines when attribute list is more than one block. @@ -291,8 +291,9 @@ xfs_attr_set_args( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; struct xfs_da_state *state = NULL; - int error = 0; + int forkoff, error = 0; /* * If the attribute list is already in leaf format, jump straight to @@ -307,10 +308,101 @@ xfs_attr_set_args( } if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_addname(args); - if (error != -ENOSPC) + error = xfs_attr_leaf_try_add(args, bp); + if (error == -ENOSPC) + goto node; + else if (error) return error; + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. 
+ */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + /* + * Added a "remote" value, just clear the incomplete + *flag. + */ + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + /* + * Commit the flag value change and start the next trans in + * series. + */ + error = xfs_trans_roll_inode(&args->trans, args->dp); + if (error) + return error; + + /* + * Dismantle the "old" attribute/value pair by removing a + * "remote" value (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + if (args->rmtblkno) { + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + /* + * Read in the block containing the "old" attr, then remove the + * "old" attr from that block (neat, huh!) + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; + + xfs_attr3_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + + return error; +node: /* * Promote the attribute list to the Btree format. 
*/ @@ -737,115 +829,6 @@ out_brelse: return retval; } - -/* - * Add a name to the leaf attribute list structure - * - * This leaf block cannot have a "remote" value, we only call this routine - * if bmap_one_block() says there is only one block (ie: no remote blks). - */ -STATIC int -xfs_attr_leaf_addname( - struct xfs_da_args *args) -{ - int error, forkoff; - struct xfs_buf *bp = NULL; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_leaf_addname(args); - - error = xfs_attr_leaf_try_add(args, bp); - if (error) - return error; - - /* - * Commit the transaction that added the attr name so that - * later routines can manage their own transactions. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } - - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - - return error; - } - - /* - * If this is an atomic rename operation, we must "flip" the incomplete - * flags on the "new" and "old" attribute/value pairs so that one - * disappears and one appears atomically. Then we must remove the "old" - * attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the "old" attr - * and clear the incomplete flag on the "new" attr. - */ - - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in series. 
- */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; - - /* - * Dismantle the "old" attribute/value pair by removing a "remote" value - * (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - - if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Read in the block containing the "old" attr, then remove the "old" - * attr from that block (neat, huh!) - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; - - xfs_attr3_leaf_remove(bp, args); - - /* - * If the result is small enough, shrink it all into the inode. - */ - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - - return error; -} - /* * Return EEXIST if attr is found, or ENOATTR if not */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 808ae337b222..3c1c830befba 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1914,7 +1914,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); -DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); DEFINE_ATTR_EVENT(xfs_attr_leaf_get); From 3f562d092bb1edd39bfc0e6808d7108d47f8aa3a Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 12 Feb 2021 12:27:14 -0700 Subject: [PATCH 013/102] xfs: Hoist node transaction handling This patch basically hoists the node transaction handling around the leaf code we just hoisted. This will help set up this area for the state machine since the goto is easily replaced with a state since it ends with a transaction roll. Signed-off-by: Allison Henderson Reviewed-by: Brian Foster Reviewed-by: Darrick J.
Wong Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 57 +++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4bbf34cc44e3..812dd1a5b5ad 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -309,10 +309,36 @@ xfs_attr_set_args( if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_try_add(args, bp); - if (error == -ENOSPC) + if (error == -ENOSPC) { + /* + * Promote the attribute list to the Btree format. + */ + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + /* + * Finish any deferred work items and roll the transaction once + * more. The goal here is to call node_addname with the inode + * and transaction in the same state (inode locked and joined, + * transaction clean) no matter how we got to this step. + */ + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + /* + * Commit the current trans (including the inode) and + * start a new one. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + goto node; - else if (error) + } else if (error) { return error; + } /* * Commit the transaction that added the attr name so that @@ -402,32 +428,9 @@ xfs_attr_set_args( /* bp is gone due to xfs_da_shrink_inode */ return error; -node: - /* - * Promote the attribute list to the Btree format. - */ - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the transaction once - * more. The goal here is to call node_addname with the inode - * and transaction in the same state (inode locked and joined, - * transaction clean) no matter how we got to this step. - */ - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - /* - * Commit the current trans (including the inode) and - * start a new one. 
- */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; } +node: + do { error = xfs_attr_node_addname_find_attr(args, &state); From 2b74b03c13c444cb5af56804cc975534e2058d06 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 26 Apr 2021 15:00:33 -0700 Subject: [PATCH 014/102] xfs: Add delay ready attr remove routines This patch modifies the attr remove routines to be delay ready. This means they no longer roll or commit transactions, but instead return -EAGAIN to have the calling routine roll and refresh the transaction. In this series, xfs_attr_remove_args is merged with xfs_attr_node_removename become a new function, xfs_attr_remove_iter. This new version uses a sort of state machine like switch to keep track of where it was when EAGAIN was returned. A new version of xfs_attr_remove_args consists of a simple loop to refresh the transaction until the operation is completed. A new XFS_DAC_DEFER_FINISH flag is used to finish the transaction where ever the existing code used to. Calls to xfs_attr_rmtval_remove are replaced with the delay ready version __xfs_attr_rmtval_remove. We will rename __xfs_attr_rmtval_remove back to xfs_attr_rmtval_remove when we are done. xfs_attr_rmtval_remove itself is still in use by the set routines (used during a rename). For reasons of preserving existing function, we modify xfs_attr_rmtval_remove to call xfs_defer_finish when the flag is set. Similar to how xfs_attr_remove_args does here. Once we transition the set routines to be delay ready, xfs_attr_rmtval_remove is no longer used and will be removed. This patch also adds a new struct xfs_delattr_context, which we will use to keep track of the current state of an attribute operation. The new xfs_delattr_state enum is used to track various operations that are in progress so that we know not to repeat them, and resume where we left off before EAGAIN was returned to cycle out the transaction. 
Other members take the place of local variables that need to retain their values across multiple function calls. See xfs_attr.h for a more detailed diagram of the states. Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 237 +++++++++++++++++++++++--------- fs/xfs/libxfs/xfs_attr.h | 131 ++++++++++++++++++ fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_attr_remote.c | 53 ++++--- fs/xfs/libxfs/xfs_attr_remote.h | 2 +- fs/xfs/xfs_attr_inactive.c | 2 +- 6 files changed, 334 insertions(+), 93 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 812dd1a5b5ad..513d9caab21e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -57,7 +57,6 @@ STATIC int xfs_attr_node_addname(struct xfs_da_args *args, struct xfs_da_state *state); STATIC int xfs_attr_node_addname_find_attr(struct xfs_da_args *args, struct xfs_da_state **state); -STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_da_args *args); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); @@ -241,6 +240,31 @@ xfs_attr_is_shortform( ip->i_afp->if_nextents == 0); } +/* + * Checks to see if a delayed attribute transaction should be rolled. If so, + * transaction is finished or rolled as needed. + */ +int +xfs_attr_trans_roll( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + int error; + + if (dac->flags & XFS_DAC_DEFER_FINISH) { + /* + * The caller wants us to finish all the deferred ops so that we + * avoid pinning the log tail with a large number of deferred + * ops. 
+ */ + dac->flags &= ~XFS_DAC_DEFER_FINISH; + error = xfs_defer_finish(&args->trans); + } else + error = xfs_trans_roll_inode(&args->trans, args->dp); + + return error; +} + STATIC int xfs_attr_set_fmt( struct xfs_da_args *args) @@ -544,16 +568,25 @@ xfs_has_attr( */ int xfs_attr_remove_args( - struct xfs_da_args *args) + struct xfs_da_args *args) { - if (!xfs_inode_hasattr(args->dp)) - return -ENOATTR; + int error; + struct xfs_delattr_context dac = { + .da_args = args, + }; - if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_shortform_remove(args); - if (xfs_attr_is_leaf(args->dp)) - return xfs_attr_leaf_removename(args); - return xfs_attr_node_removename(args); + do { + error = xfs_attr_remove_iter(&dac); + if (error != -EAGAIN) + break; + + error = xfs_attr_trans_roll(&dac); + if (error) + return error; + + } while (true); + + return error; } /* @@ -1197,14 +1230,16 @@ xfs_attr_leaf_mark_incomplete( */ STATIC int xfs_attr_node_removename_setup( - struct xfs_da_args *args, - struct xfs_da_state **state) + struct xfs_delattr_context *dac) { - int error; + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state **state = &dac->da_state; + int error; error = xfs_attr_node_hasname(args, state); if (error != -EEXIST) return error; + error = 0; ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == @@ -1213,12 +1248,15 @@ int xfs_attr_node_removename_setup( if (args->rmtblkno > 0) { error = xfs_attr_leaf_mark_incomplete(args, *state); if (error) - return error; + goto out; - return xfs_attr_rmtval_invalidate(args); + error = xfs_attr_rmtval_invalidate(args); } +out: + if (error) + xfs_da_state_free(*state); - return 0; + return error; } STATIC int @@ -1241,70 +1279,133 @@ xfs_attr_node_remove_name( } /* - * Remove a name from a B-tree attribute list. + * Remove the attribute specified in @args. 
* * This will involve walking down the Btree, and may involve joining * leaf nodes and even joining intermediate nodes up to and including * the root node (a special case of an intermediate node). + * + * This routine is meant to function as either an in-line or delayed operation, + * and may return -EAGAIN when the transaction needs to be rolled. Calling + * functions will need to handle this, and call the function until a + * successful error code is returned. */ -STATIC int -xfs_attr_node_removename( - struct xfs_da_args *args) +int +xfs_attr_remove_iter( + struct xfs_delattr_context *dac) { - struct xfs_da_state *state; - int retval, error; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state *state = dac->da_state; + int retval, error; + struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); - error = xfs_attr_node_removename_setup(args, &state); - if (error) + switch (dac->dela_state) { + case XFS_DAS_UNINIT: + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; + + /* + * Shortform or leaf formats don't require transaction rolls and + * thus state transitions. Call the right helper and return. + */ + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + return xfs_attr_shortform_remove(args); + + if (xfs_attr_is_leaf(dp)) + return xfs_attr_leaf_removename(args); + + /* + * Node format may require transaction rolls. Set up the + * state context and fall into the state machine. + */ + if (!dac->da_state) { + error = xfs_attr_node_removename_setup(dac); + if (error) + return error; + state = dac->da_state; + } + + /* fallthrough */ + case XFS_DAS_RMTBLK: + dac->dela_state = XFS_DAS_RMTBLK; + + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a + * deadlock. + */ + if (args->rmtblkno > 0) { + /* + * May return -EAGAIN. 
Roll and repeat until all remote + * blocks are removed. + */ + error = __xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + return error; + else if (error) + goto out; + + /* + * Refill the state structure with buffers (the prior + * calls released our buffers) and close out this + * transaction before proceeding. + */ + ASSERT(args->rmtblkno == 0); + error = xfs_attr_refillstate(state); + if (error) + goto out; + dac->dela_state = XFS_DAS_RM_NAME; + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_RM_NAME: + /* + * If we came here fresh from a transaction roll, reattach all + * the buffers to the current transaction. + */ + if (dac->dela_state == XFS_DAS_RM_NAME) { + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + retval = xfs_attr_node_remove_name(args, state); + + /* + * Check to see if the tree needs to be collapsed. If so, roll + * the transaction and fall into the shrink state. + */ + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); + if (error) + goto out; + + dac->flags |= XFS_DAC_DEFER_FINISH; + dac->dela_state = XFS_DAS_RM_SHRINK; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_RM_SHRINK: + /* + * If the result is small enough, push it all into the inode. + * This is our final state so it's safe to return a dirty + * transaction. + */ + if (xfs_attr_is_leaf(dp)) + error = xfs_attr_node_shrink(args, state); + ASSERT(error != -EAGAIN); + break; + default: + ASSERT(0); + error = -EINVAL; goto out; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_remove(args); - if (error) - goto out; - - /* - * Refill the state structure with buffers, the prior calls - * released our buffers.
- */ - error = xfs_attr_refillstate(state); - if (error) - goto out; } - retval = xfs_attr_node_remove_name(args, state); - - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; - /* - * Commit the Btree join operation and start a new trans. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - } - - /* - * If the result is small enough, push it all into the inode. - */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); - out: if (state) xfs_da_state_free(state); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 2b1f61987a9d..1267ea86ce7a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -74,6 +74,133 @@ struct xfs_attr_list_context { }; +/* + * ======================================================================== + * Structure used to pass context around among the delayed routines. + * ======================================================================== + */ + +/* + * Below is a state machine diagram for attr remove operations. The XFS_DAS_* + * states indicate places where the function would return -EAGAIN, and then + * immediately resume from after being called by the calling function. States + * marked as a "subroutine state" indicate that they belong to a subroutine, and + * so the calling function needs to pass them back to that subroutine to allow + * it to finish where it left off. But they otherwise do not have a role in the + * calling function other than just passing through. + * + * xfs_attr_remove_iter() + * │ + * v + * have attr to remove? ──n──> done + * │ + * y + * │ + * v + * are we short form? ──y──> xfs_attr_shortform_remove ──> done + * │ + * n + * │ + * V + * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done + * │ + * n + * │ + * V + * ┌── need to setup state? 
+ * │ │ + * n y + * │ │ + * │ v + * │ find attr and get state + * │ attr has remote blks? ──n─┐ + * │ │ v + * │ │ find and invalidate + * │ y the remote blocks. + * │ │ mark attr incomplete + * │ ├────────────────┘ + * └──────────┤ + * │ + * v + * Have remote blks to remove? ───y─────┐ + * │ ^ remove the blks + * │ │ │ + * │ │ v + * │ XFS_DAS_RMTBLK <─n── done? + * │ re-enter with │ + * │ one less blk to y + * │ remove │ + * │ V + * │ refill the state + * n │ + * │ v + * │ XFS_DAS_RM_NAME + * │ │ + * ├─────────────────────────┘ + * │ + * v + * remove leaf and + * update hash with + * xfs_attr_node_remove_cleanup + * │ + * v + * need to + * shrink tree? ─n─┐ + * │ │ + * y │ + * │ │ + * v │ + * join leaf │ + * │ │ + * v │ + * XFS_DAS_RM_SHRINK │ + * │ │ + * v │ + * do the shrink │ + * │ │ + * v │ + * free state <──┘ + * │ + * v + * done + * + */ + +/* + * Enum values for xfs_delattr_context.dela_state + * + * These values are used by delayed attribute operations to keep track of where + * they were before they returned -EAGAIN. A return code of -EAGAIN signals the + * calling function to roll the transaction, and then call the subroutine to + * finish the operation. The enum is then used by the subroutine to jump back + * to where it was and resume executing where it left off.
+ */ +enum xfs_delattr_state { + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + XFS_DAS_RMTBLK, /* Removing remote blks */ + XFS_DAS_RM_NAME, /* Remove attr name */ + XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ +}; + +/* + * Defines for xfs_delattr_context.flags + */ +#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ + +/* + * Context used for keeping track of delayed attribute operations + */ +struct xfs_delattr_context { + struct xfs_da_args *da_args; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *da_state; + + /* Used to keep track of current state of delayed operation */ + unsigned int flags; + enum xfs_delattr_state dela_state; +}; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -92,6 +219,10 @@ int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); +int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_trans_roll(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); +void xfs_delattr_context_init(struct xfs_delattr_context *dac, + struct xfs_da_args *args); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 556184b63061..d97de2083feb 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -19,8 +19,8 @@ #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" #include "xfs_attr.h" +#include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 48d8e9caf86f..c26193bc3278 100644 --- 
a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -674,10 +674,12 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_da_args *args) + struct xfs_da_args *args) { - int error; - int retval; + int error; + struct xfs_delattr_context dac = { + .da_args = args, + }; trace_xfs_attr_rmtval_remove(args); @@ -685,31 +687,30 @@ xfs_attr_rmtval_remove( * Keep de-allocating extents until the remote-value region is gone. */ do { - retval = __xfs_attr_rmtval_remove(args); - if (retval && retval != -EAGAIN) - return retval; + error = __xfs_attr_rmtval_remove(&dac); + if (error && error != -EAGAIN) + break; - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); + error = xfs_attr_trans_roll(&dac); if (error) return error; - } while (retval == -EAGAIN); + } while (true); - return 0; + return error; } /* * Remove the value associated with an attribute by deleting the out-of-line - * buffer that it is stored on. Returns EAGAIN for the caller to refresh the - * transaction and re-call the function + * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the + * transaction and re-call the function. Callers should keep calling this + * routine until it returns something other than -EAGAIN. */ int __xfs_attr_rmtval_remove( - struct xfs_da_args *args) + struct xfs_delattr_context *dac) { - int error, done; + struct xfs_da_args *args = dac->da_args; + int error, done; /* * Unmap value blocks for this attr. @@ -719,12 +720,20 @@ __xfs_attr_rmtval_remove( if (error) return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - if (!done) + /* + * We don't need an explicit state here to pick up where we left off. We + * can figure it out using the !done return code. 
The actual value of + * dac->dela_state may be some value reminiscent of the calling + * function, but its value is irrelevant within the context of this + * function. Once we are done here, the next state is set as needed by + * the parent. + */ + if (!done) { + dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; + } - return error; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9eee615da156..002fd300364d 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -14,5 +14,5 @@ int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int __xfs_attr_rmtval_remove(struct xfs_da_args *args); +int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bfad669e6b2f..aaa7e66c42d7 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -15,10 +15,10 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_trans.h" #include "xfs_bmap.h" -#include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_dir2.h" From 8f502a4009822a6972772ae65b34078645b3ba16 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 21 May 2021 15:48:13 -0700 Subject: [PATCH 015/102] xfs: Add delay ready attr set routines This patch modifies the attr set routines to be delay ready. This means they no longer roll or commit transactions, but instead return -EAGAIN to have the calling routine roll and refresh the transaction.
In this series, xfs_attr_set_args has become xfs_attr_set_iter, which uses a state machine like switch to keep track of where it was when EAGAIN was returned. See xfs_attr.h for a more detailed diagram of the states. Two new helper functions have been added: xfs_attr_rmtval_find_space and xfs_attr_rmtval_set_blk. They provide a subset of logic similar to xfs_attr_rmtval_set, but they store the current block in the delay attr context to allow the caller to roll the transaction between allocations. This helps to simplify and consolidate code used by xfs_attr_leaf_addname and xfs_attr_node_addname. xfs_attr_set_args has now become a simple loop to refresh the transaction until the operation is completed. Lastly, xfs_attr_rmtval_remove is no longer used, and is removed. Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_attr.c | 474 +++++++++++++++++++------------- fs/xfs/libxfs/xfs_attr.h | 274 +++++++++++++++++- fs/xfs/libxfs/xfs_attr_remote.c | 100 ++++--- fs/xfs/libxfs/xfs_attr_remote.h | 5 +- fs/xfs/xfs_trace.h | 1 - 5 files changed, 622 insertions(+), 232 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 513d9caab21e..df20537c5533 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -53,15 +53,16 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_da_args *args, - struct xfs_da_state *state); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_da_args *args, - struct xfs_da_state **state); -STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_da_args *args); +STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); +STATIC int 
xfs_attr_node_addname_clear_incomplete( + struct xfs_delattr_context *dac); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); +STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp); int xfs_inode_hasattr( @@ -244,7 +245,7 @@ xfs_attr_is_shortform( * Checks to see if a delayed attribute transaction should be rolled. If so, * transaction is finished or rolled as needed. */ -int +STATIC int xfs_attr_trans_roll( struct xfs_delattr_context *dac) { @@ -265,29 +266,58 @@ xfs_attr_trans_roll( return error; } +/* + * Set the attribute specified in @args. + */ +int +xfs_attr_set_args( + struct xfs_da_args *args) +{ + struct xfs_buf *leaf_bp = NULL; + int error = 0; + struct xfs_delattr_context dac = { + .da_args = args, + }; + + do { + error = xfs_attr_set_iter(&dac, &leaf_bp); + if (error != -EAGAIN) + break; + + error = xfs_attr_trans_roll(&dac); + if (error) { + if (leaf_bp) + xfs_trans_brelse(args->trans, leaf_bp); + return error; + } + } while (true); + + return error; +} + STATIC int xfs_attr_set_fmt( - struct xfs_da_args *args) + struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp) { - struct xfs_buf *leaf_bp = NULL; - struct xfs_inode *dp = args->dp; - int error, error2 = 0; + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + int error = 0; /* * Try to add the attr to the attribute list in the inode. */ error = xfs_attr_try_sf_addname(dp, args); - if (error != -ENOSPC) { - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; - } + + /* Should only be 0, -EEXIST or -ENOSPC */ + if (error != -ENOSPC) + return error; /* * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. 
*/ - error = xfs_attr_shortform_to_leaf(args, &leaf_bp); + error = xfs_attr_shortform_to_leaf(args, leaf_bp); if (error) return error; @@ -296,102 +326,130 @@ xfs_attr_set_fmt( * push cannot grab the half-baked leaf buffer and run into problems * with the write verifier. */ - xfs_trans_bhold(args->trans, leaf_bp); - error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, leaf_bp); - if (error) { - xfs_trans_brelse(args->trans, leaf_bp); - return error; - } + xfs_trans_bhold(args->trans, *leaf_bp); + /* + * We're still in XFS_DAS_UNINIT state here. We've converted + * the attr fork to leaf format and will restart with the leaf + * add. + */ + dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; } /* * Set the attribute specified in @args. + * This routine is meant to function as a delayed operation, and may return + * -EAGAIN when the transaction needs to be rolled. Calling functions will need + * to handle this, and recall the function until a successful error code is + * returned. */ int -xfs_attr_set_args( - struct xfs_da_args *args) +xfs_attr_set_iter( + struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp) { - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - struct xfs_da_state *state = NULL; - int forkoff, error = 0; + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff, error = 0; - /* - * If the attribute list is already in leaf format, jump straight to - * leaf handling. Otherwise, try to add the attribute to the shortform - * list; if there's no room then convert the list to leaf format and try - * again. - */ - if (xfs_attr_is_shortform(dp)) { - error = xfs_attr_set_fmt(args); - if (error != -EAGAIN) - return error; - } - - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, bp); - if (error == -ENOSPC) { - /* - * Promote the attribute list to the Btree format. 
- */ - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the transaction once - * more. The goal here is to call node_addname with the inode - * and transaction in the same state (inode locked and joined, - * transaction clean) no matter how we got to this step. - */ - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - /* - * Commit the current trans (including the inode) and - * start a new one. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - - goto node; - } else if (error) { - return error; + /* State machine switch */ + switch (dac->dela_state) { + case XFS_DAS_UNINIT: + /* + * If the fork is shortform, attempt to add the attr. If there + * is no space, this converts to leaf format and returns + * -EAGAIN with the leaf buffer held across the roll. The caller + * will deal with a transaction roll error, but otherwise + * release the hold once we return with a clean transaction. + */ + if (xfs_attr_is_shortform(dp)) + return xfs_attr_set_fmt(dac, leaf_bp); + if (*leaf_bp != NULL) { + xfs_trans_bhold_release(args->trans, *leaf_bp); + *leaf_bp = NULL; } - /* - * Commit the transaction that added the attr name so that - * later routines can manage their own transactions. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; + if (xfs_attr_is_leaf(dp)) { + error = xfs_attr_leaf_try_add(args, *leaf_bp); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + /* + * Finish any deferred work items and roll the + * transaction once more. The goal here is to + * call node_addname with the inode and + * transaction in the same state (inode locked + * and joined, transaction clean) no matter how + * we got to this step. 
+ * + * At this point, we are still in + * XFS_DAS_UNINIT, but when we come back, we'll + * be a node, so we'll fall down into the node + * handling code below + */ + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; + } else if (error) { + return error; + } + + dac->dela_state = XFS_DAS_FOUND_LBLK; + } else { + error = xfs_attr_node_addname_find_attr(dac); + if (error) + return error; + + error = xfs_attr_node_addname(dac); + if (error) + return error; + + dac->dela_state = XFS_DAS_FOUND_NBLK; + } + return -EAGAIN; + case XFS_DAS_FOUND_LBLK: /* * If there was an out-of-line value, allocate the blocks we * identified for its storage and copy the value. This is done * after we create the attribute so that we don't overflow the * maximum size of a transaction and/or hit a deadlock. */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; + + /* Open coded xfs_attr_rmtval_set without trans handling */ + if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { + dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_find_space(dac); + if (error) + return error; + } } + /* + * Repeat allocating remote blocks for the attr value until + * blkcnt drops to zero. + */ + if (dac->blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(dac); + if (error) + return error; + return -EAGAIN; + } + + error = xfs_attr_rmtval_set_value(args); + if (error) + return error; + + /* + * If this is not a rename, clear the incomplete flag and we're + * done. + */ if (!(args->op_flags & XFS_DA_OP_RENAME)) { - /* - * Added a "remote" value, just clear the incomplete - *flag. - */ if (args->rmtblkno > 0) error = xfs_attr3_leaf_clearflag(args); - return error; } @@ -404,7 +462,6 @@ xfs_attr_set_args( * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. 
*/ - error = xfs_attr3_leaf_flipflags(args); if (error) return error; @@ -412,29 +469,37 @@ xfs_attr_set_args( * Commit the flag value change and start the next trans in * series. */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; - + dac->dela_state = XFS_DAS_FLIP_LFLAG; + return -EAGAIN; + case XFS_DAS_FLIP_LFLAG: /* * Dismantle the "old" attribute/value pair by removing a * "remote" value (if it exists). */ xfs_attr_restore_rmt_blk(args); + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + /* fallthrough */ + case XFS_DAS_RM_LBLK: + /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ + dac->dela_state = XFS_DAS_RM_LBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); + error = __xfs_attr_rmtval_remove(dac); if (error) return error; - error = xfs_attr_rmtval_remove(args); - if (error) - return error; + dac->dela_state = XFS_DAS_RD_LEAF; + return -EAGAIN; } + /* fallthrough */ + case XFS_DAS_RD_LEAF: /* - * Read in the block containing the "old" attr, then remove the - * "old" attr from that block (neat, huh!) + * This is the last step for leaf format. Read the block with + * the old attr, remove the old attr, check for shortform + * conversion and return. */ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp); @@ -443,97 +508,116 @@ xfs_attr_set_args( xfs_attr3_leaf_remove(bp, args); - /* - * If the result is small enough, shrink it all into the inode. - */ forkoff = xfs_attr_shortform_allfit(bp, dp); if (forkoff) error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ return error; - } -node: - - do { - error = xfs_attr_node_addname_find_attr(args, &state); - if (error) - return error; - error = xfs_attr_node_addname(args, state); - } while (error == -EAGAIN); - if (error) - return error; - - /* - * Commit the leaf addition or btree split and start the next - * trans in the chain. 
- */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } - - if (!(args->op_flags & XFS_DA_OP_RENAME)) { + case XFS_DAS_FOUND_NBLK: /* - * Added a "remote" value, just clear the incomplete flag. + * Find space for remote blocks and fall into the allocation + * state. */ - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_find_space(dac); + if (error) + return error; + } - /* - * If this is an atomic rename operation, we must "flip" the incomplete - * flags on the "new" and "old" attribute/value pairs so that one - * disappears and one appears atomically. Then we must remove the "old" - * attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the "old" attr - * and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - /* - * Commit the flag value change and start the next trans in series - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - goto out; + /* fallthrough */ + case XFS_DAS_ALLOC_NODE: + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. 
+ */ + dac->dela_state = XFS_DAS_ALLOC_NODE; + if (args->rmtblkno > 0) { + if (dac->blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(dac); + if (error) + return error; + return -EAGAIN; + } - /* - * Dismantle the "old" attribute/value pair by removing a "remote" value - * (if it exists). - */ - xfs_attr_restore_rmt_blk(args); + error = xfs_attr_rmtval_set_value(args); + if (error) + return error; + } + + /* + * If this was not a rename, clear the incomplete flag and we're + * done. + */ + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + goto out; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in + * series + */ + dac->dela_state = XFS_DAS_FLIP_NFLAG; + return -EAGAIN; + + case XFS_DAS_FLIP_NFLAG: + /* + * Dismantle the "old" attribute/value pair by removing a + * "remote" value (if it exists). + */ + xfs_attr_restore_rmt_blk(args); - if (args->rmtblkno) { error = xfs_attr_rmtval_invalidate(args); if (error) return error; - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } + /* fallthrough */ + case XFS_DAS_RM_NBLK: + /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ + dac->dela_state = XFS_DAS_RM_NBLK; + if (args->rmtblkno) { + error = __xfs_attr_rmtval_remove(dac); + if (error) + return error; - error = xfs_attr_node_addname_clear_incomplete(args); + dac->dela_state = XFS_DAS_CLR_FLAG; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_CLR_FLAG: + /* + * The last state for node format. 
Look up the old attr and + * remove it. + */ + error = xfs_attr_node_addname_clear_incomplete(dac); + break; + default: + ASSERT(dac->dela_state != XFS_DAS_RM_SHRINK); + break; + } out: return error; } + /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -997,18 +1081,18 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_da_args *args, - struct xfs_da_state **state) + struct xfs_delattr_context *dac) { - int retval; + struct xfs_da_args *args = dac->da_args; + int retval; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - retval = xfs_attr_node_hasname(args, state); + retval = xfs_attr_node_hasname(args, &dac->da_state); if (retval != -ENOATTR && retval != -EEXIST) - goto error; + return retval; if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto error; @@ -1034,8 +1118,8 @@ xfs_attr_node_addname_find_attr( return 0; error: - if (*state) - xfs_da_state_free(*state); + if (dac->da_state) + xfs_da_state_free(dac->da_state); return retval; } @@ -1048,19 +1132,23 @@ error: * * "Remote" attribute values confuse the issue and atomic rename operations * add a whole extra layer of confusion on top of that. + * + * This routine is meant to function as a delayed operation, and may return + * -EAGAIN when the transaction needs to be rolled. Calling functions will need + * to handle this, and recall the function until a successful error code is + * returned. 
*/ STATIC int xfs_attr_node_addname( - struct xfs_da_args *args, - struct xfs_da_state *state) + struct xfs_delattr_context *dac) { - struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - int error; + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state *state = dac->da_state; + struct xfs_da_state_blk *blk; + int error; trace_xfs_attr_node_addname(args); - dp = args->dp; blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1075,20 +1163,17 @@ xfs_attr_node_addname( xfs_da_state_free(state); state = NULL; error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); if (error) goto out; /* - * Commit the node conversion and start the next - * trans in the chain. + * Now that we have converted the leaf to a node, we can + * roll the transaction, and try xfs_attr3_leaf_add + * again on re-entry. No need to set dela_state to do + * this. dela_state is still unset by this function at + * this point. */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - + dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; } @@ -1101,9 +1186,7 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; + dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. @@ -1120,8 +1203,9 @@ out: STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_da_args *args) + struct xfs_delattr_context *dac) { + struct xfs_da_args *args = dac->da_args; struct xfs_da_state *state = NULL; struct xfs_da_state_blk *blk; int retval = 0; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1267ea86ce7a..8de5d1d2733e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -164,6 +164,264 @@ struct xfs_attr_list_context { * v * done * + * + * Below is a state machine diagram for attr set operations. 
+ * + * It seems the challenge with understanding this system comes from trying to + * absorb the state machine all at once, when really one should only be looking + * at it within the context of a single function. Once a state-sensitive + * function is called, the idea is that it "takes ownership" of the + * state machine. It isn't concerned with the states that may have belonged to + * its calling parent, only the states relevant to itself or any other + * subroutines therein. Once a calling function hands off the state machine to + * a subroutine, it needs to respect the simple rule that it doesn't "own" the + * state machine anymore, and it's the responsibility of that calling function + * to propagate the -EAGAIN back up the call stack. Upon reentry, it is + * committed to re-calling that subroutine until it returns something other than + * -EAGAIN. Once that subroutine signals completion (by returning anything other + * than -EAGAIN), the calling function can resume using the state machine. + * + * xfs_attr_set_iter() + * │ + * v + * ┌─y─ has an attr fork? + * │ | + * │ n + * │ | + * │ V + * │ add a fork + * │ │ + * └──────────┤ + * │ + * V + * ┌─── is shortform? + * │ │ + * │ y + * │ │ + * │ V + * │ xfs_attr_set_fmt + * │ | + * │ V + * │ xfs_attr_try_sf_addname + * │ │ + * │ V + * │ had enough ──y──> done + * │ space? + * n │ + * │ n + * │ │ + * │ V + * │ transform to leaf + * │ │ + * │ V + * │ hold the leaf buffer + * │ │ + * │ V + * │ return -EAGAIN + * │ Re-enter in + * │ leaf form + * │ + * └─> release leaf buffer + * if needed + * │ + * V + * ┌───n── fork has + * │ only 1 blk? + * │ │ + * │ y + * │ │ + * │ v + * │ xfs_attr_leaf_try_add() + * │ │ + * │ v + * │ had enough ──────────────y─────────────┐ + * │ space? 
│ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ return -EAGAIN │ + * │ re-enter in │ + * │ node form │ + * │ │ │ + * └──────────┤ │ + * │ │ + * V │ + * xfs_attr_node_addname_find_attr │ + * determines if this │ + * is create or rename │ + * find space to store attr │ + * │ │ + * v │ + * xfs_attr_node_addname │ + * │ │ + * v │ + * fits in a node leaf? ────n─────┐ │ + * │ ^ v │ + * │ │ single leaf node? │ + * │ │ │ │ │ + * y │ y n │ + * │ │ │ │ │ + * v │ v v │ + * update │ grow the leaf split if │ + * hashvals └── return -EAGAIN needed │ + * │ retry leaf add │ │ + * │ on reentry │ │ + * ├────────────────────────────┘ │ + * │ │ + * v │ + * need to alloc │ + * ┌─y── or flip flag? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ done │ + * │ │ + * │ │ + * │ XFS_DAS_FOUND_LBLK <────────────────┘ + * │ │ + * │ V + * │ xfs_attr_leaf_addname() + * │ │ + * │ v + * │ ┌──first time through? + * │ │ │ + * │ │ y + * │ │ │ + * │ n v + * │ │ if we have rmt blks + * │ │ find space for them + * │ │ │ + * │ └──────────┤ + * │ │ + * │ v + * │ still have + * │ ┌─n─ blks to alloc? <──┐ + * │ │ │ │ + * │ │ y │ + * │ │ │ │ + * │ │ v │ + * │ │ alloc one blk │ + * │ │ return -EAGAIN ──┘ + * │ │ re-enter with one + * │ │ less blk to alloc + * │ │ + * │ │ + * │ └───> set the rmt + * │ value + * │ │ + * │ v + * │ was this + * │ a rename? ──n─┐ + * │ │ │ + * │ y │ + * │ │ │ + * │ v │ + * │ flip incomplete │ + * │ flag │ + * │ │ │ + * │ v │ + * │ XFS_DAS_FLIP_LFLAG │ + * │ │ │ + * │ v │ + * │ need to remove │ + * │ old bks? ──n──┤ + * │ │ │ + * │ y │ + * │ │ │ + * │ V │ + * │ remove │ + * │ ┌───> old blks │ + * │ │ │ │ + * │ XFS_DAS_RM_LBLK │ │ + * │ ^ │ │ + * │ │ v │ + * │ └──y── more to │ + * │ remove? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ XFS_DAS_RD_LEAF │ + * │ │ │ + * │ v │ + * │ remove leaf │ + * │ │ │ + * │ v │ + * │ shrink to sf │ + * │ if needed │ + * │ │ │ + * │ v │ + * │ done <──────┘ + * │ + * └──────> XFS_DAS_FOUND_NBLK + * │ + * v + * ┌─────n── need to + * │ alloc blks? 
+ * │ │ + * │ y + * │ │ + * │ v + * │ find space + * │ │ + * │ v + * │ ┌─>XFS_DAS_ALLOC_NODE + * │ │ │ + * │ │ v + * │ │ alloc blk + * │ │ │ + * │ │ v + * │ └──y── need to alloc + * │ more blocks? + * │ │ + * │ n + * │ │ + * │ v + * │ set the rmt value + * │ │ + * │ v + * │ was this + * └────────> a rename? ──n─┐ + * │ │ + * y │ + * │ │ + * v │ + * flip incomplete │ + * flag │ + * │ │ + * v │ + * XFS_DAS_FLIP_NFLAG │ + * │ │ + * v │ + * need to │ + * remove blks? ─n──┤ + * │ │ + * y │ + * │ │ + * v │ + * remove │ + * ┌────────> old blks │ + * │ │ │ + * XFS_DAS_RM_NBLK │ │ + * ^ │ │ + * │ v │ + * └──────y── more to │ + * remove │ + * │ │ + * n │ + * │ │ + * v │ + * XFS_DAS_CLR_FLAG │ + * │ │ + * v │ + * clear flags │ + * │ │ + * ├──────────┘ + * │ + * v + * done */ /* @@ -180,12 +438,22 @@ enum xfs_delattr_state { XFS_DAS_RMTBLK, /* Removing remote blks */ XFS_DAS_RM_NAME, /* Remove attr name */ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ + XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ + XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ + XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ + XFS_DAS_RD_LEAF, /* Read in the new leaf */ + XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ + XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ + XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ + XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ }; /* * Defines for xfs_delattr_context.flags */ #define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ +#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ /* * Context used for keeping track of delayed attribute operations @@ -193,6 +461,11 @@ enum xfs_delattr_state { struct xfs_delattr_context { struct xfs_da_args *da_args; + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + int blkcnt; + /* Used in xfs_attr_node_removename to 
roll through removing blocks */ struct xfs_da_state *da_state; @@ -220,7 +493,6 @@ int xfs_attr_set_args(struct xfs_da_args *args); int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_remove_iter(struct xfs_delattr_context *dac); -int xfs_attr_trans_roll(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); void xfs_delattr_context_init(struct xfs_delattr_context *dac, struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index c26193bc3278..c1b09fa64ea7 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -439,9 +439,9 @@ xfs_attr_rmtval_get( /* * Find a "hole" in the attribute address space large enough for us to drop the - * new attribute's value into + * new attributes value into */ -STATIC int +int xfs_attr_rmt_find_hole( struct xfs_da_args *args) { @@ -468,7 +468,7 @@ xfs_attr_rmt_find_hole( return 0; } -STATIC int +int xfs_attr_rmtval_set_value( struct xfs_da_args *args) { @@ -627,6 +627,69 @@ xfs_attr_rmtval_set( return xfs_attr_rmtval_set_value(args); } +/* + * Find a hole for the attr and store it in the delayed attr context. This + * initializes the context to roll through allocating an attr extent for a + * delayed attr operation + */ +int +xfs_attr_rmtval_find_space( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_bmbt_irec *map = &dac->map; + int error; + + dac->lblkno = 0; + dac->blkcnt = 0; + args->rmtblkcnt = 0; + args->rmtblkno = 0; + memset(map, 0, sizeof(struct xfs_bmbt_irec)); + + error = xfs_attr_rmt_find_hole(args); + if (error) + return error; + + dac->blkcnt = args->rmtblkcnt; + dac->lblkno = args->rmtblkno; + + return 0; +} + +/* + * Write one block of the value associated with an attribute into the + * out-of-line buffer that we have defined for it. 
This is similar to a subset + * of xfs_attr_rmtval_set, but records the current block to the delayed attr + * context, and leaves transaction handling to the caller. + */ +int +xfs_attr_rmtval_set_blk( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec *map = &dac->map; + int nmap; + int error; + + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, + dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + map, &nmap); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); + + /* roll attribute extent map forwards */ + dac->lblkno += map->br_blockcount; + dac->blkcnt -= map->br_blockcount; + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -668,37 +731,6 @@ xfs_attr_rmtval_invalidate( return 0; } -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -int -xfs_attr_rmtval_remove( - struct xfs_da_args *args) -{ - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; - - trace_xfs_attr_rmtval_remove(args); - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - do { - error = __xfs_attr_rmtval_remove(&dac); - if (error && error != -EAGAIN) - break; - - error = xfs_attr_trans_roll(&dac); - if (error) - return error; - } while (true); - - return error; -} - /* * Remove the value associated with an attribute by deleting the out-of-line * buffer that it is stored on. 
Returns -EAGAIN for the caller to refresh the diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 002fd300364d..8ad68d5d4cc2 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -10,9 +10,12 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmt_find_hole(struct xfs_da_args *args); +int xfs_attr_rmtval_set_value(struct xfs_da_args *args); +int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3c1c830befba..96f93a70e820 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1943,7 +1943,6 @@ DEFINE_ATTR_EVENT(xfs_attr_refillstate); DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); -DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ From 0e6acf29db6f463027d1ff7cea86a641da89f0d4 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 20 May 2021 23:51:23 -0700 Subject: [PATCH 016/102] xfs: Remove xfs_attr_rmtval_set This function is no longer used, so it is safe to remove Signed-off-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_remote.c | 66 --------------------------------- fs/xfs/libxfs/xfs_attr_remote.h | 1 - 2 files changed, 67 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index c1b09fa64ea7..0c8bee3abc3b 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -561,72 +561,6 @@ xfs_attr_rmtval_stale( return 0; } -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. - */ -int -xfs_attr_rmtval_set( - struct xfs_da_args *args) -{ - struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; - int nmap; - int error; - - trace_xfs_attr_rmtval_set(args); - - error = xfs_attr_rmt_find_hole(args); - if (error) - return error; - - blkcnt = args->rmtblkcnt; - lblkno = (xfs_dablk_t)args->rmtblkno; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. 
- */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } - - return xfs_attr_rmtval_set_value(args); -} - /* * Find a hole for the attr and store it in the delayed attr context. This * initializes the context to roll through allocating an attr extent for a diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 8ad68d5d4cc2..61b85b918db8 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -9,7 +9,6 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); -int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); From 4fd084dbbd05402bb6e24782b8e9f9ea3e8ab3d6 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 21 May 2021 00:53:40 -0700 Subject: [PATCH 017/102] xfs: Clean up xfs_attr_node_addname_clear_incomplete We can use the helper function xfs_attr_node_remove_name to reduce duplicate code in this function Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_attr.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index df20537c5533..2387a41b705e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -63,6 +63,8 @@ STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, struct xfs_buf **leaf_bp); +STATIC int xfs_attr_node_remove_name(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -1207,7 +1209,6 @@ xfs_attr_node_addname_clear_incomplete( { struct xfs_da_args *args = dac->da_args; struct xfs_da_state *state = NULL; - struct xfs_da_state_blk *blk; int retval = 0; int error = 0; @@ -1222,13 +1223,7 @@ xfs_attr_node_addname_clear_incomplete( if (error) goto out; - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[state->path.active-1]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + error = xfs_attr_node_remove_name(args, state); /* * Check to see if the tree needs to be collapsed. From a7bcb147fef39054fe324a1a988470f5da127196 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:56 -0700 Subject: [PATCH 018/102] xfs: clean up open-coded fs block unit conversions Replace some open-coded fs block unit conversions with the standard conversion macro. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/xfs_iops.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f3254a4f4cb4..04ce361688f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -612,7 +612,7 @@ xfs_inode_validate_extsize( */ if (rt_flag) - blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); else blocksize_bytes = mp->m_sb.sb_blocksize; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index dfe24b7f26e5..93c082db04b7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -543,7 +543,7 @@ xfs_stat_blksize( * always return the realtime extent size. */ if (XFS_IS_REALTIME_INODE(ip)) - return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip)); /* * Allow large block sizes to be reported to userspace programs if the @@ -560,7 +560,7 @@ xfs_stat_blksize( */ if (mp->m_flags & XFS_MOUNT_LARGEIO) { if (mp->m_swidth) - return mp->m_swidth << mp->m_sb.sb_blocklog; + return XFS_FSB_TO_B(mp, mp->m_swidth); if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) return 1U << mp->m_allocsize_log; } From 20bd8e63f30be23ff544d6bd77fc3b933464100b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:56 -0700 Subject: [PATCH 019/102] xfs: remove unnecessary shifts The superblock verifier already validates that (1 << blocklog) == blocksize, so use the value directly instead of doing math. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_bmap_util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0936f3a96fe6..997eb5c6e9b4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -945,7 +945,7 @@ xfs_flush_unmap_range( xfs_off_t rounding, start, end; int error; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE); start = round_down(offset, rounding); end = round_up(offset + len, rounding) - 1; @@ -1053,9 +1053,9 @@ xfs_prepare_shift( * extent (after split) during the shift and corrupt the file. Start * with the block just prior to the start to stabilize the boundary. */ - offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + offset = round_down(offset, mp->m_sb.sb_blocksize); if (offset) - offset -= (1 << mp->m_sb.sb_blocklog); + offset -= mp->m_sb.sb_blocksize; /* * Writeback and invalidate cache for the remainder of the file as we're From 9bbafc71919adfdf83fafd2ce909853b493e7d86 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 020/102] xfs: move xfs_perag_get/put to xfs_ag.[ch] They are AG functions, not superblock functions, so move them to the appropriate location. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ag.c | 135 +++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_ag.h | 10 +++ fs/xfs/libxfs/xfs_ag_resv.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_alloc_btree.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.c | 1 + fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/libxfs/xfs_rmap.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/libxfs/xfs_sb.c | 134 ---------------------------- fs/xfs/libxfs/xfs_sb.h | 9 -- fs/xfs/scrub/agheader.c | 1 + fs/xfs/scrub/agheader_repair.c | 1 + fs/xfs/scrub/common.c | 2 +- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/health.c | 2 +- fs/xfs/scrub/repair.c | 1 + fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_extent_busy.c | 2 +- fs/xfs/xfs_filestream.c | 2 +- fs/xfs/xfs_health.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log_recover.c | 1 + fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_qm.c | 1 + fs/xfs/xfs_reflink.c | 2 +- fs/xfs/xfs_super.c | 1 + 31 files changed, 172 insertions(+), 160 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index c68a36688474..2ca31dc46fe8 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -27,6 +27,141 @@ #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_trace.h" + +/* + * Passive reference counting access wrappers to the perag structures. If the + * per-ag structure is to be freed, the freeing code is responsible for cleaning + * up objects with passive references before freeing the structure. This is + * things like cached buffers. 
+ */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + uint64_t fdblocks; + int error = 0; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. 
+ */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + fdblocks = bfree + bfreelst + btree; + + /* + * If the new summary counts are obviously incorrect, fail the + * mount operation because that implies the AGFs are also corrupt. + * Clear FS_COUNTERS so that we don't unmount with a dirty log, which + * will prevent xfs_repair from fixing anything. + */ + if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { + xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + error = -EFSCORRUPTED; + goto out; + } + + /* Overwrite incore superblock counters with just-read data */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = fdblocks; + spin_unlock(&mp->m_sb_lock); + + xfs_reinit_percpu_counters(mp); +out: + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); + return error; +} static int xfs_get_aghdr_buf( diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4535de1d88ea..cb1bd1c03cd7 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -9,6 +9,16 @@ struct xfs_mount; struct xfs_trans; +struct xfs_perag; + +/* + * perag get/put wrappers for ref counting + */ +int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); +struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +void xfs_perag_put(struct xfs_perag *pag); struct aghdr_init_data { /* per ag data */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e32a1833d523..2e3dcdfd4984 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -19,7 +19,7 @@ #include 
"xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 82b7cbb1f24f..dc2b77829915 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -10,7 +10,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_btree.h" @@ -24,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a43e4c50e69b..a540b6e799e0 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" @@ -19,6 +18,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_ag.h" STATIC struct xfs_btree_cur * diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 556184b63061..aa371d005131 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -27,6 +27,7 @@ #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_log.h" +#include "xfs_ag.h" /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7e3b9b01431e..2086c55b67bd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -31,6 +31,7 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index eefdb518fe64..8dc9225a5353 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ 
b/fs/xfs/libxfs/xfs_ialloc.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -27,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" +#include "xfs_ag.h" /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index a6ac60ae9421..b281f0c674f5 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" @@ -20,6 +19,7 @@ #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_rmap.h" +#include "xfs_ag.h" static struct xfs_btree_cur * xfs_refcountbt_dup_cursor( diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 10e0cf9949a2..61e8f10436ac 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -21,6 +21,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_inode.h" +#include "xfs_ag.h" /* * Lookup the first record less than or equal to [bno, len, owner, offset] diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9f5bcbd834c3..f1fee42dda2d 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_alloc.h" @@ -20,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dfbbcbd448c1..99dc905b4f89 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -15,7 +15,6 @@ #include 
"xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" @@ -30,67 +29,6 @@ * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ -/* - * Reference counting access wrappers to the perag structures. - * Because we never free per-ag structures, the only thing we - * have to protect against changes is the tree structure itself. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ref = 0; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); - } - rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - int ref; - - ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); -} - /* Check all the superblock fields we care about when reading one in. 
*/ STATIC int xfs_validate_sb_read( @@ -841,78 +779,6 @@ xfs_sb_mount_common( mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } -/* - * xfs_initialize_perag_data - * - * Read in each per-ag structure so we can count up the number of - * allocated inodes, free inodes and used filesystem blocks as this - * information is no longer persistent in the superblock. Once we have - * this information, write it into the in-core superblock structure. - */ -int -xfs_initialize_perag_data( - struct xfs_mount *mp, - xfs_agnumber_t agcount) -{ - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - uint64_t fdblocks; - int error = 0; - - for (index = 0; index < agcount; index++) { - /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. - */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) - return error; - - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); - ifree += pag->pagi_freecount; - ialloc += pag->pagi_count; - bfree += pag->pagf_freeblks; - bfreelst += pag->pagf_flcount; - btree += pag->pagf_btreeblks; - xfs_perag_put(pag); - } - fdblocks = bfree + bfreelst + btree; - - /* - * If the new summary counts are obviously incorrect, fail the - * mount operation because that implies the AGFs are also corrupt. - * Clear FS_COUNTERS so that we don't unmount with a dirty log, which - * will prevent xfs_repair from fixing anything. - */ - if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { - xfs_alert(mp, "AGF corruption. 
Please run xfs_repair."); - error = -EFSCORRUPTED; - goto out; - } - - /* Overwrite incore superblock counters with just-read data */ - spin_lock(&mp->m_sb_lock); - sbp->sb_ifree = ifree; - sbp->sb_icount = ialloc; - sbp->sb_fdblocks = fdblocks; - spin_unlock(&mp->m_sb_lock); - - xfs_reinit_percpu_counters(mp); -out: - xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); - return error; -} - /* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index f79f9dc632b6..0c1602d9b53d 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -13,15 +13,6 @@ struct xfs_trans; struct xfs_fsop_geom; struct xfs_perag; -/* - * perag get/put wrappers for ref counting - */ -extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); -extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); -extern void xfs_perag_put(struct xfs_perag *pag); -extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); - extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern int xfs_sync_sb_buf(struct xfs_mount *mp); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 7a2f9b5f2db5..64a7a30f4ac0 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -14,6 +14,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 23690f824ffa..1cdfbd57f36b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -20,6 +20,7 @@ #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff 
--git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index aa874607618a..c8da976b50fc 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -12,7 +12,6 @@ #include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_alloc.h" @@ -26,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index f1d1a8c58853..453ae9adf94c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -9,11 +9,11 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" #include "xfs_btree.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 3de59b5c2ce6..2e61df3bca83 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -8,7 +8,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_btree.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c2857d854c83..1308b62a8170 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -22,6 +22,7 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_quota.h" #include "scrub/scrub.h" diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 592800c8852f..5788b92fc0d2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -10,7 +10,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include 
"xfs_trace.h" #include "xfs_log.h" @@ -19,6 +18,7 @@ #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_ag.h" static kmem_zone_t *xfs_buf_zone; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f979d0d7e6cd..3bf6dba1a040 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -8,7 +8,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" @@ -18,6 +17,7 @@ #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_ag.h" STATIC int xfs_trim_extents( diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index a4075685d9eb..cb037d7c72b2 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -11,13 +11,13 @@ #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_log.h" +#include "xfs_ag.h" void xfs_extent_busy_insert( diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index db23e455eb91..eed6ca5f8f91 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -9,13 +9,13 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" #include "xfs_trace.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trans.h" #include "xfs_filestream.h" diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 8e0cb05a7142..b79475ea3dbd 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -9,11 +9,11 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include 
"xfs_trace.h" #include "xfs_health.h" +#include "xfs_ag.h" /* * Warn about metadata corruption that we detected but haven't fixed, and diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3c81daca0e9a..588ea2bf88bb 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -23,6 +22,7 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_ialloc.h" +#include "xfs_ag.h" #include diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0369eb22c1bb..4d397c29ff83 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -11,7 +11,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" @@ -35,6 +34,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" +#include "xfs_ag.h" kmem_zone_t *xfs_inode_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e5dd1c0c2f03..fee2a4e80241 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -25,6 +25,7 @@ #include "xfs_icache.h" #include "xfs_error.h" #include "xfs_buf_item.h" +#include "xfs_ag.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bdfee1943796..21c630dde476 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -32,6 +32,7 @@ #include "xfs_extent_busy.h" #include "xfs_health.h" #include "xfs_trace.h" +#include "xfs_ag.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 4bf949a89d0d..f7baf4dc2554 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -23,6 +23,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_error.h" +#include "xfs_ag.h" /* * The global quota manager. 
There is only one of these for the entire diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 060695d6d56a..f297d68a931b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -27,7 +27,7 @@ #include "xfs_quota.h" #include "xfs_reflink.h" #include "xfs_iomap.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a2dab05332ac..688309dbe18b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -36,6 +36,7 @@ #include "xfs_bmap_item.h" #include "xfs_reflink.h" #include "xfs_pwork.h" +#include "xfs_ag.h" #include #include From 61aa005a5bd7705e0bdca8b40c694369d40fb93f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 021/102] xfs: prepare for moving perag definitions and support to libxfs The perag structures really need to be defined with the rest of the AG support infrastructure. The struct xfs_perag and init/teardown has been placed in xfs_mount.[ch] because there are differences in the structure between kernel and userspace. Mainly that userspace doesn't have a lot of the internal stuff that the kernel has for caches and discard and other such structures. However, it makes more sense to move this to libxfs than to keep this separation because we are now moving to use struct perags everywhere in the code instead of passing raw agnumber_t values about. Hence we should really move the support infrastructure to libxfs/xfs_ag.[ch]. To do this without breaking userspace, first we need to rearrange the structures and code so that all the kernel specific code is located together. This makes it simple for userspace to ifdef out all the parts it does not need, minimising the code differences between kernel and userspace. The next commit will do the move... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J.
Wong --- fs/xfs/xfs_mount.c | 56 ++++++++++++++++++++++++++-------------------- fs/xfs/xfs_mount.h | 19 ++++++++-------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 21c630dde476..6966d7b12a13 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -148,9 +148,11 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); + call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -175,14 +177,14 @@ xfs_sb_validate_fsb_count( int xfs_initialize_perag( - xfs_mount_t *mp, - xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi) + struct xfs_mount *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) { - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; - xfs_perag_t *pag; - int error = -ENOMEM; + struct xfs_perag *pag; + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = NULLAGNUMBER; + int error; /* * Walk the current per-ag tree so we don't try to initialise AGs @@ -203,21 +205,10 @@ xfs_initialize_perag( } pag->pag_agno = index; pag->pag_mount = mp; - spin_lock_init(&pag->pag_ici_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - - error = xfs_buf_hash_init(pag); - if (error) - goto out_free_pag; - init_waitqueue_head(&pag->pagb_wait); - spin_lock_init(&pag->pagb_lock); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; error = radix_tree_preload(GFP_NOFS); if (error) - goto out_hash_destroy; + goto out_free_pag; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { @@ -225,17 +216,32 @@ xfs_initialize_perag( spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; - goto out_hash_destroy; + goto out_free_pag; } spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); - /* first new pag is fully 
initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; + + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pagb_lock); + spin_lock_init(&pag->pag_state_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + init_waitqueue_head(&pag->pagb_wait); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + + error = xfs_buf_hash_init(pag); + if (error) + goto out_remove_pag; + error = xfs_iunlink_init(pag); if (error) goto out_hash_destroy; - spin_lock_init(&pag->pag_state_lock); + + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; } index = xfs_set_inode_alloc(mp, agcount); @@ -248,6 +254,8 @@ xfs_initialize_perag( out_hash_destroy: xfs_buf_hash_destroy(pag); +out_remove_pag: + radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: kmem_free(pag); out_unwind_new_pags: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb67274ee23f..6e534be5eea8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -338,6 +338,16 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; + int pagb_count; /* pagb slots in use */ + uint8_t pagf_refcount_level; /* recount btree height */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; + + /* -- kernel only structures below this line -- */ + /* * Bitsets of per-ag metadata that have been checked and/or are sick. * Callers should hold pag_state_lock before accessing this field. @@ -364,19 +374,10 @@ typedef struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; - int pagb_count; /* pagb slots in use */ - - /* Blocks reserved for all kinds of metadata. 
*/ - struct xfs_ag_resv pag_meta_resv; - /* Blocks reserved for the reverse mapping btree. */ - struct xfs_ag_resv pag_rmapbt_resv; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - /* reference count */ - uint8_t pagf_refcount_level; - /* * Unlinked inode information. This incore information reflects * data stored in the AGI, so callers must hold the AGI buffer lock From 07b6403a6873045344b0c18cbb4a4360854f6d76 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 022/102] xfs: move perag structure and setup to libxfs/xfs_ag.[ch] Move the xfs_perag infrastructure to the libxfs files that contain all the per AG infrastructure. This helps set up for passing perags around all the code instead of bare agnos with minimal extra includes for existing files. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 135 ++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_ag.h | 98 +++++++++++++++++++++++++- fs/xfs/libxfs/xfs_ag_resv.h | 15 ++++ fs/xfs/libxfs/xfs_btree.c | 1 + fs/xfs/xfs_mount.c | 133 ----------------------------------- fs/xfs/xfs_mount.h | 111 +---------------------------- fs/xfs/xfs_trace.c | 2 + 7 files changed, 252 insertions(+), 243 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 2ca31dc46fe8..97fb160e01de 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -28,6 +28,9 @@ #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" + /* * Passive reference counting access wrappers to the perag structures. 
If the @@ -163,6 +166,138 @@ out: return error; } +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. + */ +void +xfs_free_perag( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_iunlink_destroy(pag); + xfs_buf_hash_destroy(pag); + + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = NULLAGNUMBER; + int error; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. 
+ */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) { + error = -ENOMEM; + goto out_unwind_new_pags; + } + pag->pag_agno = index; + pag->pag_mount = mp; + + error = radix_tree_preload(GFP_NOFS); + if (error) + goto out_free_pag; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + WARN_ON_ONCE(1); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_free_pag; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pagb_lock); + spin_lock_init(&pag->pag_state_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + init_waitqueue_head(&pag->pagb_wait); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + + error = xfs_buf_hash_init(pag); + if (error) + goto out_remove_pag; + + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; + + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; + } + + index = xfs_set_inode_alloc(mp, agcount); + + if (maxagi) + *maxagi = index; + + mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); + return 0; + +out_hash_destroy: + xfs_buf_hash_destroy(pag); +out_remove_pag: + radix_tree_delete(&mp->m_perag_tree, index); +out_free_pag: + kmem_free(pag); +out_unwind_new_pags: + /* unwind any prior newly initialized pags */ + for (index = first_initialised; index < agcount; index++) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); + kmem_free(pag); + } + return error; +} + static int xfs_get_aghdr_buf( struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_ag.h 
b/fs/xfs/libxfs/xfs_ag.h index cb1bd1c03cd7..ec37f9d9f310 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -12,9 +12,103 @@ struct xfs_trans; struct xfs_perag; /* - * perag get/put wrappers for ref counting + * Per-ag infrastructure */ -int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +/* per-AG block reservation data structures*/ +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. + */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ + uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. 
+ * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + + int pagb_count; /* pagb slots in use */ + uint8_t pagf_refcount_level; /* recount btree height */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; + + /* -- kernel only structures below this line -- */ + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold pag_state_lock before accessing this field. + */ + uint16_t pag_checked; + uint16_t pag_sick; + spinlock_t pag_state_lock; + + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; /* generation count for pagb_tree */ + wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. 
+ */ + struct rhashtable pagi_unlinked_hash; +} xfs_perag_t; + +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); +int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); +void xfs_free_perag(struct xfs_mount *mp); + struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, int tag); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index 8a8eb4bc48bb..b74b210008ea 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -18,6 +18,21 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, struct xfs_trans *tp, xfs_extlen_t len); +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_RMAPBT: + return &pag->pag_rmapbt_resv; + default: + return NULL; + } +} + /* * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from * the AGFL, they are allocated one at a time and the reservation updates don't diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5b6fcb9b44e2..0f12b885600d 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -21,6 +21,7 @@ #include "xfs_alloc.h" #include "xfs_log.h" #include "xfs_btree_staging.h" +#include "xfs_ag.h" /* * Cursor allocation zone. 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6966d7b12a13..c3a96fb3ad80 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -120,43 +120,6 @@ xfs_uuid_unmount( mutex_unlock(&xfs_uuid_table_mutex); } - -STATIC void -__xfs_free_perag( - struct rcu_head *head) -{ - struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); - - ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); - kmem_free(pag); -} - -/* - * Free up the per-ag resources associated with the mount structure. - */ -STATIC void -xfs_free_perag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - struct xfs_perag *pag; - - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, agno); - spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); - xfs_buf_hash_destroy(pag); - - call_rcu(&pag->rcu_head, __xfs_free_perag); - } -} - /* * Check size of device based on the (data/realtime) block count. * Note: this check is used by the growfs code as well as mount. @@ -175,102 +138,6 @@ xfs_sb_validate_fsb_count( return 0; } -int -xfs_initialize_perag( - struct xfs_mount *mp, - xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi) -{ - struct xfs_perag *pag; - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; - int error; - - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. 
- */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - - error = radix_tree_preload(GFP_NOFS); - if (error) - goto out_free_pag; - - spin_lock(&mp->m_perag_lock); - if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - WARN_ON_ONCE(1); - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - error = -EEXIST; - goto out_free_pag; - } - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - - /* Place kernel structure only init below this point. */ - spin_lock_init(&pag->pag_ici_lock); - spin_lock_init(&pag->pagb_lock); - spin_lock_init(&pag->pag_state_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - init_waitqueue_head(&pag->pagb_wait); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; - - error = xfs_buf_hash_init(pag); - if (error) - goto out_remove_pag; - - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; - } - - index = xfs_set_inode_alloc(mp, agcount); - - if (maxagi) - *maxagi = index; - - mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); - return 0; - -out_hash_destroy: - xfs_buf_hash_destroy(pag); -out_remove_pag: - radix_tree_delete(&mp->m_perag_tree, index); -out_free_pag: - kmem_free(pag); -out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); - kmem_free(pag); - } - return error; -} - /* * xfs_readsb * diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 
6e534be5eea8..c78b63fe779a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -12,6 +12,7 @@ struct xfs_mru_cache; struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; +struct xfs_perag; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { @@ -297,118 +298,12 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -/* per-AG block reservation data structures*/ -struct xfs_ag_resv { - /* number of blocks originally reserved here */ - xfs_extlen_t ar_orig_reserved; - /* number of blocks reserved here */ - xfs_extlen_t ar_reserved; - /* number of blocks originally asked for */ - xfs_extlen_t ar_asked; -}; - -/* - * Per-ag incore structure, copies of information in agf and agi, to improve the - * performance of allocation group selection. - */ -typedef struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ - uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ - bool pagf_agflreset; /* agfl requires reset before use */ - uint32_t pagf_flcount; /* count of blocks in freelist */ - xfs_extlen_t pagf_freeblks; /* total free blocks */ - xfs_extlen_t pagf_longest; /* longest free space */ - uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ - xfs_agino_t pagi_freecount; /* number of free inodes */ - xfs_agino_t pagi_count; /* number of allocated inodes */ - - /* - * Inode allocation search lookup optimisation. 
- * If the pagino matches, the search for new inodes - * doesn't need to search the near ones again straight away - */ - xfs_agino_t pagl_pagino; - xfs_agino_t pagl_leftrec; - xfs_agino_t pagl_rightrec; - - int pagb_count; /* pagb slots in use */ - uint8_t pagf_refcount_level; /* recount btree height */ - - /* Blocks reserved for all kinds of metadata. */ - struct xfs_ag_resv pag_meta_resv; - /* Blocks reserved for the reverse mapping btree. */ - struct xfs_ag_resv pag_rmapbt_resv; - - /* -- kernel only structures below this line -- */ - - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. - * Callers should hold pag_state_lock before accessing this field. - */ - uint16_t pag_checked; - uint16_t pag_sick; - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - - spinlock_t pag_ici_lock; /* incore inode cache lock */ - struct radix_tree_root pag_ici_root; /* incore inode cache root */ - int pag_ici_reclaimable; /* reclaimable inodes */ - unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; - - /* for rcu-safe freeing */ - struct rcu_head rcu_head; - - /* background prealloc block trimming */ - struct delayed_work pag_blockgc_work; - - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. 
- */ - struct rhashtable pagi_unlinked_hash; -} xfs_perag_t; - -static inline struct xfs_ag_resv * -xfs_perag_resv( - struct xfs_perag *pag, - enum xfs_ag_resv_type type) -{ - switch (type) { - case XFS_AG_RESV_METADATA: - return &pag->pag_meta_resv; - case XFS_AG_RESV_RMAPBT: - return &pag->pag_rmapbt_resv; - default: - return NULL; - } -} - -int xfs_buf_hash_init(xfs_perag_t *pag); -void xfs_buf_hash_destroy(xfs_perag_t *pag); +int xfs_buf_hash_init(struct xfs_perag *pag); +void xfs_buf_hash_destroy(struct xfs_perag *pag); extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); -extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9b8d703dc9fd..7e01e00550ac 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -30,6 +30,8 @@ #include "xfs_fsmap.h" #include "xfs_btree_staging.h" #include "xfs_icache.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" /* * We include this last to have the helpers above available for the trace From f250eedcf7621b9a56d563912b4eeacd524422c7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 023/102] xfs: make for_each_perag... a first class citizen for_each_perag_tag() is defined in xfs_icache.c for local use. Promote this to xfs_ag.h and define equivalent iteration functions so that we can use them to iterate AGs instead of open coding perag walks and perag lookups. We also convert as many of the straightforward open coded AG walks to use these iterators as possible. Anything that is not a direct conversion to an iterator is ignored and will be updated in future commits. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ag.h | 17 +++++++++++++++++ fs/xfs/scrub/fscounters.c | 40 +++++++++++++++------------------------ fs/xfs/xfs_extent_busy.c | 7 ++----- fs/xfs/xfs_fsops.c | 8 ++------ fs/xfs/xfs_health.c | 4 +--- fs/xfs/xfs_icache.c | 15 ++------------- 6 files changed, 39 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index ec37f9d9f310..33783120263c 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -114,6 +114,23 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, int tag); void xfs_perag_put(struct xfs_perag *pag); +/* + * Perag iteration APIs + */ +#define for_each_perag(mp, next_agno, pag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get((mp), 0); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get((mp), (next_agno))) + +#define for_each_perag_tag(mp, next_agno, pag, tag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + struct aghdr_init_data { /* per ag data */ xfs_agblock_t agno; /* ag to init */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 453ae9adf94c..fd7941e04ae1 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -71,11 +71,11 @@ xchk_fscount_warmup( xfs_agnumber_t agno; int error = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); - + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; if (pag->pagi_init && pag->pagf_init) - goto next_loop_perag; + continue; /* Lock both AG headers. */ error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); @@ -89,21 +89,15 @@ xchk_fscount_warmup( * These are supposed to be initialized by the header read * function. 
*/ - error = -EFSCORRUPTED; - if (!pag->pagi_init || !pag->pagf_init) + if (!pag->pagi_init || !pag->pagf_init) { + error = -EFSCORRUPTED; break; + } xfs_buf_relse(agf_bp); agf_bp = NULL; xfs_buf_relse(agi_bp); agi_bp = NULL; -next_loop_perag: - xfs_perag_put(pag); - pag = NULL; - error = 0; - - if (xchk_should_terminate(sc, &error)) - break; } if (agf_bp) @@ -196,13 +190,14 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; /* This somehow got unset since the warmup? */ if (!pag->pagi_init || !pag->pagf_init) { - xfs_perag_put(pag); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } /* Count all the inodes */ @@ -216,10 +211,8 @@ retry: fsc->fdblocks += pag->pagf_btreeblks; } else { error = xchk_fscount_btreeblks(sc, fsc, agno); - if (error) { - xfs_perag_put(pag); + if (error) break; - } } /* @@ -229,12 +222,9 @@ retry: fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; - xfs_perag_put(pag); - - if (xchk_should_terminate(sc, &error)) - break; } - + if (pag) + xfs_perag_put(pag); if (error) return error; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index cb037d7c72b2..422667e0668b 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -605,12 +605,11 @@ void xfs_extent_busy_wait_all( struct xfs_mount *mp) { + struct xfs_perag *pag; DEFINE_WAIT (wait); xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - struct xfs_perag *pag = xfs_perag_get(mp, agno); - + for_each_perag(mp, agno, pag) { do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (RB_EMPTY_ROOT(&pag->pagb_tree)) @@ -618,8 +617,6 @@ xfs_extent_busy_wait_all( schedule(); } while (1); finish_wait(&pag->pagb_wait, &wait); - - xfs_perag_put(pag); } } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 
be9cf88d2ad7..07c745cd483e 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -576,10 +576,8 @@ xfs_fs_reserve_ag_blocks( int err2; mp->m_finobt_nores = false; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_init(pag, NULL); - xfs_perag_put(pag); if (err2 && !error) error = err2; } @@ -605,10 +603,8 @@ xfs_fs_unreserve_ag_blocks( int error = 0; int err2; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_free(pag); - xfs_perag_put(pag); if (err2 && !error) error = err2; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index b79475ea3dbd..5de3195f6cb2 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -34,14 +34,12 @@ xfs_health_unmount( return; /* Measure AG corruption levels. */ - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { xfs_ag_measure_sickness(pag, &sick, &checked); if (sick) { trace_xfs_ag_unfixed_corruption(mp, agno, sick); warn = true; } - xfs_perag_put(pag); } /* Measure realtime volume corruption levels. 
*/ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 588ea2bf88bb..7dad83a6f586 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1061,15 +1061,13 @@ xfs_reclaim_inodes_ag( int *nr_to_scan) { struct xfs_perag *pag; - xfs_agnumber_t ag = 0; + xfs_agnumber_t agno; - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + for_each_perag_tag(mp, agno, pag, XFS_ICI_RECLAIM_TAG) { unsigned long first_index = 0; int done = 0; int nr_found = 0; - ag = pag->pag_agno + 1; - first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; @@ -1134,7 +1132,6 @@ xfs_reclaim_inodes_ag( if (done) first_index = 0; WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); - xfs_perag_put(pag); } } @@ -1554,14 +1551,6 @@ xfs_inode_clear_cowblocks_tag( return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); } -#define for_each_perag_tag(mp, next_agno, pag, tag) \ - for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ - (pag) != NULL; \ - (next_agno) = (pag)->pag_agno + 1, \ - xfs_perag_put(pag), \ - (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) - - /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( From 934933c3eec9e4a5826d3d7a47aca0742337fded Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 024/102] xfs: convert raw ag walks to use for_each_perag Convert the raw walks to an iterator, pulling the current AG out of pag->pag_agno instead of the loop iterator variable. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_types.c | 4 ++- fs/xfs/scrub/bmap.c | 13 +++++---- fs/xfs/xfs_log_recover.c | 55 ++++++++++++++++++--------------------- fs/xfs/xfs_reflink.c | 9 ++++--- 4 files changed, 43 insertions(+), 38 deletions(-) diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 04801362e1a7..e8f4abee7892 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -11,6 +11,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_ag.h" /* Find the size of the AG, in blocks. */ inline xfs_agblock_t @@ -222,12 +223,13 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; + struct xfs_perag *pag; xfs_agnumber_t agno; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag(mp, agno, pag) { xfs_agino_t first, last; xfs_agino_range(mp, agno, &first, &last); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b5ebf1d1b4db..e457c086887f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -22,6 +22,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* Set us up with an inode's bmap. 
*/ int @@ -575,6 +576,7 @@ xchk_bmap_check_rmaps( int whichfork) { struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_perag *pag; xfs_agnumber_t agno; bool zero_size; int error; @@ -607,15 +609,16 @@ xchk_bmap_check_rmaps( (zero_size || ifp->if_nextents > 0)) return 0; - for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { - error = xchk_bmap_check_ag_rmaps(sc, whichfork, agno); + for_each_perag(sc->mp, agno, pag) { + error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag->pag_agno); if (error) - return error; + break; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) break; } - - return 0; + if (pag) + xfs_perag_put(pag); + return error; } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fee2a4e80241..1227503d2246 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2742,21 +2742,17 @@ STATIC void xlog_recover_process_iunlinks( struct xlog *log) { - xfs_mount_t *mp; - xfs_agnumber_t agno; - xfs_agi_t *agi; - struct xfs_buf *agibp; - xfs_agino_t agino; - int bucket; - int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t agino; + int bucket; + int error; - mp = log->l_mp; - - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - /* - * Find the agi for this ag. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); + for_each_perag(mp, agno, pag) { + error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); if (error) { /* * AGI is b0rked. Don't process it. 
@@ -2782,7 +2778,7 @@ xlog_recover_process_iunlinks( agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, - agno, agino, bucket); + pag->pag_agno, agino, bucket); cond_resched(); } } @@ -3494,27 +3490,28 @@ xlog_recover_cancel( */ STATIC void xlog_recover_check_summary( - struct xlog *log) + struct xlog *log) { - xfs_mount_t *mp; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_perag *pag; + struct xfs_buf *agfbp; + struct xfs_buf *agibp; + xfs_agnumber_t agno; + uint64_t freeblks; + uint64_t itotal; + uint64_t ifree; + int error; mp = log->l_mp; freeblks = 0LL; itotal = 0LL; ifree = 0LL; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + for_each_perag(mp, agno, pag) { + error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); if (error) { xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, agno, error); + __func__, pag->pag_agno, error); } else { struct xfs_agf *agfp = agfbp->b_addr; @@ -3523,10 +3520,10 @@ xlog_recover_check_summary( xfs_buf_relse(agfbp); } - error = xfs_read_agi(mp, NULL, agno, &agibp); + error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); if (error) { xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, agno, error); + __func__, pag->pag_agno, error); } else { struct xfs_agi *agi = agibp->b_addr; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f297d68a931b..0e430b0c1b16 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -755,16 +755,19 @@ int xfs_reflink_recover_cow( struct xfs_mount *mp) { + struct xfs_perag *pag; xfs_agnumber_t agno; int error = 0; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - error = xfs_refcount_recover_cow_leftovers(mp, agno); - if 
(error) + for_each_perag(mp, agno, pag) { + error = xfs_refcount_recover_cow_leftovers(mp, pag->pag_agno); + if (error) { + xfs_perag_put(pag); break; + } } return error; From 6f4118fc6482b1989cdcb19a1a0ab53b2dca7ab9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 025/102] xfs: convert xfs_iwalk to use perag references Rather than manually walking the ags and passing agnumbers around, pass the perag for the AG we are currently working on around in the iwalk structure. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.h | 16 +++++--- fs/xfs/xfs_iwalk.c | 86 ++++++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 33783120263c..f87a60a4a849 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -117,19 +117,23 @@ void xfs_perag_put(struct xfs_perag *pag); /* * Perag iteration APIs */ -#define for_each_perag(mp, next_agno, pag) \ - for ((next_agno) = 0, (pag) = xfs_perag_get((mp), 0); \ +#define for_each_perag_from(mp, next_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (next_agno)); \ (pag) != NULL; \ (next_agno) = (pag)->pag_agno + 1, \ xfs_perag_put(pag), \ (pag) = xfs_perag_get((mp), (next_agno))) -#define for_each_perag_tag(mp, next_agno, pag, tag) \ - for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ +#define for_each_perag(mp, agno, pag) \ + (agno) = 0; \ + for_each_perag_from((mp), (agno), (pag)) + +#define for_each_perag_tag(mp, agno, pag, tag) \ + for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ (pag) != NULL; \ - (next_agno) = (pag)->pag_agno + 1, \ + (agno) = (pag)->pag_agno + 1, \ xfs_perag_put(pag), \ - (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + (pag) = xfs_perag_get_tag((mp), (agno), (tag))) struct aghdr_init_data { /* per ag data */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 
c4a340f1f1e1..c7e8f48a3ec4 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -21,6 +21,7 @@ #include "xfs_health.h" #include "xfs_trans.h" #include "xfs_pwork.h" +#include "xfs_ag.h" /* * Walking Inodes in the Filesystem @@ -51,6 +52,7 @@ struct xfs_iwalk_ag { struct xfs_mount *mp; struct xfs_trans *tp; + struct xfs_perag *pag; /* Where do we start the traversal? */ xfs_ino_t startino; @@ -90,7 +92,7 @@ struct xfs_iwalk_ag { STATIC void xfs_iwalk_ichunk_ra( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -106,7 +108,7 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, agno, agbno, + xfs_btree_reada_bufs(mp, pag->pag_agno, agbno, igeo->blocks_per_cluster, &xfs_inode_buf_ops); } @@ -174,26 +176,25 @@ xfs_iwalk_free( /* For each inuse inode in each cached inobt record, call our function. */ STATIC int xfs_iwalk_ag_recs( - struct xfs_iwalk_ag *iwag) + struct xfs_iwalk_ag *iwag) { - struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; - xfs_ino_t ino; - unsigned int i, j; - xfs_agnumber_t agno; - int error; + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; + xfs_ino_t ino; + unsigned int i, j; + int error; - agno = XFS_INO_TO_AGNO(mp, iwag->startino); for (i = 0; i < iwag->nr_recs; i++) { struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; - trace_xfs_iwalk_ag_rec(mp, agno, irec); + trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec); if (xfs_pwork_want_abort(&iwag->pwork)) return 0; if (iwag->inobt_walk_fn) { - error = iwag->inobt_walk_fn(mp, tp, agno, irec, + error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec, iwag->data); if (error) return error; @@ -211,7 +212,8 @@ xfs_iwalk_ag_recs( continue; /* Otherwise call our function. 
*/ - ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + irec->ir_startino + j); error = iwag->iwalk_fn(mp, tp, ino, iwag->data); if (error) return error; @@ -257,7 +259,6 @@ xfs_iwalk_del_inobt( STATIC int xfs_iwalk_ag_start( struct xfs_iwalk_ag *iwag, - xfs_agnumber_t agno, xfs_agino_t agino, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, @@ -265,12 +266,14 @@ xfs_iwalk_ag_start( { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; struct xfs_inobt_rec_incore *irec; int error; /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, pag->pag_agno, XFS_BTNUM_INO, + curpp, agi_bpp); if (error) return error; @@ -304,7 +307,7 @@ xfs_iwalk_ag_start( if (XFS_IS_CORRUPT(mp, *has_more != 1)) return -EFSCORRUPTED; - iwag->lastino = XFS_AGINO_TO_INO(mp, agno, + iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); /* @@ -345,7 +348,6 @@ out_advance: STATIC int xfs_iwalk_run_callbacks( struct xfs_iwalk_ag *iwag, - xfs_agnumber_t agno, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, int *has_more) @@ -376,7 +378,8 @@ xfs_iwalk_run_callbacks( return 0; /* ...and recreate the cursor just past where we left off. */ - error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, iwag->pag->pag_agno, XFS_BTNUM_INO, + curpp, agi_bpp); if (error) return error; @@ -390,17 +393,17 @@ xfs_iwalk_ag( { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; - xfs_agnumber_t agno; xfs_agino_t agino; int has_more; int error = 0; /* Set up our cursor at the right place in the inode btree. 
*/ - agno = XFS_INO_TO_AGNO(mp, iwag->startino); + ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino)); agino = XFS_INO_TO_AGINO(mp, iwag->startino); - error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); + error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); while (!error && has_more) { struct xfs_inobt_rec_incore *irec; @@ -417,7 +420,7 @@ xfs_iwalk_ag( break; /* Make sure that we always move forward. */ - rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); + rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { error = -EFSCORRUPTED; @@ -438,7 +441,7 @@ xfs_iwalk_ag( * walking the inodes. */ if (iwag->iwalk_fn) - xfs_iwalk_ichunk_ra(mp, agno, irec); + xfs_iwalk_ichunk_ra(mp, pag, irec); /* * If there's space in the buffer for more records, increment @@ -458,15 +461,14 @@ xfs_iwalk_ag( * we would be if we had been able to increment like above. */ ASSERT(has_more); - error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, - &has_more); + error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); } if (iwag->nr_recs == 0 || error) goto out; /* Walk the unprocessed records in the cache. 
*/ - error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); + error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); out: xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); @@ -555,6 +557,7 @@ xfs_iwalk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -565,15 +568,19 @@ xfs_iwalk( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { + iwag.pag = pag; error = xfs_iwalk_ag(&iwag); if (error) break; iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; + iwag.pag = NULL; } + if (iwag.pag) + xfs_perag_put(pag); xfs_iwalk_free(&iwag); return error; } @@ -598,6 +605,7 @@ xfs_iwalk_ag_work( error = xfs_iwalk_ag(iwag); xfs_iwalk_free(iwag); out: + xfs_perag_put(iwag->pag); kmem_free(iwag); return error; } @@ -617,6 +625,7 @@ xfs_iwalk_threaded( void *data) { struct xfs_pwork_ctl pctl; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -627,7 +636,7 @@ xfs_iwalk_threaded( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { struct xfs_iwalk_ag *iwag; if (xfs_pwork_ctl_want_abort(&pctl)) @@ -635,17 +644,25 @@ xfs_iwalk_threaded( iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); iwag->mp = mp; + + /* + * perag is being handed off to async work, so take another + * reference for the async work to release. 
+ */ + atomic_inc(&pag->pag_ref); + iwag->pag = pag; iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); - startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; } - + if (pag) + xfs_perag_put(pag); if (polled) xfs_pwork_poll(&pctl); return xfs_pwork_destroy(&pctl); @@ -715,6 +732,7 @@ xfs_inobt_walk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -725,15 +743,19 @@ xfs_inobt_walk( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { + iwag.pag = pag; error = xfs_iwalk_ag(&iwag); if (error) break; - iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; + iwag.pag = NULL; } + if (iwag.pag) + xfs_perag_put(pag); xfs_iwalk_free(&iwag); return error; } From 7f8d3b3ca6fe9269b3c5deee0dcea38499288e06 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 026/102] xfs: convert secondary superblock walk to use perags Clean up the last external manual AG walk. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 99dc905b4f89..04f5386446db 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -24,6 +24,7 @@ #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" +#include "xfs_ag.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
@@ -855,17 +856,18 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - xfs_agnumber_t agno; + struct xfs_perag *pag; + xfs_agnumber_t agno = 1; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -877,7 +879,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - agno); + pag->pag_agno); if (!saved_error) saved_error = error; continue; @@ -898,7 +900,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, agno); + error, pag->pag_agno); if (!saved_error) saved_error = error; continue; From 45d0662117565e6100f9e0cf356cd873542c95b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 027/102] xfs: pass perags through to the busy extent code All of the callers of the busy extent API have perag references available to use, so we can pass a perag to the busy extent functions rather than having them do unnecessary lookups. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_alloc.c | 37 +++++++++++++++++---------------- fs/xfs/libxfs/xfs_alloc.h | 2 +- fs/xfs/libxfs/xfs_alloc_btree.c | 5 ++--- fs/xfs/libxfs/xfs_rmap.c | 32 ++++++++++++++++------------ fs/xfs/libxfs/xfs_rmap_btree.c | 7 +++---- fs/xfs/scrub/repair.c | 4 ++-- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_extent_busy.c | 26 +++++++---------------- fs/xfs/xfs_extent_busy.h | 7 ++++--- 9 files changed, 58 insertions(+), 64 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index dc2b77829915..ce31c00dbf6f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1063,7 +1063,7 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { @@ -1178,7 +1178,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_extent_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->pag, args->agbno, args->len)); } @@ -3292,7 +3292,7 @@ error0: int xfs_free_extent_fix_freelist( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_buf **agbp) { struct xfs_alloc_arg args; @@ -3301,7 +3301,8 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = agno; + args.agno = pag->pag_agno; + args.pag = pag; /* * validate that the block number is legal - the enables us to detect @@ -3310,17 +3311,12 @@ xfs_free_extent_fix_freelist( if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto out; + return error; *agbp = args.agbp; -out: - xfs_perag_put(args.pag); - return error; + return 0; } /* @@ -3344,6 +3340,7 @@ __xfs_free_extent( struct xfs_agf 
*agf; int error; unsigned int busy_flags = 0; + struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3352,33 +3349,37 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; + goto err; agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { error = -EFSCORRUPTED; - goto err; + goto err_release; } /* validate the extent size is legal now we have the agf locked */ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; - goto err; + goto err_release; } error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) - goto err; + goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_perag_put(pag); return 0; -err: +err_release: xfs_trans_brelse(tp, agbp); +err: + xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a4427c5775c2..e30900b6f8ba 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -214,7 +214,7 @@ int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); -int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a540b6e799e0..19fdf87e86b9 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ 
b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -72,7 +72,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false); new->s = cpu_to_be32(bno); @@ -86,7 +86,6 @@ xfs_allocbt_free_block( struct xfs_buf *bp) { struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; @@ -96,7 +95,7 @@ xfs_allocbt_free_block( return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 61e8f10436ac..1d0a6b686eea 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -11,6 +11,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_sb.h" #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_trans.h" @@ -2363,31 +2364,32 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; struct xfs_owner_info oinfo; xfs_agblock_t bno; bool unwritten; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); - trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork, + trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, startoff, blockcount, state); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_RMAP_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } + /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one 
now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.agno != agno) { + if (rcur != NULL && rcur->bc_ag.agno != pag->pag_agno) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2398,13 +2400,15 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. */ - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) - return -EFSCORRUPTED; + goto out_drop; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + error = -EFSCORRUPTED; + goto out_drop; + } - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag->pag_agno); } *pcur = rcur; @@ -2442,6 +2446,8 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } +out_drop: + xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f1fee42dda2d..46a5295ecf35 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -100,8 +100,7 @@ xfs_rmapbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, - false); + xfs_extent_busy_reuse(cur->bc_mp, agbp->b_pag, bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); @@ -133,10 +132,10 @@ xfs_rmapbt_free_block( if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + pag = cur->bc_ag.agbp->b_pag; + xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - pag = cur->bc_ag.agbp->b_pag; xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1308b62a8170..6b62872c4d10 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -304,7 +304,7 @@ xrep_alloc_ag_block( return error; if (bno == NULLAGBLOCK) return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.agno, 
bno, + xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); if (resv == XFS_AG_RESV_RMAPBT) @@ -519,7 +519,7 @@ xrep_put_freelist( agbno, 0); if (error) return error; - xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1, + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 3bf6dba1a040..972864250bd2 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -108,7 +108,7 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_extent_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, pag, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 422667e0668b..ad22a003f959 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -22,28 +22,26 @@ void xfs_extent_busy_insert( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; - struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent = NULL; new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); - new->agno = agno; + new->agno = pag->pag_agno; new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); - pag = xfs_perag_get(tp->t_mountp, new->agno); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; while (*rbp) { @@ -66,7 +64,6 @@ xfs_extent_busy_insert( list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); } /* @@ -81,21 +78,17 @@ xfs_extent_busy_insert( int 
xfs_extent_busy_search( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len) { - struct xfs_perag *pag; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - - rbp = pag->pagb_tree.rb_node; - /* find closest start bno overlap */ + spin_lock(&pag->pagb_lock); + rbp = pag->pagb_tree.rb_node; while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { @@ -115,7 +108,6 @@ xfs_extent_busy_search( } } spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); return match; } @@ -281,17 +273,14 @@ out_force_log: void xfs_extent_busy_reuse( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { - struct xfs_perag *pag; struct rb_node *rbp; ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); restart: rbp = pag->pagb_tree.rb_node; @@ -314,7 +303,6 @@ restart: goto restart; } spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); } /* diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 8aea07100092..4a118131059f 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -9,6 +9,7 @@ #define __XFS_EXTENT_BUSY_H__ struct xfs_mount; +struct xfs_perag; struct xfs_trans; struct xfs_alloc_arg; @@ -31,7 +32,7 @@ struct xfs_extent_busy { }; void -xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, +xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void @@ -39,11 +40,11 @@ xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); int -xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, +xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, 
+xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); bool From 30933120ad79f4549d6e364df7eda474cc0d9c65 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 028/102] xfs: push perags through the ag reservation callouts We currently pass an agno from the AG reservation functions to the individual feature accounting functions, which in future may have to do perag lookups to access per-AG state. Instead, pre-emptively plumb the perag through from the highest AG reservation layer to the feature callouts so they won't have to look it up again. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ag_resv.c | 9 ++++----- fs/xfs/libxfs/xfs_ialloc_btree.c | 17 +++++++++-------- fs/xfs/libxfs/xfs_ialloc_btree.h | 2 +- fs/xfs/libxfs/xfs_refcount_btree.c | 7 +++---- fs/xfs/libxfs/xfs_refcount_btree.h | 3 ++- fs/xfs/libxfs/xfs_rmap_btree.c | 6 +++--- fs/xfs/libxfs/xfs_rmap_btree.h | 2 +- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 2e3dcdfd4984..f7394a8ecf6b 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -250,7 +250,6 @@ xfs_ag_resv_init( struct xfs_trans *tp) { struct xfs_mount *mp = pag->pag_mount; - xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; @@ -260,11 +259,11 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; - error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -282,7 +281,7 @@ xfs_ag_resv_init( mp->m_finobt_nores = true; - error = xfs_refcountbt_calc_reserves(mp, tp, agno, 
&ask, + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -300,7 +299,7 @@ xfs_ag_resv_init( if (pag->pag_rmapbt_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 4c5831646bd9..4ec8ea1331a5 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" +#include "xfs_ag.h" STATIC int xfs_inobt_get_minrecs( @@ -680,7 +681,7 @@ static int xfs_inobt_count_blocks( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { @@ -688,7 +689,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp); + error = xfs_inobt_cur(mp, tp, pag->pag_agno, btnum, &cur, &agbp); if (error) return error; @@ -704,14 +705,14 @@ static int xfs_finobt_read_blocks( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) return error; @@ -728,7 +729,7 @@ int xfs_finobt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -739,14 +740,14 @@ xfs_finobt_calc_reserves( return 0; if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) - error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); else - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + error = xfs_inobt_count_blocks(mp, tp, pag, 
XFS_BTNUM_FINO, &tree_len); if (error) return error; - *ask += xfs_inobt_max_size(mp, agno); + *ask += xfs_inobt_max_size(mp, pag->pag_agno); *used += tree_len; return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 35bbd978c272..d5afe01fb2de 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -64,7 +64,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #endif /* DEBUG */ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index b281f0c674f5..c4ddf9ded00b 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -450,7 +450,7 @@ int xfs_refcountbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -463,8 +463,7 @@ xfs_refcountbt_calc_reserves( if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) return error; @@ -479,7 +478,7 @@ xfs_refcountbt_calc_reserves( * expansion. We therefore can pretend the space isn't there. 
*/ if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 69dc515db671..eab1b0c672c0 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; struct xbtree_afakeroot; /* @@ -58,7 +59,7 @@ extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, - struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, + struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 46a5295ecf35..ba2f7064451b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -595,7 +595,7 @@ int xfs_rmapbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -608,7 +608,7 @@ xfs_rmapbt_calc_reserves( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) return error; @@ -623,7 +623,7 @@ xfs_rmapbt_calc_reserves( * expansion. We therefore can pretend the space isn't there. */ if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. 
*/ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 115c3455a734..57fab72e26ad 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -57,6 +57,6 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); #endif /* __XFS_RMAP_BTREE_H__ */ From 58d43a7e3263766ade4974c86118e6b5737ea259 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 029/102] xfs: pass perags around in fsmap data dev functions Needs a [from, to] ranged AG walk, and the perag to be stuffed into the info structure for callouts to use. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.h | 15 +++++++-- fs/xfs/xfs_fsmap.c | 75 ++++++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index f87a60a4a849..ebf997a8684e 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -116,14 +116,25 @@ void xfs_perag_put(struct xfs_perag *pag); /* * Perag iteration APIs + * + * XXX: for_each_perag_range() usage really needs an iterator to clean up when + * we terminate at end_agno because we may have taken a reference to the perag + * beyond end_agno. Right now callers have to be careful to catch and clean that + * up themselves. This is not necessary for the callers of for_each_perag() and + * for_each_perag_from() because they terminate at sb_agcount where there are + * no perag structures in tree beyond end_agno. 
*/ -#define for_each_perag_from(mp, next_agno, pag) \ +#define for_each_perag_range(mp, next_agno, end_agno, pag) \ for ((pag) = xfs_perag_get((mp), (next_agno)); \ - (pag) != NULL; \ + (pag) != NULL && (next_agno) <= (end_agno); \ (next_agno) = (pag)->pag_agno + 1, \ xfs_perag_put(pag), \ (pag) = xfs_perag_get((mp), (next_agno))) +#define for_each_perag_from(mp, next_agno, pag) \ + for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag)) + + #define for_each_perag(mp, agno, pag) \ (agno) = 0; \ for_each_perag_from((mp), (agno), (pag)) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 34f2b971ce43..835dd6e3819b 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -24,6 +24,7 @@ #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" #include "xfs_rtalloc.h" +#include "xfs_ag.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -157,10 +158,10 @@ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ + struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - xfs_agnumber_t agno; /* AG number, if applicable */ struct xfs_rmap_irec low; /* low rmap key */ struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ @@ -203,14 +204,14 @@ xfs_getfsmap_is_shared( *stat = false; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - /* rt files will have agno set to NULLAGNUMBER */ - if (info->agno == NULLAGNUMBER) + /* rt files will have no perag structure */ + if (!info->pag) return 0; /* Are there any shared blocks here? 
*/ flen = 0; cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - info->agno); + info->pag->pag_agno); error = xfs_refcount_find_shared(cur, rec->rm_startblock, rec->rm_blockcount, &fbno, &flen, false); @@ -311,7 +312,8 @@ xfs_getfsmap_helper( if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; - trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); + trace_xfs_fsmap_mapping(mp, info->dev, + info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec); fmr.fmr_device = info->dev; fmr.fmr_physical = rec_daddr; @@ -429,8 +431,8 @@ xfs_getfsmap_logdev( info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; info->missing_owner = XFS_FMR_OWN_FREE; - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); if (keys[0].fmr_physical > 0) return 0; @@ -508,8 +510,8 @@ __xfs_getfsmap_rtdev( info->high.rm_blockcount = 0; xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); return query_fn(tp, info); } @@ -572,6 +574,7 @@ __xfs_getfsmap_datadev( void *priv) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_btree_cur *bt_cur = NULL; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; @@ -610,20 +613,20 @@ __xfs_getfsmap_datadev( start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); - /* Query each AG */ - for (info->agno = start_ag; info->agno <= end_ag; info->agno++) { + for_each_perag_range(mp, start_ag, end_ag, pag) { /* * Set the AG high key from the fsmap high key if this * is the last AG that we're querying. 
*/ - if (info->agno == end_ag) { + info->pag = pag; + if (pag->pag_agno == end_ag) { info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsb); info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); if (error) - goto err; + break; xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); } @@ -634,38 +637,45 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } - error = xfs_alloc_read_agf(mp, tp, info->agno, 0, + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &info->agf_bp); if (error) - goto err; + break; - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, + trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno, + &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno, &info->high); error = query_fn(tp, info, &bt_cur, priv); if (error) - goto err; + break; /* * Set the AG low key to the start of the AG prior to * moving on to the next AG. */ - if (info->agno == start_ag) { + if (pag->pag_agno == start_ag) { info->low.rm_startblock = 0; info->low.rm_owner = 0; info->low.rm_offset = 0; info->low.rm_flags = 0; } + + /* + * If this is the last AG, report any gap at the end of it + * before we drop the reference to the perag when the loop + * terminates. + */ + if (pag->pag_agno == end_ag) { + info->last = true; + error = query_fn(tp, info, &bt_cur, priv); + if (error) + break; + } + info->pag = NULL; } - /* Report any gap at the end of the AG */ - info->last = true; - error = query_fn(tp, info, &bt_cur, priv); - if (error) - goto err; - -err: if (bt_cur) xfs_btree_del_cursor(bt_cur, error < 0 ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); @@ -673,6 +683,13 @@ err: xfs_trans_brelse(tp, info->agf_bp); info->agf_bp = NULL; } + if (info->pag) { + xfs_perag_put(info->pag); + info->pag = NULL; + } else if (pag) { + /* loop termination case */ + xfs_perag_put(pag); + } return error; } @@ -691,7 +708,7 @@ xfs_getfsmap_datadev_rmapbt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->agno); + info->pag->pag_agno); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_datadev_helper, info); } @@ -724,7 +741,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->agno, XFS_BTNUM_BNO); + info->pag->pag_agno, XFS_BTNUM_BNO); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], @@ -937,7 +954,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; - info.agno = NULLAGNUMBER; + info.pag = NULL; error = handlers[i].fn(tp, dkeys, &info); if (error) break; From be9fb17d88f08af648a89784d30dbac83d893154 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 030/102] xfs: add a perag to the btree cursor Which will eventually completely replace the agno in it. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.c | 25 +++++++++++++++---------- fs/xfs/libxfs/xfs_alloc_btree.c | 13 ++++++++++--- fs/xfs/libxfs/xfs_alloc_btree.h | 3 ++- fs/xfs/libxfs/xfs_btree.c | 2 ++ fs/xfs/libxfs/xfs_btree.h | 11 ++++++++++- fs/xfs/libxfs/xfs_ialloc.c | 16 ++++++++-------- fs/xfs/libxfs/xfs_ialloc_btree.c | 15 +++++++++++---- fs/xfs/libxfs/xfs_ialloc_btree.h | 7 ++++--- fs/xfs/libxfs/xfs_refcount.c | 4 ++-- fs/xfs/libxfs/xfs_refcount_btree.c | 17 ++++++++++++----- fs/xfs/libxfs/xfs_refcount_btree.h | 2 +- fs/xfs/libxfs/xfs_rmap.c | 6 +++--- fs/xfs/libxfs/xfs_rmap_btree.c | 17 ++++++++++++----- fs/xfs/libxfs/xfs_rmap_btree.h | 2 +- fs/xfs/scrub/agheader_repair.c | 20 +++++++++++--------- fs/xfs/scrub/bmap.c | 2 +- fs/xfs/scrub/common.c | 12 ++++++------ fs/xfs/scrub/repair.c | 5 +++-- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_fsmap.c | 6 +++--- fs/xfs/xfs_reflink.c | 2 +- 21 files changed, 119 insertions(+), 70 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ce31c00dbf6f..7ec4af6bf494 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -776,7 +776,8 @@ xfs_alloc_cur_setup( */ if (!acur->cnt) acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_CNT); + args->agbp, args->agno, + args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -786,10 +787,12 @@ xfs_alloc_cur_setup( */ if (!acur->bnolt) acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->agno, + args->pag, XFS_BTNUM_BNO); if (!acur->bnogt) acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->agno, + args->pag, XFS_BTNUM_BNO); return i == 1 ? 0 : -ENOSPC; } @@ -1217,7 +1220,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-number freespace btree. 
*/ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->agno, args->pag, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1277,7 +1280,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); + args->agno, args->pag, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); @@ -1674,7 +1677,7 @@ restart: * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); + args->agno, args->pag, XFS_BTNUM_CNT); bno_cur = NULL; busy = false; @@ -1837,7 +1840,7 @@ restart: * Allocate and initialize a cursor for the by-block tree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->agno, args->pag, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1909,7 +1912,8 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, + NULL, XFS_BTNUM_BNO); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1979,7 +1983,8 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, + NULL, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. 
@@ -2490,7 +2495,7 @@ xfs_exact_minlen_extent_available( int error = 0; cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->agno, XFS_BTNUM_CNT); + args->agno, args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 19fdf87e86b9..a52ab25bbf0b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -27,7 +27,7 @@ xfs_allocbt_dup_cursor( { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, cur->bc_ag.agno, - cur->bc_btnum); + cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -473,6 +473,7 @@ xfs_allocbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; @@ -497,6 +498,11 @@ xfs_allocbt_init_common( cur->bc_ag.agno = agno; cur->bc_ag.abt.active = false; + if (pag) { + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + } + cur->bc_ag.pag = pag; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -513,12 +519,13 @@ xfs_allocbt_init_cursor( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agf structure */ xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* btree identifier */ { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, tp, agno, btnum); + cur = xfs_allocbt_init_common(mp, tp, agno, pag, btnum); if (btnum == XFS_BTNUM_CNT) cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); else @@ -539,7 +546,7 @@ xfs_allocbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, agno, btnum); + cur = xfs_allocbt_init_common(mp, NULL, agno, NULL, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h 
b/fs/xfs/libxfs/xfs_alloc_btree.h index a5b998e950fe..a10cedba18d8 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; struct xbtree_afakeroot; /* @@ -48,7 +49,7 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, - xfs_agnumber_t, xfs_btnum_t); + xfs_agnumber_t, struct xfs_perag *pag, xfs_btnum_t); struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, struct xbtree_afakeroot *afake, xfs_agnumber_t agno, xfs_btnum_t btnum); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0f12b885600d..44044317c0fb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -377,6 +377,8 @@ xfs_btree_del_cursor( XFS_FORCED_SHUTDOWN(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); + if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) + xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(xfs_btree_cur_zone, cur); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 10e50cbacacf..e71f33f1f111 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -11,6 +11,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_ifork; +struct xfs_perag; extern kmem_zone_t *xfs_btree_cur_zone; @@ -180,11 +181,12 @@ union xfs_btree_irec { /* Per-AG btree information. 
*/ struct xfs_btree_cur_ag { + xfs_agnumber_t agno; + struct xfs_perag *pag; union { struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ }; - xfs_agnumber_t agno; union { struct { unsigned long nr_ops; /* # record updates */ @@ -231,6 +233,13 @@ typedef struct xfs_btree_cur uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ + + /* + * Short btree pointers need an agno to be able to turn the pointers + * into physical addresses for IO, so the btree cursor switches between + * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the + * cursor. + */ union { struct xfs_btree_cur_ag bc_ag; struct xfs_btree_cur_ino bc_ino; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 8dc9225a5353..905872bab426 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -183,7 +183,7 @@ xfs_inobt_insert( int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, btnum); for (thisino = newino; thisino < newino + newlen; @@ -531,7 +531,7 @@ xfs_inobt_insert_sprec( int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -1145,7 +1145,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. 
@@ -1598,7 +1598,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_FINO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1641,7 +1641,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); error = xfs_check_agi_freecount(icur, agi); if (error) @@ -1954,7 +1954,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -2080,7 +2080,7 @@ xfs_difree_finobt( int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2281,7 +2281,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 4ec8ea1331a5..6c4efdf01674 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -36,7 +36,7 @@ xfs_inobt_dup_cursor( { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, cur->bc_ag.agno, - cur->bc_btnum); + cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -429,6 +429,7 @@ xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_btree_cur *cur; @@ -451,6 +452,11 @@ xfs_inobt_init_common( cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_ag.agno = agno; + if (pag) { + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + } + cur->bc_ag.pag = pag; return cur; } @@ -461,12 +467,13 @@ xfs_inobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(mp, tp, agno, btnum); + cur = xfs_inobt_init_common(mp, tp, agno, pag, btnum); if (btnum == XFS_BTNUM_INO) cur->bc_nlevels = be32_to_cpu(agi->agi_level); else @@ -485,7 +492,7 @@ xfs_inobt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + cur = xfs_inobt_init_common(mp, NULL, agno, NULL, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -672,7 +679,7 @@ xfs_inobt_cur( if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which); + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, NULL, which); *curpp = cur; 
return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d5afe01fb2de..04dfa7eee81f 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; /* * Btree block header size depends on a superblock flag. @@ -45,9 +46,9 @@ struct xfs_mount; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, - xfs_btnum_t); +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum); struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, struct xbtree_afakeroot *afake, xfs_agnumber_t agno, xfs_btnum_t btnum); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 2037b9f23069..1c2bd2949d7d 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1178,7 +1178,7 @@ xfs_refcount_finish_one( if (error) return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1707,7 +1707,7 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. 
*/ memset(&low, 0, sizeof(low)); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index c4ddf9ded00b..74f8ac0209f1 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -26,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno); + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_ag.pag); } STATIC void @@ -316,7 +316,8 @@ static struct xfs_btree_cur * xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; @@ -332,6 +333,11 @@ xfs_refcountbt_init_common( cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + if (pag) { + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + } + cur->bc_ag.pag = pag; cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; @@ -345,12 +351,13 @@ xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, tp, agno); + cur = xfs_refcountbt_init_common(mp, tp, agno, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); cur->bc_ag.agbp = agbp; return cur; @@ -365,7 +372,7 @@ xfs_refcountbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, NULL, agno); + cur = xfs_refcountbt_init_common(mp, NULL, agno, NULL); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index eab1b0c672c0..8b82a39f104a 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -47,7 +47,7 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount 
*mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno); + xfs_agnumber_t agno, struct xfs_perag *pag); struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, struct xbtree_afakeroot *afake, xfs_agnumber_t agno); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 1d0a6b686eea..0d7a6997120c 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -708,7 +708,7 @@ xfs_rmap_free( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno, NULL); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); @@ -962,7 +962,7 @@ xfs_rmap_alloc( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno, NULL); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -2408,7 +2408,7 @@ xfs_rmap_finish_one( goto out_drop; } - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag->pag_agno); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag->pag_agno, pag); } *pcur = rcur; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index ba2f7064451b..7bef8feeded1 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -52,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno); + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_ag.pag); } STATIC void @@ -449,7 +449,8 @@ static struct xfs_btree_cur * xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; @@ -463,6 +464,11 @@ xfs_rmapbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; 
+ if (pag) { + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + } + cur->bc_ag.pag = pag; return cur; } @@ -473,12 +479,13 @@ xfs_rmapbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, tp, agno); + cur = xfs_rmapbt_init_common(mp, tp, agno, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); cur->bc_ag.agbp = agbp; return cur; @@ -493,7 +500,7 @@ xfs_rmapbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, NULL, agno); + cur = xfs_rmapbt_init_common(mp, NULL, agno, NULL); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 57fab72e26ad..c94f418cc06b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -43,7 +43,7 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - xfs_agnumber_t agno); + xfs_agnumber_t agno, struct xfs_perag *pag); struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, struct xbtree_afakeroot *afake, xfs_agnumber_t agno); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 1cdfbd57f36b..5dd91bf04c18 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -247,7 +247,7 @@ xrep_agf_calc_from_btrees( /* Update the AGF counters from the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_BNO); + sc->sa.pag, XFS_BTNUM_BNO); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -261,7 +261,7 @@ xrep_agf_calc_from_btrees( /* Update the AGF counters from the cntbt. 
*/ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_CNT); + sc->sa.pag, XFS_BTNUM_CNT); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -269,7 +269,8 @@ xrep_agf_calc_from_btrees( btreeblks += blocks - 1; /* Update the AGF counters from the rmapbt. */ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -282,7 +283,7 @@ xrep_agf_calc_from_btrees( /* Update the AGF counters from the refcountbt. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.agno); + sc->sa.agno, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -490,7 +491,8 @@ xrep_agfl_collect_blocks( xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); if (error) goto err; @@ -498,7 +500,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_BNO); + sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -506,7 +508,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. 
*/ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_CNT); + sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -807,7 +809,7 @@ xrep_agi_calc_from_btrees( int error; cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, - XFS_BTNUM_INO); + sc->sa.pag, XFS_BTNUM_INO); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -829,7 +831,7 @@ xrep_agi_calc_from_btrees( xfs_agblock_t blocks; cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, - XFS_BTNUM_FINO); + sc->sa.pag, XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e457c086887f..28fd4b961bb4 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -556,7 +556,7 @@ xchk_bmap_check_ag_rmaps( if (error) return error; - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno, NULL); sbcri.sc = sc; sbcri.whichfork = whichfork; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index c8da976b50fc..50768559fb60 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -465,42 +465,42 @@ xchk_ag_btcur_init( xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, XFS_BTNUM_BNO); + agno, sa->pag, XFS_BTNUM_BNO); } if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, XFS_BTNUM_CNT); + agno, sa->pag, XFS_BTNUM_CNT); } /* Set up a inobt cursor for cross-referencing. 
*/ if (sa->agi_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, XFS_BTNUM_INO); + agno, sa->pag, XFS_BTNUM_INO); } /* Set up a finobt cursor for cross-referencing. */ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, XFS_BTNUM_FINO); + agno, sa->pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno); + agno, sa->pag); } /* Set up a refcountbt cursor for cross-referencing. */ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, agno); + sa->agf_bp, agno, sa->pag); } } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 6b62872c4d10..862dc56fd8cd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -555,7 +555,7 @@ xrep_reap_block( } else { agf_bp = sc->sa.agf_bp; } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno, sc->sa.pag); /* Can we find any other rmappings? 
*/ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); @@ -892,7 +892,8 @@ xrep_find_ag_btree_roots( fab->height = 0; } - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 972864250bd2..311ebaad4f5a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,7 +50,7 @@ xfs_trim_extents( goto out_put_perag; agf = agbp->b_addr; - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, pag, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 835dd6e3819b..b654a2bf9a9f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -211,7 +211,7 @@ xfs_getfsmap_is_shared( /* Are there any shared blocks here? */ flen = 0; cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - info->pag->pag_agno); + info->pag->pag_agno, info->pag); error = xfs_refcount_find_shared(cur, rec->rm_startblock, rec->rm_blockcount, &fbno, &flen, false); @@ -708,7 +708,7 @@ xfs_getfsmap_datadev_rmapbt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag->pag_agno); + info->pag->pag_agno, info->pag); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_datadev_helper, info); } @@ -741,7 +741,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. 
*/ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag->pag_agno, XFS_BTNUM_BNO); + info->pag->pag_agno, info->pag, XFS_BTNUM_BNO); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0e430b0c1b16..28ffe1817f9b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -144,7 +144,7 @@ xfs_reflink_find_shared( if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); From fa9c3c197329fdab0efc48a8944d2c4a21c6a74f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 031/102] xfs: convert rmap btree cursor to using a perag Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 7 ++++--- fs/xfs/libxfs/xfs_rmap.c | 10 ++++----- fs/xfs/libxfs/xfs_rmap.h | 6 ++++-- fs/xfs/libxfs/xfs_rmap_btree.c | 37 +++++++++++++++------------------- fs/xfs/libxfs/xfs_rmap_btree.h | 4 ++-- fs/xfs/scrub/agheader_repair.c | 6 ++---- fs/xfs/scrub/bmap.c | 2 +- fs/xfs/scrub/common.c | 2 +- fs/xfs/scrub/repair.c | 10 ++++----- fs/xfs/xfs_fsmap.c | 2 +- 11 files changed, 42 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 97fb160e01de..0e0819f6fb89 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -916,7 +916,7 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. 
*/ - error = xfs_rmap_free(tp, bp, id->agno, + error = xfs_rmap_free(tp, bp, bp->b_pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7ec4af6bf494..10747cc4d8f6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1092,7 +1092,7 @@ xfs_alloc_ag_vextent_small( * If we're feeding an AGFL block to something that doesn't live in the * free space, we need to clear out the OWN_AG rmap. */ - error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, + error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, &XFS_RMAP_OINFO_AG); if (error) goto error; @@ -1169,7 +1169,7 @@ xfs_alloc_ag_vextent( /* if not file data, insert new block into the reverse map btree */ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, args->agbno, args->len, &args->oinfo); if (error) return error; @@ -1899,12 +1899,13 @@ xfs_free_ag_extent( int haveright; /* have a right neighbor */ int i; int error; + struct xfs_perag *pag = agbp->b_pag; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; if (!xfs_rmap_should_skip_owner_update(oinfo)) { - error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); if (error) goto error0; } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 0d7a6997120c..b23f949ee15c 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -696,7 +696,7 @@ int xfs_rmap_free( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -708,7 +708,7 @@ xfs_rmap_free( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno, NULL); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = 
xfs_rmap_unmap(cur, bno, len, false, oinfo); @@ -950,7 +950,7 @@ int xfs_rmap_alloc( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -962,7 +962,7 @@ xfs_rmap_alloc( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno, NULL); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -2408,7 +2408,7 @@ xfs_rmap_finish_one( goto out_drop; } - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag->pag_agno, pag); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); } *pcur = rcur; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index abe633403fd1..f2423cf7f1e2 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -6,6 +6,8 @@ #ifndef __XFS_RMAP_H__ #define __XFS_RMAP_H__ +struct xfs_perag; + static inline void xfs_rmap_ino_bmbt_owner( struct xfs_owner_info *oi, @@ -113,10 +115,10 @@ xfs_owner_info_pack( } int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 7bef8feeded1..cafe181bc92d 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -52,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_ag.pag); + cur->bc_ag.agbp, 
cur->bc_ag.pag); } STATIC void @@ -64,13 +64,12 @@ xfs_rmapbt_set_root( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -84,6 +83,7 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = cur->bc_ag.pag; int error; xfs_agblock_t bno; @@ -93,20 +93,19 @@ xfs_rmapbt_alloc_block( if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, - bno, 1); + trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, agbp->b_pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); *stat = 1; return 0; @@ -119,12 +118,12 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -132,7 +131,6 @@ xfs_rmapbt_free_block( if (error) return error; - pag = cur->bc_ag.agbp->b_pag; xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); @@ -214,7 +212,7 @@ 
xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -449,7 +447,6 @@ static struct xfs_btree_cur * xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_perag *pag) { struct xfs_btree_cur *cur; @@ -462,13 +459,12 @@ xfs_rmapbt_init_common( cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; - if (pag) { - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - } + + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; + cur->bc_ag.agno = pag->pag_agno; return cur; } @@ -479,13 +475,12 @@ xfs_rmapbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, tp, agno, pag); + cur = xfs_rmapbt_init_common(mp, tp, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); cur->bc_ag.agbp = agbp; return cur; @@ -496,11 +491,11 @@ struct xfs_btree_cur * xfs_rmapbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, NULL, agno, NULL); + cur = xfs_rmapbt_init_common(mp, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index c94f418cc06b..88d8d18788a2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -43,9 +43,9 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, 
struct xfs_trans *tp, struct xfs_buf *bp, - xfs_agnumber_t agno, struct xfs_perag *pag); + struct xfs_perag *pag); struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno); + struct xbtree_afakeroot *afake, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 5dd91bf04c18..981c689e3d95 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -269,8 +269,7 @@ xrep_agf_calc_from_btrees( btreeblks += blocks - 1; /* Update the AGF counters from the rmapbt. */ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - sc->sa.pag); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -491,8 +490,7 @@ xrep_agfl_collect_blocks( xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. 
*/ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - sc->sa.pag); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); if (error) goto err; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 28fd4b961bb4..dbe7b65f8da1 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -556,7 +556,7 @@ xchk_bmap_check_ag_rmaps( if (error) return error; - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno, NULL); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, sc->sa.pag); sbcri.sc = sc; sbcri.whichfork = whichfork; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 50768559fb60..48381c1adeed 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -493,7 +493,7 @@ xchk_ag_btcur_init( if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, sa->pag); + sa->pag); } /* Set up a refcountbt cursor for cross-referencing. */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 862dc56fd8cd..5cf1c3707b6a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -509,7 +509,7 @@ xrep_put_freelist( * create an rmap for the block prior to merging it or else other * parts will break. */ - error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, &XFS_RMAP_OINFO_AG); if (error) return error; @@ -555,7 +555,7 @@ xrep_reap_block( } else { agf_bp = sc->sa.agf_bp; } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno, sc->sa.pag); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); /* Can we find any other rmappings? */ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); @@ -577,7 +577,8 @@ xrep_reap_block( * to run xfs_repair. 
*/ if (has_other_rmap) - error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo); + error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, + 1, oinfo); else if (resv == XFS_AG_RESV_AGFL) error = xrep_put_freelist(sc, agbno); else @@ -892,8 +893,7 @@ xrep_find_ag_btree_roots( fab->height = 0; } - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - sc->sa.pag); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index b654a2bf9a9f..7bfe9ea35de0 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -708,7 +708,7 @@ xfs_getfsmap_datadev_rmapbt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag->pag_agno, info->pag); + info->pag); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_datadev_helper, info); } From a81a06211fb43d80ee746e7a40a32ed812002f8e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 032/102] xfs: convert refcount btree cursor to use perags Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_refcount.c | 40 ++++++++++++++++-------------- fs/xfs/libxfs/xfs_refcount.h | 9 ++++++- fs/xfs/libxfs/xfs_refcount_btree.c | 22 +++++++--------- fs/xfs/libxfs/xfs_refcount_btree.h | 4 +-- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/bmap.c | 8 +++--- fs/xfs/scrub/common.c | 2 +- fs/xfs/xfs_fsmap.c | 3 +-- fs/xfs/xfs_reflink.c | 4 +-- 9 files changed, 50 insertions(+), 44 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 1c2bd2949d7d..fd2b9cd7ec66 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -22,6 +22,7 @@ #include "xfs_bit.h" #include "xfs_refcount.h" #include "xfs_rmap.h" +#include "xfs_ag.h" /* Allowable refcount adjustment amounts. */ enum xfs_refc_adjust_op { @@ -1142,30 +1143,30 @@ xfs_refcount_finish_one( struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; xfs_agblock_t bno; xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; + struct xfs_perag *pag; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), type, XFS_FSB_TO_AGBNO(mp, startblock), blockcount); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_REFCOUNT_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. 
*/ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.agno != agno) { + if (rcur != NULL && rcur->bc_ag.pag != pag) { nr_ops = rcur->bc_ag.refc.nr_ops; shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); @@ -1173,12 +1174,12 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) - return error; + goto out_drop; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1188,12 +1189,12 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1210,8 +1211,10 @@ xfs_refcount_finish_one( error = -EFSCORRUPTED; } if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, agno, type, + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, bno, blockcount, new_agbno, *new_len); +out_drop: + xfs_perag_put(pag); return error; } @@ -1672,7 +1675,7 @@ xfs_refcount_recover_extent( int xfs_refcount_recover_cow_leftovers( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_trans *tp; struct xfs_btree_cur *cur; @@ -1704,10 +1707,10 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, 
&agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. */ memset(&low, 0, sizeof(low)); @@ -1729,11 +1732,12 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + trace_xfs_refcount_recover_extent(mp, pag->pag_agno, + &rr->rr_rrec); /* Free the orphan record */ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 209795539c8d..9f6e9aae4da0 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -6,6 +6,13 @@ #ifndef __XFS_REFCOUNT_H__ #define __XFS_REFCOUNT_H__ +struct xfs_trans; +struct xfs_mount; +struct xfs_perag; +struct xfs_btree_cur; +struct xfs_bmbt_irec; +struct xfs_refcount_irec; + extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, @@ -50,7 +57,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - xfs_agnumber_t agno); + struct xfs_perag *pag); /* * While we're adjusting the refcounts records of an extent, we have diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 74f8ac0209f1..8f6577cb3475 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -26,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, 
cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_ag.pag); + cur->bc_ag.agbp, cur->bc_ag.pag); } STATIC void @@ -316,13 +316,11 @@ static struct xfs_btree_cur * xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_perag *pag) { struct xfs_btree_cur *cur; - ASSERT(agno != NULLAGNUMBER); - ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; @@ -331,13 +329,12 @@ xfs_refcountbt_init_common( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - if (pag) { - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - } + + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; + cur->bc_ag.agno = pag->pag_agno; cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; @@ -351,13 +348,12 @@ xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, tp, agno, pag); + cur = xfs_refcountbt_init_common(mp, tp, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); cur->bc_ag.agbp = agbp; return cur; @@ -368,11 +364,11 @@ struct xfs_btree_cur * xfs_refcountbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, NULL, agno, NULL); + cur = xfs_refcountbt_init_common(mp, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 8b82a39f104a..bd9ed9e1e41f 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ 
b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -47,9 +47,9 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, struct xfs_perag *pag); + struct xfs_perag *pag); struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno); + struct xbtree_afakeroot *afake, struct xfs_perag *pag); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 981c689e3d95..251410c19198 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -282,7 +282,7 @@ xrep_agf_calc_from_btrees( /* Update the AGF counters from the refcountbt. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.agno, sc->sa.pag); + sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index dbe7b65f8da1..864c107666d5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -545,18 +545,18 @@ STATIC int xchk_bmap_check_ag_rmaps( struct xfs_scrub *sc, int whichfork, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xchk_bmap_check_rmap_info sbcri; struct xfs_btree_cur *cur; struct xfs_buf *agf; int error; - error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf); + error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf); if (error) return error; - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, sc->sa.pag); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag); sbcri.sc = sc; sbcri.whichfork = whichfork; @@ -610,7 +610,7 @@ xchk_bmap_check_rmaps( return 0; for_each_perag(sc->mp, agno, pag) { - error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag->pag_agno); + error = xchk_bmap_check_ag_rmaps(sc, 
whichfork, pag); if (error) break; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 48381c1adeed..cc7688ce79b2 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -500,7 +500,7 @@ xchk_ag_btcur_init( if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, agno, sa->pag); + sa->agf_bp, sa->pag); } } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 7bfe9ea35de0..623cabaeafee 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -210,8 +210,7 @@ xfs_getfsmap_is_shared( /* Are there any shared blocks here? */ flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - info->pag->pag_agno, info->pag); + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag); error = xfs_refcount_find_shared(cur, rec->rm_startblock, rec->rm_blockcount, &fbno, &flen, false); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 28ffe1817f9b..c256104772cb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -144,7 +144,7 @@ xfs_reflink_find_shared( if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -763,7 +763,7 @@ xfs_reflink_recover_cow( return 0; for_each_perag(mp, agno, pag) { - error = xfs_refcount_recover_cow_leftovers(mp, pag->pag_agno); + error = xfs_refcount_recover_cow_leftovers(mp, pag); if (error) { xfs_perag_put(pag); break; From 289d38d22cd88960cb648dc480c50de5102519bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 033/102] xfs: convert allocbt cursors to use perags Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_alloc.c | 25 ++++++++++--------------- fs/xfs/libxfs/xfs_alloc_btree.c | 26 ++++++++++---------------- fs/xfs/libxfs/xfs_alloc_btree.h | 8 ++++---- fs/xfs/scrub/agheader_repair.c | 8 ++++---- fs/xfs/scrub/common.c | 4 ++-- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_fsmap.c | 2 +- 7 files changed, 32 insertions(+), 43 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 10747cc4d8f6..c99a80286efa 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -776,8 +776,7 @@ xfs_alloc_cur_setup( */ if (!acur->cnt) acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, - args->pag, XFS_BTNUM_CNT); + args->agbp, args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -787,12 +786,10 @@ xfs_alloc_cur_setup( */ if (!acur->bnolt) acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, - args->pag, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); if (!acur->bnogt) acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, - args->pag, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); return i == 1 ? 0 : -ENOSPC; } @@ -1220,7 +1217,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, args->pag, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1280,7 +1277,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-size btree. 
*/ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, args->pag, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); @@ -1677,7 +1674,7 @@ restart: * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, args->pag, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); bno_cur = NULL; busy = false; @@ -1840,7 +1837,7 @@ restart: * Allocate and initialize a cursor for the by-block tree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, args->pag, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1913,8 +1910,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, - NULL, XFS_BTNUM_BNO); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1984,8 +1980,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, - NULL, XFS_BTNUM_CNT); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. 
@@ -2496,7 +2491,7 @@ xfs_exact_minlen_extent_available( int error = 0; cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->agno, args->pag, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a52ab25bbf0b..0c2e4cff4ee3 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -26,8 +26,7 @@ xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, - cur->bc_ag.pag, cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -39,13 +38,12 @@ xfs_allocbt_set_root( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -224,7 +222,7 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -472,7 +470,6 @@ STATIC struct xfs_btree_cur * xfs_allocbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_perag *pag, xfs_btnum_t btnum) { @@ -486,6 +483,7 @@ xfs_allocbt_init_common( cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ag.abt.active = false; if (btnum == XFS_BTNUM_CNT) { cur->bc_ops = &xfs_cntbt_ops; @@ -496,13 +494,10 @@ xfs_allocbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_ag.agno = agno; - cur->bc_ag.abt.active = 
false; - if (pag) { - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - } + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; + cur->bc_ag.agno = pag->pag_agno; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -518,14 +513,13 @@ xfs_allocbt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ struct xfs_perag *pag, xfs_btnum_t btnum) /* btree identifier */ { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, tp, agno, pag, btnum); + cur = xfs_allocbt_init_common(mp, tp, pag, btnum); if (btnum == XFS_BTNUM_CNT) cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); else @@ -541,12 +535,12 @@ struct xfs_btree_cur * xfs_allocbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, agno, NULL, btnum); + cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index a10cedba18d8..9eb4c667a6b8 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -47,11 +47,11 @@ struct xbtree_afakeroot; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, - xfs_agnumber_t, struct xfs_perag *pag, xfs_btnum_t); +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag, xfs_btnum_t btnum); struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount 
*mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 251410c19198..ee2d85e3fd4a 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -246,7 +246,7 @@ xrep_agf_calc_from_btrees( int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) @@ -260,7 +260,7 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -497,7 +497,7 @@ xrep_agfl_collect_blocks( xfs_btree_del_cursor(cur, error); /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) @@ -505,7 +505,7 @@ xrep_agfl_collect_blocks( xfs_btree_del_cursor(cur, error); /* Find all blocks currently being used by the cntbt. 
*/ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index cc7688ce79b2..3035f8cee6f6 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -465,14 +465,14 @@ xchk_ag_btcur_init( xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, sa->pag, XFS_BTNUM_BNO); + sa->pag, XFS_BTNUM_BNO); } if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, sa->pag, XFS_BTNUM_CNT); + sa->pag, XFS_BTNUM_CNT); } /* Set up a inobt cursor for cross-referencing. */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 311ebaad4f5a..736df5660f1f 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,7 +50,7 @@ xfs_trim_extents( goto out_put_perag; agf = agbp->b_addr; - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, pag, XFS_BTNUM_CNT); + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 623cabaeafee..7501dd941a63 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -740,7 +740,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. 
*/ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag->pag_agno, info->pag, XFS_BTNUM_BNO); + info->pag, XFS_BTNUM_BNO); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], From 7b13c515518264df0cb90d84fdab907a627c0fa9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 034/102] xfs: use perag for ialloc btree cursors Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 177 ++++++++++++++++--------------- fs/xfs/libxfs/xfs_ialloc_btree.c | 27 ++--- fs/xfs/libxfs/xfs_ialloc_btree.h | 6 +- fs/xfs/scrub/agheader_repair.c | 4 +- fs/xfs/scrub/common.c | 5 +- fs/xfs/xfs_iwalk.c | 6 +- 6 files changed, 109 insertions(+), 116 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 905872bab426..e6f64d41e208 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -172,18 +172,17 @@ xfs_inobt_insert( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); for (thisino = newino; thisino < newino + newlen; @@ -520,18 +519,17 @@ xfs_inobt_insert_sprec( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. 
*/ bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -578,14 +576,14 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, nrec->ir_holemask); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, nrec->ir_holemask); error = xfs_inobt_rec_check_count(mp, nrec); @@ -613,21 +611,20 @@ error: STATIC int xfs_ialloc_ag_alloc( struct xfs_trans *tp, - struct xfs_buf *agbp) + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_agi *agi; struct xfs_alloc_arg args; - xfs_agnumber_t agno; int error; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe */ /* unit boundary */ /* init. 
to full chunk */ - uint16_t allocmask = (uint16_t) -1; struct xfs_inobt_rec_incore rec; - struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); + uint16_t allocmask = (uint16_t) -1; int do_sparse = 0; memset(&args, 0, sizeof(args)); @@ -660,14 +657,13 @@ xfs_ialloc_ag_alloc( */ agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); - agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; @@ -727,7 +723,7 @@ xfs_ialloc_ag_alloc( * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); /* * Allocate a fixed-size extent of inodes. */ @@ -748,7 +744,7 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; @@ -764,7 +760,7 @@ xfs_ialloc_ag_alloc( sparse_alloc: args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -809,7 +805,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. 
*/ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, args.agbno, args.len, prandom_u32()); if (error) @@ -836,12 +832,12 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, - &rec, true); + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, agno, + XFS_AGINO_TO_INO(args.mp, pag->pag_agno, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); @@ -861,21 +857,20 @@ sparse_alloc: * existing record with this one. */ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, - XFS_BTNUM_FINO, &rec, - false); + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_INO); if (error) return error; if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_FINO); if (error) return error; @@ -887,7 +882,6 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = agbp->b_pag; pag->pagi_freecount += newlen; pag->pagi_count += newlen; agi->agi_newino = cpu_to_be32(newino); @@ -1123,15 +1117,14 @@ STATIC int xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = 
agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag = agbp->b_pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1145,7 +1138,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1160,7 +1153,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == agno) { + if (pagno == pag->pag_agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1363,7 +1356,7 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1577,7 +1570,6 @@ xfs_dialloc_ag( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur; /* finobt cursor */ @@ -1587,9 +1579,10 @@ xfs_dialloc_ag( int error; int offset; int i; + struct xfs_perag *pag = agbp->b_pag; if (!xfs_sb_version_hasfinobt(&mp->m_sb)) - return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. 
@@ -1598,7 +1591,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1609,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (agno == pagno) + if (pag->pag_agno == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1621,7 +1614,7 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); /* * Modify or remove the finobt record. @@ -1641,7 +1634,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. 
*/ - icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); error = xfs_check_agi_freecount(icur, agi); if (error) @@ -1657,7 +1650,7 @@ xfs_dialloc_ag( */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - agbp->b_pag->pagi_freecount--; + pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); @@ -1809,7 +1802,7 @@ xfs_dialloc_select_ag( if (!okalloc) goto nextag_relse_buffer; - error = xfs_ialloc_ag_alloc(*tpp, agbp); + error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); if (error < 0) { xfs_trans_brelse(*tpp, agbp); @@ -1935,12 +1928,12 @@ xfs_difree_inobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -1954,7 +1947,7 @@ xfs_difree_inobt( /* * Initialize the cursor. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -2005,7 +1998,8 @@ xfs_difree_inobt( struct xfs_perag *pag = agbp->b_pag; xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2028,7 +2022,7 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, agno, &rec); + xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); } else { xic->deleted = false; @@ -2044,7 +2038,7 @@ xfs_difree_inobt( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - agbp->b_pag->pagi_freecount++; + pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } @@ -2069,18 +2063,18 @@ xfs_difree_finobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2188,16 +2182,15 @@ xfs_difree( xfs_agino_t agino; /* allocation group inode number */ xfs_agnumber_t agno; /* allocation group number */ int error; /* error return value */ - struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ - - mp = tp->t_mountp; + struct xfs_perag *pag; /* * Break up inode number into its components. 
*/ agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { + if (agno >= mp->m_sb.sb_agcount) { xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", __func__, agno, mp->m_sb.sb_agcount); ASSERT(0); @@ -2231,7 +2224,8 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, xic, &rec); + pag = agbp->b_pag; + error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); if (error) goto error0; @@ -2239,7 +2233,7 @@ xfs_difree( * Fix up the free inode btree. */ if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); if (error) goto error0; } @@ -2254,7 +2248,7 @@ STATIC int xfs_imap_lookup( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, @@ -2267,11 +2261,11 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, agno); + __func__, error, pag->pag_agno); return error; } @@ -2281,7 +2275,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, NULL, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2315,42 +2309,44 @@ xfs_imap_lookup( */ int xfs_imap( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t ino, /* inode to locate */ - struct xfs_imap *imap, /* location map structure */ - uint flags) /* flags for inode btree lookup */ + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ { - xfs_agblock_t agbno; /* block number of inode in the alloc group */ - xfs_agino_t agino; /* inode number within alloc group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agblock_t chunk_agbno; /* first block in inode chunk */ - xfs_agblock_t cluster_agbno; /* first block in inode cluster */ - int error; /* error code */ - int offset; /* index of inode in its buffer */ - xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. 
*/ - agno = XFS_INO_TO_AGNO(mp, ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (!pag || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + error = -EINVAL; #ifdef DEBUG /* * Don't output diagnostic information for untrusted inodes * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - return -EINVAL; - if (agno >= mp->m_sb.sb_agcount) { + goto out_drop; + if (!pag) { xfs_alert(mp, "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, agno, mp->m_sb.sb_agcount); + __func__, XFS_INO_TO_AGNO(mp, ino), + mp->m_sb.sb_agcount); } if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, @@ -2358,15 +2354,15 @@ xfs_imap( __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } - if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, agno, agino)); + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); } xfs_stack_trace(); #endif /* DEBUG */ - return -EINVAL; + goto out_drop; } /* @@ -2377,10 +2373,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. 
*/ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; goto out_map; } @@ -2392,11 +2388,12 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - return 0; + error = 0; + goto out_drop; } /* @@ -2408,10 +2405,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; } out_map: @@ -2422,7 +2419,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2439,9 +2436,13 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - return -EINVAL; + error = -EINVAL; + goto out_drop; } - return 0; + error = 0; +out_drop: + xfs_perag_put(pag); + return error; } /* diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 6c4efdf01674..450161b53648 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -35,8 +35,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, - 
cur->bc_ag.pag, cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -428,7 +427,6 @@ static struct xfs_btree_cur * xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ struct xfs_perag *pag, xfs_btnum_t btnum) /* ialloc or free ino btree */ { @@ -451,12 +449,10 @@ xfs_inobt_init_common( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.agno = agno; - if (pag) { - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - } + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; + cur->bc_ag.agno = pag->pag_agno; return cur; } @@ -466,14 +462,13 @@ xfs_inobt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(mp, tp, agno, pag, btnum); + cur = xfs_inobt_init_common(mp, tp, pag, btnum); if (btnum == XFS_BTNUM_INO) cur->bc_nlevels = be32_to_cpu(agi->agi_level); else @@ -487,12 +482,12 @@ struct xfs_btree_cur * xfs_inobt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(mp, NULL, agno, NULL, btnum); + cur = xfs_inobt_init_common(mp, NULL, pag, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -664,7 +659,7 @@ int xfs_inobt_cur( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t which, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp) @@ -675,11 +670,11 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp); if 
(error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, NULL, which); + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which); *curpp = cur; return 0; } @@ -696,7 +691,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, pag->pag_agno, btnum, &cur, &agbp); + error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 04dfa7eee81f..e530c82b2217 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -47,10 +47,10 @@ struct xfs_perag; ((index) - 1) * sizeof(xfs_inobt_ptr_t))) extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag, xfs_btnum_t btnum); struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); @@ -69,7 +69,7 @@ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_btnum_t btnum, + struct xfs_perag *pag, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index ee2d85e3fd4a..ecc9146647ba 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -806,7 +806,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + cur = 
xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.pag, XFS_BTNUM_INO); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) @@ -828,7 +828,7 @@ xrep_agi_calc_from_btrees( xfs_sb_version_hasinobtcounts(&mp->m_sb)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.pag, XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, &blocks); if (error) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 3035f8cee6f6..64c3b9b78d0d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -458,7 +458,6 @@ xchk_ag_btcur_init( struct xchk_ag *sa) { struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sa->agno; xchk_perag_get(sc->mp, sa); if (sa->agf_bp && @@ -479,14 +478,14 @@ xchk_ag_btcur_init( if (sa->agi_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, sa->pag, XFS_BTNUM_INO); + sa->pag, XFS_BTNUM_INO); } /* Set up a finobt cursor for cross-referencing. */ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, sa->pag, XFS_BTNUM_FINO); + sa->pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index c7e8f48a3ec4..917d51eefee3 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -272,8 +272,7 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(mp, tp, pag->pag_agno, XFS_BTNUM_INO, - curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -378,8 +377,7 @@ xfs_iwalk_run_callbacks( return 0; /* ...and recreate the cursor just past where we left off. 
*/ - error = xfs_inobt_cur(mp, tp, iwag->pag->pag_agno, XFS_BTNUM_INO, - curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; From 50f02fe3338d3fee6b298a1b262a4c562e7d84e0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 035/102] xfs: remove agno from btree cursor Now that everything passes a perag, the agno is not needed anymore. Convert all the users to use pag->pag_agno instead and remove the agno from the cursor. This was largely done as an automated search and replace. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_alloc_btree.c | 1 - fs/xfs/libxfs/xfs_btree.c | 12 ++-- fs/xfs/libxfs/xfs_btree.h | 1 - fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 7 +- fs/xfs/libxfs/xfs_refcount.c | 82 +++++++++++----------- fs/xfs/libxfs/xfs_refcount_btree.c | 11 ++- fs/xfs/libxfs/xfs_rmap.c | 108 ++++++++++++++--------------- fs/xfs/libxfs/xfs_rmap_btree.c | 1 - fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/alloc.c | 3 +- fs/xfs/scrub/bmap.c | 2 +- fs/xfs/scrub/ialloc.c | 9 +-- fs/xfs/scrub/refcount.c | 3 +- fs/xfs/scrub/rmap.c | 3 +- fs/xfs/scrub/trace.c | 3 +- fs/xfs/xfs_fsmap.c | 4 +- fs/xfs/xfs_trace.h | 4 +- 19 files changed, 130 insertions(+), 130 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c99a80286efa..f7864f33c1f0 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -230,7 +230,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 0c2e4cff4ee3..6b363f78cfa2 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ 
b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -497,7 +497,6 @@ xfs_allocbt_init_common( /* take a reference for the cursor */ atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - cur->bc_ag.agno = pag->pag_agno; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 44044317c0fb..be74a6b53689 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -216,7 +216,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); } /* @@ -245,7 +245,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_ag.agno, cur->bc_btnum, + cur->bc_ag.pag->pag_agno, cur->bc_btnum, level, index); } @@ -888,13 +888,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -952,7 +952,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); } @@ -1153,7 +1153,7 @@ xfs_btree_init_block_cur( if (cur->bc_flags & XFS_BTREE_LONG_PTRS) owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_ag.agno; + owner = cur->bc_ag.pag->pag_agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 
e71f33f1f111..4dbdc659c396 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -181,7 +181,6 @@ union xfs_btree_irec { /* Per-AG btree information. */ struct xfs_btree_cur_ag { - xfs_agnumber_t agno; struct xfs_perag *pag; union { struct xfs_buf *agbp; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e6f64d41e208..4540fbcd68a3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,7 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; uint64_t realfree; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 450161b53648..823a038939f8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -102,7 +102,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -235,7 +235,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -247,7 +247,7 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -452,7 +452,6 @@ xfs_inobt_init_common( /* take a reference for the cursor */ atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - cur->bc_ag.agno = pag->pag_agno; return cur; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index fd2b9cd7ec66..860a0c9801ba 100644 --- 
a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -47,7 +47,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -64,7 +64,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -81,7 +81,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -109,7 +109,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -120,7 +120,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_ag.agno; + agno = cur->bc_ag.pag->pag_agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -145,7 +145,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); return 0; out_bad_rec: @@ -170,14 +170,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = 
cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -194,7 +194,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; @@ -209,7 +209,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -235,7 +235,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -247,7 +247,7 @@ xfs_refcount_delete( out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -367,7 +367,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &rcext, agbno); /* Establish the right extent. 
*/ @@ -392,7 +392,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -412,7 +412,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_ag.agno, left, center, right); + cur->bc_ag.pag->pag_agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -469,7 +469,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -488,7 +488,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_ag.agno, left, cleft); + cur->bc_ag.pag->pag_agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { @@ -531,7 +531,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -549,7 +549,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_ag.agno, cright, right); + cur->bc_ag.pag->pag_agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -595,7 +595,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -680,13 +680,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); return error; 
out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -769,13 +769,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -953,7 +953,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); /* * Either cover the hole (increment) or @@ -972,7 +972,7 @@ xfs_refcount_adjust_extents( cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.agno, + cur->bc_ag.pag->pag_agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -999,7 +999,7 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) @@ -1017,7 +1017,7 @@ xfs_refcount_adjust_extents( goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.agno, + cur->bc_ag.pag->pag_agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -1036,7 +1036,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1058,10 +1058,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - 
trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* @@ -1100,7 +1100,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1297,7 +1297,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* By default, skip the whole range */ @@ -1377,12 +1377,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_ag.agno, *fbno, *flen); + cur->bc_ag.pag->pag_agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1479,7 +1479,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1507,7 +1507,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1523,7 +1523,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1569,7 +1569,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, 
cur->bc_ag.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1583,7 +1583,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1600,7 +1600,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Remove refcount btree reservation */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 8f6577cb3475..92d336c17e83 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -65,7 +65,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -74,13 +74,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.agno); + ASSERT(args.agno == cur->bc_ag.pag->pag_agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -105,7 +105,7 @@ xfs_refcountbt_free_block( xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, 
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -170,7 +170,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -334,7 +334,6 @@ xfs_refcountbt_init_common( /* take a reference for the cursor */ atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - cur->bc_ag.agno = pag->pag_agno; cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index b23f949ee15c..d1dfad0204e3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -81,7 +81,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -93,7 +93,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -109,7 +109,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -135,7 +135,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_ag.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -151,7 +151,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, 
offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -172,7 +172,7 @@ xfs_rmap_delete( done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_ag.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -199,7 +199,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; @@ -262,7 +262,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_ag.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -314,7 +314,7 @@ xfs_rmap_find_left_neighbor( info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_ag.agno, bno, 0, owner, offset, flags); + cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -322,7 +322,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, irec->rm_startblock, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -338,7 +338,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_ag.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -387,14 +387,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.agno, bno, 0, owner, offset, flags); + cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); error = 
xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, irec->rm_startblock, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -500,7 +500,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -524,7 +524,7 @@ xfs_rmap_unmap( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -590,7 +590,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -668,7 +668,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -680,11 +680,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -775,7 +775,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= 
XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -797,7 +797,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -833,7 +833,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(&gtrec, owner, flags)) @@ -872,7 +872,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -923,7 +923,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -934,11 +934,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1012,7 +1012,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ?
XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1036,7 +1036,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, PREV.rm_startblock, + cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1078,7 +1078,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, LEFT.rm_startblock, + cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1116,7 +1116,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1134,7 +1134,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* reset the cursor back to PREV */ @@ -1164,7 +1164,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1182,7 +1182,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1212,7 +1212,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. 
* The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1249,7 +1249,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1328,7 +1328,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1385,7 +1385,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1416,7 +1416,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); @@ -1443,7 +1443,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1467,12 +1467,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, 
- cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1508,7 +1508,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1575,7 +1575,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1591,7 +1591,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. 
@@ -1882,12 +1882,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1925,7 +1925,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -2074,12 +2074,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2114,7 +2114,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? 
*/ @@ -2140,7 +2140,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2233,12 +2233,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2389,7 +2389,7 @@ xfs_rmap_finish_one( * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.agno != pag->pag_agno) { + if (rcur != NULL && rcur->bc_ag.pag != pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index cafe181bc92d..f29bc71b9950 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -464,7 +464,6 @@ xfs_rmapbt_init_common( /* take a reference for the cursor */ atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - cur->bc_ag.agno = pag->pag_agno; return cur; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index ecc9146647ba..e95f8c98f0f7 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -454,7 +454,7 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. 
*/ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 2720bd7fe53b..d5741980094a 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -15,6 +15,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub free space btrees. @@ -93,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 864c107666d5..0f125583189f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -515,7 +515,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.agno, rec->rm_startblock)) + cur->bc_ag.pag->pag_agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 8d9f3fb0cd22..30e568596b79 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -21,6 +21,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "xfs_ag.h" /* * Set us up to scrub inode btrees. @@ -103,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -163,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. 
*/ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -213,7 +214,7 @@ xchk_iallocbt_check_cluster( struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -423,7 +424,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 744530a66c0c..7014b7408bad 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -13,6 +13,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub reference count btrees. @@ -333,7 +334,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index a4f17477c5d1..fc306573f0ac 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -15,6 +15,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub reverse mapping btrees. 
@@ -91,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2c6c248be823..03882a605a3c 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -13,6 +13,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "scrub/scrub.h" +#include "xfs_ag.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t @@ -26,7 +27,7 @@ xchk_btree_cur_fsbno( cur->bc_flags & XFS_BTREE_LONG_PTRS) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 7501dd941a63..7d0b09c1366e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -355,7 +355,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -373,7 +373,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 808ae337b222..5ba9c6396dcb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3730,7 +3730,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->btnum = cur->bc_btnum; - __entry->agno = 
cur->bc_ag.agno; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; @@ -3845,7 +3845,7 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { - __entry->agno = cur->bc_ag.agno; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = be32_to_cpu(ptr->s); } __entry->nr_records = nr_records; From 4268547305c91b35ae7871374078de788a822ed1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 036/102] xfs: simplify xfs_dialloc_select_ag() return values The only caller of xfs_dialloc_select_ag() will always return -ENOSPC to its caller if the agbp returned from xfs_dialloc_select_ag() is NULL. IOWs, failure to find a candidate AGI we can allocate inodes from is always an ENOSPC condition, so move this logic up into xfs_dialloc_select_ag() so we can simplify the return logic in this function. xfs_dialloc_select_ag() now only ever returns 0 with a locked agbp, or an error with no agbp. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 23 ++++++++--------------- fs/xfs/xfs_inode.c | 3 --- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 4540fbcd68a3..872591e8f5cb 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1717,7 +1717,7 @@ xfs_dialloc_roll( * This function will ensure that the selected AG has free inodes available to * allocate from. The selected AGI will be returned locked to the caller, and it * will allocate more free inodes if required. If no free inodes are found or - * can be allocated, no AGI will be returned. + * can be allocated, -ENOSPC be returned.
*/ int xfs_dialloc_select_ag( @@ -1730,7 +1730,6 @@ xfs_dialloc_select_ag( struct xfs_buf *agbp; xfs_agnumber_t agno; int error; - bool noroom = false; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -1744,7 +1743,7 @@ xfs_dialloc_select_ag( */ start_agno = xfs_ialloc_ag_select(*tpp, parent, mode); if (start_agno == NULLAGNUMBER) - return 0; + return -ENOSPC; /* * If we have already hit the ceiling of inode blocks then clear @@ -1757,7 +1756,6 @@ xfs_dialloc_select_ag( if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { - noroom = true; okalloc = false; } @@ -1794,10 +1792,8 @@ xfs_dialloc_select_ag( if (error) break; - if (pag->pagi_freecount) { - xfs_perag_put(pag); + if (pag->pagi_freecount) goto found_ag; - } if (!okalloc) goto nextag_relse_buffer; @@ -1805,9 +1801,6 @@ xfs_dialloc_select_ag( error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); if (error < 0) { xfs_trans_brelse(*tpp, agbp); - - if (error == -ENOSPC) - error = 0; break; } @@ -1818,12 +1811,11 @@ xfs_dialloc_select_ag( * allocate one of the new inodes. */ ASSERT(pag->pagi_freecount > 0); - xfs_perag_put(pag); error = xfs_dialloc_roll(tpp, agbp); if (error) { xfs_buf_relse(agbp); - return error; + break; } goto found_ag; } @@ -1831,16 +1823,17 @@ xfs_dialloc_select_ag( nextag_relse_buffer: xfs_trans_brelse(*tpp, agbp); nextag: - xfs_perag_put(pag); if (++agno == mp->m_sb.sb_agcount) agno = 0; if (agno == start_agno) - return noroom ? -ENOSPC : 0; + break; + xfs_perag_put(pag); } xfs_perag_put(pag); - return error; + return error ? 
error : -ENOSPC; found_ag: + xfs_perag_put(pag); *IO_agbp = agbp; return 0; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4d397c29ff83..743c8eeee94a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -923,9 +923,6 @@ xfs_dir_ialloc( if (error) return error; - if (!agibp) - return -ENOSPC; - /* Allocate an inode from the selected AG */ error = xfs_dialloc_ag(*tpp, agibp, parent_ino, &ino); if (error) From 89b1f55a2951bb89b7ae9f8cb3fd11513ff3f219 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 037/102] xfs: collapse AG selection for inode allocation xfs_dialloc_select_ag() does a lot of repetitive work. It first calls xfs_ialloc_ag_select() to select the AG to start allocation attempts in, which can do up to two entire loops across the perags that inodes can be allocated in. This is simply checking if there is space available to allocate inodes in an AG, and it returns when it finds the first candidate AG. xfs_dialloc_select_ag() then does its own iterative walk across all the perags locking the AGIs and trying to allocate inodes from the locked AG. It also doesn't limit the search to mp->m_maxagi, so it will walk all AGs whether they can allocate inodes or not. Hence if we are really low on inodes, we could do almost 3 entire walks across the whole perag range before we find an allocation group we can allocate inodes in or report ENOSPC. Because xfs_ialloc_ag_select() returns on the first candidate AG it finds, we can simply do these checks directly in xfs_dialloc_select_ag() before we lock and try to allocate inodes. This reduces the inode allocation pass down to 2 perag sweeps at most - one for aligned inode cluster allocation and if we can't allocate full, aligned inode clusters anywhere we'll do another pass trying to do sparse inode cluster allocation. This also removes a big chunk of duplicate code. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_ialloc.c | 225 +++++++++++++------------------------ 1 file changed, 78 insertions(+), 147 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 872591e8f5cb..79119af36d12 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -899,139 +899,6 @@ sparse_alloc: return 0; } -STATIC xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - -/* - * Select an allocation group to look for a free inode in, based on the parent - * inode and the mode. Return the allocation group buffer. - */ -STATIC xfs_agnumber_t -xfs_ialloc_ag_select( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent directory inode number */ - umode_t mode) /* bits set to indicate file type */ -{ - xfs_agnumber_t agcount; /* number of ag's in the filesystem */ - xfs_agnumber_t agno; /* current ag number */ - int flags; /* alloc buffer locking flags */ - xfs_extlen_t ineed; /* blocks needed for inode allocation */ - xfs_extlen_t longest = 0; /* longest extent available */ - xfs_mount_t *mp; /* mount point structure */ - int needspace; /* file mode implies space allocated */ - xfs_perag_t *pag; /* per allocation group data */ - xfs_agnumber_t pagno; /* parent (starting) ag number */ - int error; - - /* - * Files of these types need at least one block if length > 0 - * (and they won't fit in the inode, but that's hard to figure out). 
- */ - needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); - mp = tp->t_mountp; - agcount = mp->m_maxagi; - if (S_ISDIR(mode)) - pagno = xfs_ialloc_next_ag(mp); - else { - pagno = XFS_INO_TO_AGNO(mp, parent); - if (pagno >= agcount) - pagno = 0; - } - - ASSERT(pagno < agcount); - - /* - * Loop through allocation groups, looking for one with a little - * free space in it. Note we don't look for free inodes, exactly. - * Instead, we include whether there is a need to allocate inodes - * to mean that blocks must be allocated for them, - * if none are currently free. - */ - agno = pagno; - flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto nextag; - } - - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, agno); - if (error) - goto nextag; - } - - if (pag->pagi_freecount) { - xfs_perag_put(pag); - return agno; - } - - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, agno, flags); - if (error) - goto nextag; - } - - /* - * Check that there is enough free space for the file plus a - * chunk of inodes if we need to allocate some. If this is the - * first pass across the AGs, take into account the potential - * space needed for alignment of inode chunks when checking the - * longest contiguous free space in the AG - this prevents us - * from getting ENOSPC because we have free space larger than - * ialloc_blks but alignment constraints prevent us from using - * it. - * - * If we can't find an AG with space for full alignment slack to - * be taken into account, we must be near ENOSPC in all AGs. - * Hence we don't include alignment for the second pass and so - * if we fail allocation due to alignment issues then it is most - * likely a real ENOSPC condition. 
- */ - ineed = M_IGEO(mp)->ialloc_min_blks; - if (flags && ineed > 1) - ineed += M_IGEO(mp)->cluster_align; - longest = pag->pagf_longest; - if (!longest) - longest = pag->pagf_flcount > 0; - - if (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed) { - xfs_perag_put(pag); - return agno; - } -nextag: - xfs_perag_put(pag); - /* - * No point in iterating over the rest, if we're shutting - * down. - */ - if (XFS_FORCED_SHUTDOWN(mp)) - return NULLAGNUMBER; - agno++; - if (agno >= agcount) - agno = 0; - if (agno == pagno) { - if (flags == 0) - return NULLAGNUMBER; - flags = 0; - } - } -} - /* * Try to retrieve the next record to the left/right from the current one. */ @@ -1708,6 +1575,21 @@ xfs_dialloc_roll( return 0; } +STATIC xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor >= mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + /* * Select and prepare an AG for inode allocation. * @@ -1734,16 +1616,24 @@ xfs_dialloc_select_ag( struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); bool okalloc = true; + int needspace; + int flags; *IO_agbp = NULL; /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. + * Directories, symlinks, and regular files frequently allocate at least + * one block, so factor that potential expansion when we examine whether + * an AG has enough space for file creation. 
*/ - start_agno = xfs_ialloc_ag_select(*tpp, parent, mode); - if (start_agno == NULLAGNUMBER) - return -ENOSPC; + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + if (S_ISDIR(mode)) + start_agno = xfs_ialloc_next_ag(mp); + else { + start_agno = XFS_INO_TO_AGNO(mp, parent); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + } /* * If we have already hit the ceiling of inode blocks then clear @@ -1765,12 +1655,14 @@ xfs_dialloc_select_ag( * allocation groups upward, wrapping at the end. */ agno = start_agno; + flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { + xfs_extlen_t ineed; + xfs_extlen_t longest = 0; + pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); + if (!pag->pagi_inodeok) goto nextag; - } if (!pag->pagi_init) { error = xfs_ialloc_pagi_init(mp, *tpp, agno); @@ -1778,12 +1670,44 @@ xfs_dialloc_select_ag( break; } - /* - * Do a first racy fast path check if this AG is usable. - */ if (!pag->pagi_freecount && !okalloc) goto nextag; + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, *tpp, agno, flags); + if (error) + goto nextag; + } + + /* + * Check that there is enough free space for the file plus a + * chunk of inodes if we need to allocate some. If this is the + * first pass across the AGs, take into account the potential + * space needed for alignment of inode chunks when checking the + * longest contiguous free space in the AG - this prevents us + * from getting ENOSPC because we have free space larger than + * ialloc_blks but alignment constraints prevent us from using + * it. + * + * If we can't find an AG with space for full alignment slack to + * be taken into account, we must be near ENOSPC in all AGs. + * Hence we don't include alignment for the second pass and so + * if we fail allocation due to alignment issues then it is most + * likely a real ENOSPC condition. 
+ */ + if (!pag->pagi_freecount) { + ineed = M_IGEO(mp)->ialloc_min_blks; + if (flags && ineed > 1) + ineed += M_IGEO(mp)->cluster_align; + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks < needspace + ineed || + longest < ineed) + goto nextag; + } + /* * Then read in the AGI buffer and recheck with the AGI buffer * lock held. @@ -1823,10 +1747,17 @@ xfs_dialloc_select_ag( nextag_relse_buffer: xfs_trans_brelse(*tpp, agbp); nextag: - if (++agno == mp->m_sb.sb_agcount) - agno = 0; - if (agno == start_agno) + if (XFS_FORCED_SHUTDOWN(mp)) { + error = -EFSCORRUPTED; break; + } + if (++agno == mp->m_maxagi) + agno = 0; + if (agno == start_agno) { + if (!flags) + break; + flags = 0; + } xfs_perag_put(pag); } From b652afd937033911944d7f681f2031b006961f1d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 038/102] xfs: get rid of xfs_dir_ialloc() This is just a simple wrapper around the per-ag inode allocation that doesn't need to exist. The internal mechanism to select and allocate within an AG does not need to be exposed outside xfs_ialloc.c, and it being exposed simply makes it harder to follow the code and simplify it. This is simplified by internalising xfs_dialloc_select_ag() and xfs_dialloc_ag() into a single xfs_dialloc() function and then xfs_dir_ialloc() can go away. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_ialloc.c | 17 ++++++---- fs/xfs/libxfs/xfs_ialloc.h | 27 +++------------- fs/xfs/xfs_inode.c | 66 +++++++------------------------------- fs/xfs/xfs_inode.h | 9 +++--- fs/xfs/xfs_qm.c | 9 ++++-- fs/xfs/xfs_symlink.c | 9 ++++-- 6 files changed, 44 insertions(+), 93 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 79119af36d12..4a04ca79ba33 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1428,7 +1428,7 @@ xfs_dialloc_ag_update_inobt( * The caller selected an AG for us, and made sure that free inodes are * available. */ -int +static int xfs_dialloc_ag( struct xfs_trans *tp, struct xfs_buf *agbp, @@ -1602,24 +1602,23 @@ xfs_ialloc_next_ag( * can be allocated, -ENOSPC be returned. */ int -xfs_dialloc_select_ag( +xfs_dialloc( struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, - struct xfs_buf **IO_agbp) + xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno; - int error; + int error = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); bool okalloc = true; int needspace; int flags; - - *IO_agbp = NULL; + xfs_ino_t ino; /* * Directories, symlinks, and regular files frequently allocate at least @@ -1765,7 +1764,11 @@ nextag: return error ? error : -ENOSPC; found_ag: xfs_perag_put(pag); - *IO_agbp = agbp; + /* Allocate an inode in the found AG */ + error = xfs_dialloc_ag(*tpp, agbp, parent, &ino); + if (error) + return error; + *new_ino = ino; return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 3511086a7ae1..886f6748fb22 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -33,30 +33,11 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. 
- * - * There are two phases to inode allocation: selecting an AG and ensuring - * that it contains free inodes, followed by allocating one of the free - * inodes. xfs_dialloc_select_ag() does the former and returns a locked AGI - * to the caller, ensuring that followup call to xfs_dialloc_ag() will - * have free inodes to allocate from. xfs_dialloc_ag() will return the inode - * number of the free inode we allocated. + * Allocate an inode on disk. Mode is used to tell whether the new inode will + * need space, and whether it is a directory. */ -int /* error */ -xfs_dialloc_select_ag( - struct xfs_trans **tpp, /* double pointer of transaction */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - struct xfs_buf **IO_agbp); - -int -xfs_dialloc_ag( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_ino_t parent, - xfs_ino_t *inop); +int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, + xfs_ino_t *new_ino); /* * Free disk inode. Carefully avoids touching the incore inode, all diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 743c8eeee94a..8d204d516621 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -749,7 +749,7 @@ xfs_inode_inherit_flags2( * Initialise a newly allocated inode and return the in-core inode to the * caller locked exclusively. */ -static int +int xfs_init_new_inode( struct user_namespace *mnt_userns, struct xfs_trans *tp, @@ -885,54 +885,6 @@ xfs_init_new_inode( return 0; } -/* - * Allocates a new inode from disk and return a pointer to the incore copy. This - * routine will internally commit the current transaction and allocate a new one - * if we needed to allocate more on-disk free inodes to perform the requested - * operation. - * - * If we are allocating quota inodes, we do not have a parent inode to attach to - * or associate with (i.e. 
dp == NULL) because they are not linked into the - * directory structure - they are attached directly to the superblock - and so - * have no parent. - */ -int -xfs_dir_ialloc( - struct user_namespace *mnt_userns, - struct xfs_trans **tpp, - struct xfs_inode *dp, - umode_t mode, - xfs_nlink_t nlink, - dev_t rdev, - prid_t prid, - bool init_xattrs, - struct xfs_inode **ipp) -{ - struct xfs_buf *agibp; - xfs_ino_t parent_ino = dp ? dp->i_ino : 0; - xfs_ino_t ino; - int error; - - ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); - - /* - * Call the space management code to pick the on-disk inode to be - * allocated. - */ - error = xfs_dialloc_select_ag(tpp, parent_ino, mode, &agibp); - if (error) - return error; - - /* Allocate an inode from the selected AG */ - error = xfs_dialloc_ag(*tpp, agibp, parent_ino, &ino); - if (error) - return error; - ASSERT(ino != NULLFSINO); - - return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, - prid, init_xattrs, ipp); -} - /* * Decrement the link count on an inode & log the change. If this causes the * link count to go to zero, move the inode to AGI unlinked list so that it can @@ -990,6 +942,7 @@ xfs_create( struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; uint resblks; + xfs_ino_t ino; trace_xfs_create(dp, name); @@ -1046,14 +999,16 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, init_xattrs, &ip); + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; /* * Now we join the directory inode to the transaction. We do not do it - * earlier because xfs_dir_ialloc might commit the previous transaction + * earlier because xfs_dialloc might commit the previous transaction * (and release all the locks). 
An error from here on will result in * the transaction cancel unlocking dp so don't do it explicitly in the * error path. @@ -1143,6 +1098,7 @@ xfs_create_tmpfile( struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; uint resblks; + xfs_ino_t ino; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1167,8 +1123,10 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, - false, &ip); + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + 0, 0, prid, false, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca826cfba91c..4b6703dbffb8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -431,11 +431,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_dir_ialloc(struct user_namespace *mnt_userns, - struct xfs_trans **tpp, struct xfs_inode *dp, - umode_t mode, xfs_nlink_t nlink, dev_t dev, - prid_t prid, bool need_xattr, - struct xfs_inode **ipp); +int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp, + struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, + xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, + struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f7baf4dc2554..fe341f3fd419 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -24,6 +24,7 @@ #include "xfs_icache.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_ialloc.h" /* * The global quota manager. 
There is only one of these for the entire @@ -788,8 +789,12 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0, - 0, false, ipp); + xfs_ino_t ino; + + error = xfs_dialloc(&tp, 0, S_IFREG, &ino); + if (!error) + error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino, + S_IFREG, 1, 0, 0, false, ipp); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 99fbec32c10a..1525636f4065 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -21,6 +21,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_ialloc.h" /* ----- Kernel only functions below ----- */ int @@ -161,6 +162,7 @@ xfs_symlink( struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; uint resblks; + xfs_ino_t ino; *ipp = NULL; @@ -223,8 +225,11 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT), - 1, 0, prid, false, &ip); + error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, + S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, + false, &ip); if (error) goto out_trans_cancel; From 309161f6603ce1a53b76a42817cde2a9bcd17e82 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 039/102] xfs: inode allocation can use a single perag instance Now that we've internalised the two-phase inode allocation, we can now easily make the AG selection and allocation atomic from the perspective of a single perag context. This will ensure AGs going offline/away cannot occur between the selection and allocation steps. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ialloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 4a04ca79ba33..a5604df959cf 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1432,6 +1432,7 @@ static int xfs_dialloc_ag( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { @@ -1446,7 +1447,6 @@ xfs_dialloc_ag( int error; int offset; int i; - struct xfs_perag *pag = agbp->b_pag; if (!xfs_sb_version_hasfinobt(&mp->m_sb)) return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); @@ -1763,9 +1763,9 @@ nextag: xfs_perag_put(pag); return error ? error : -ENOSPC; found_ag: - xfs_perag_put(pag); /* Allocate an inode in the found AG */ - error = xfs_dialloc_ag(*tpp, agbp, parent, &ino); + error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + xfs_perag_put(pag); if (error) return error; *new_ino = ino; From 8237fbf53d6fd2a3a248fc2a8608e047ef22316c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:24 +1000 Subject: [PATCH 040/102] xfs: clean up and simplify xfs_dialloc() Because it's a mess. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 271 +++++++++++++++++++++---------------- 1 file changed, 153 insertions(+), 118 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a5604df959cf..63a1b6d422cc 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -604,9 +604,10 @@ error: } /* - * Allocate new inodes in the allocation group specified by agbp. - * Returns 0 if inodes were allocated in this AG; 1 if there was no space - * in this AG; or the usual negative error code. + * Allocate new inodes in the allocation group specified by agbp. 
Returns 0 if + * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so + * the caller knows it can try another AG, a hard -ENOSPC when over the maximum + * inode count threshold, or the usual negative error code for other errors. */ STATIC int xfs_ialloc_ag_alloc( @@ -792,7 +793,7 @@ sparse_alloc: } if (args.fsbno == NULLFSBLOCK) - return 1; + return -EAGAIN; ASSERT(args.len == args.minlen); @@ -1568,14 +1569,17 @@ xfs_dialloc_roll( /* Re-attach the quota info that we detached from prev trx. */ tp->t_dqinfo = dqinfo; - *tpp = tp; - if (error) - return error; + /* + * Join the buffer even on commit error so that the buffer is released + * when the caller cancels the transaction and doesn't have to handle + * this error case specially. + */ xfs_trans_bjoin(tp, agibp); - return 0; + *tpp = tp; + return error; } -STATIC xfs_agnumber_t +static xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { @@ -1590,16 +1594,136 @@ xfs_ialloc_next_ag( return agno; } +static bool +xfs_dialloc_good_ag( + struct xfs_trans *tp, + struct xfs_perag *pag, + umode_t mode, + int flags, + bool ok_alloc) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_extlen_t ineed; + xfs_extlen_t longest = 0; + int needspace; + int error; + + if (!pag->pagi_inodeok) + return false; + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); + if (error) + return false; + } + + if (pag->pagi_freecount) + return true; + if (!ok_alloc) + return false; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); + if (error) + return false; + } + + /* + * Check that there is enough free space for the file plus a chunk of + * inodes if we need to allocate some. 
If this is the first pass across + * the AGs, take into account the potential space needed for alignment + * of inode chunks when checking the longest contiguous free space in + * the AG - this prevents us from getting ENOSPC because we have free + * space larger than ialloc_blks but alignment constraints prevent us + * from using it. + * + * If we can't find an AG with space for full alignment slack to be + * taken into account, we must be near ENOSPC in all AGs. Hence we + * don't include alignment for the second pass and so if we fail + * allocation due to alignment issues then it is most likely a real + * ENOSPC condition. + * + * XXX(dgc): this calculation is now bogus thanks to the per-ag + * reservations that xfs_alloc_fix_freelist() now does via + * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will + * be more than large enough for the check below to succeed, but + * xfs_alloc_space_available() will fail because of the non-zero + * metadata reservation and hence we won't actually be able to allocate + * more inodes in this AG. We do soooo much unnecessary work near ENOSPC + * because of this. + */ + ineed = M_IGEO(mp)->ialloc_min_blks; + if (flags && ineed > 1) + ineed += M_IGEO(mp)->cluster_align; + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + + if (pag->pagf_freeblks < needspace + ineed || longest < ineed) + return false; + return true; +} + +static int +xfs_dialloc_try_ag( + struct xfs_trans **tpp, + struct xfs_perag *pag, + xfs_ino_t parent, + xfs_ino_t *new_ino, + bool ok_alloc) +{ + struct xfs_buf *agbp; + xfs_ino_t ino; + int error; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. 
+ */ + error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); + if (error) + return error; + + if (!pag->pagi_freecount) { + if (!ok_alloc) { + error = -EAGAIN; + goto out_release; + } + + error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + if (error < 0) + goto out_release; + + /* + * We successfully allocated space for an inode cluster in this + * AG. Roll the transaction so that we can allocate one of the + * new inodes. + */ + ASSERT(pag->pagi_freecount > 0); + error = xfs_dialloc_roll(tpp, agbp); + if (error) + goto out_release; + } + + /* Allocate an inode in the found AG */ + error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + if (!error) + *new_ino = ino; + return error; + +out_release: + xfs_trans_brelse(*tpp, agbp); + return error; +} + /* - * Select and prepare an AG for inode allocation. + * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to - * locate it. - * - * This function will ensure that the selected AG has free inodes available to - * allocate from. The selected AGI will be returned locked to the caller, and it - * will allocate more free inodes if required. If no free inodes are found or - * can be allocated, -ENOSPC be returned. + * locate it. The on-disk inode that is allocated will be returned in @new_ino + * on success, otherwise an error will be set to indicate the failure (e.g. + * -ENOSPC). */ int xfs_dialloc( @@ -1609,14 +1733,12 @@ xfs_dialloc( xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; - struct xfs_buf *agbp; xfs_agnumber_t agno; int error = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); - bool okalloc = true; - int needspace; + bool ok_alloc = true; int flags; xfs_ino_t ino; @@ -1625,7 +1747,6 @@ xfs_dialloc( * one block, so factor that potential expansion when we examine whether * an AG has enough space for file creation. 
*/ - needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); if (S_ISDIR(mode)) start_agno = xfs_ialloc_next_ag(mp); else { @@ -1636,7 +1757,7 @@ xfs_dialloc( /* * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free + * ok_alloc so we scan all available agi structures for a free * inode. * * Read rough value of mp->m_icount by percpu_counter_read_positive, @@ -1645,7 +1766,7 @@ xfs_dialloc( if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { - okalloc = false; + ok_alloc = false; } /* @@ -1656,96 +1777,14 @@ xfs_dialloc( agno = start_agno; flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { - xfs_extlen_t ineed; - xfs_extlen_t longest = 0; - pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) - goto nextag; - - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, *tpp, agno); - if (error) + if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(tpp, pag, parent, + &ino, ok_alloc); + if (error != -EAGAIN) break; } - if (!pag->pagi_freecount && !okalloc) - goto nextag; - - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, *tpp, agno, flags); - if (error) - goto nextag; - } - - /* - * Check that there is enough free space for the file plus a - * chunk of inodes if we need to allocate some. If this is the - * first pass across the AGs, take into account the potential - * space needed for alignment of inode chunks when checking the - * longest contiguous free space in the AG - this prevents us - * from getting ENOSPC because we have free space larger than - * ialloc_blks but alignment constraints prevent us from using - * it. - * - * If we can't find an AG with space for full alignment slack to - * be taken into account, we must be near ENOSPC in all AGs. 
- * Hence we don't include alignment for the second pass and so - * if we fail allocation due to alignment issues then it is most - * likely a real ENOSPC condition. - */ - if (!pag->pagi_freecount) { - ineed = M_IGEO(mp)->ialloc_min_blks; - if (flags && ineed > 1) - ineed += M_IGEO(mp)->cluster_align; - longest = pag->pagf_longest; - if (!longest) - longest = pag->pagf_flcount > 0; - - if (pag->pagf_freeblks < needspace + ineed || - longest < ineed) - goto nextag; - } - - /* - * Then read in the AGI buffer and recheck with the AGI buffer - * lock held. - */ - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agbp); - if (error) - break; - - if (pag->pagi_freecount) - goto found_ag; - - if (!okalloc) - goto nextag_relse_buffer; - - error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); - if (error < 0) { - xfs_trans_brelse(*tpp, agbp); - break; - } - - if (error == 0) { - /* - * We successfully allocated space for an inode cluster - * in this AG. Roll the transaction so that we can - * allocate one of the new inodes. - */ - ASSERT(pag->pagi_freecount > 0); - - error = xfs_dialloc_roll(tpp, agbp); - if (error) { - xfs_buf_relse(agbp); - break; - } - goto found_ag; - } - -nextag_relse_buffer: - xfs_trans_brelse(*tpp, agbp); -nextag: if (XFS_FORCED_SHUTDOWN(mp)) { error = -EFSCORRUPTED; break; @@ -1753,23 +1792,19 @@ nextag: if (++agno == mp->m_maxagi) agno = 0; if (agno == start_agno) { - if (!flags) + if (!flags) { + error = -ENOSPC; break; + } flags = 0; } xfs_perag_put(pag); } + if (!error) + *new_ino = ino; xfs_perag_put(pag); - return error ? 
error : -ENOSPC; -found_ag: - /* Allocate an inode in the found AG */ - error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); - xfs_perag_put(pag); - if (error) - return error; - *new_ino = ino; - return 0; + return error; } /* From f40aadb2bb64fe0a3d9b59957e70796d629cdee2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:51 +1000 Subject: [PATCH 041/102] xfs: use perag through unlink processing Unlinked lists are held in the perag, and freeing of inodes needs to be passed a perag, too, so look up the perag early in the unlink processing and use it throughout. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ialloc.c | 23 +++---- fs/xfs/libxfs/xfs_ialloc.h | 13 +--- fs/xfs/xfs_inode.c | 131 +++++++++++++++++++++---------------- 3 files changed, 87 insertions(+), 80 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 63a1b6d422cc..2ed6de6faf8a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2134,35 +2134,33 @@ error: */ int xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - struct xfs_icluster *xic) /* cluster info if deleted */ + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_ino_t inode, + struct xfs_icluster *xic) { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ struct xfs_buf *agbp; /* buffer for allocation group header */ xfs_agino_t agino; /* allocation group inode number */ - xfs_agnumber_t agno; /* allocation group number */ int error; /* error return value */ struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ - struct xfs_perag *pag; /* * Break up inode number into its components. 
*/ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); + if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); ASSERT(0); return -EINVAL; } @@ -2176,7 +2174,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2186,7 +2184,6 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - pag = agbp->b_pag; error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 886f6748fb22..9df7c80408ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -39,17 +39,8 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, xfs_ino_t *new_ino); -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. 
- */ -int /* error */ -xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - struct xfs_icluster *ifree); /* cluster info if deleted */ +int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, + xfs_ino_t ino, struct xfs_icluster *ifree); /* * Return the location of the inode in imap, for mapping it into a buffer. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d204d516621..336c350206a8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,7 +45,8 @@ kmem_zone_t *xfs_inode_zone; #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); -STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); +STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *); /* * helper function to extract extent size hint from inode @@ -1241,7 +1242,11 @@ xfs_link( * Handle initial link state of O_TMPFILE inode */ if (VFS_I(sip)->i_nlink == 0) { - error = xfs_iunlink_remove(tp, sip); + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino)); + error = xfs_iunlink_remove(tp, pag, sip); + xfs_perag_put(pag); if (error) goto error_return; } @@ -1934,7 +1939,7 @@ xfs_iunlink_destroy( STATIC int xfs_iunlink_update_bucket( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_buf *agibp, unsigned int bucket_index, xfs_agino_t new_agino) @@ -1943,10 +1948,10 @@ xfs_iunlink_update_bucket( xfs_agino_t old_value; int offset; - ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); - trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, old_value, new_agino); /* @@ -1970,7 +1975,7 @@ xfs_iunlink_update_bucket( STATIC void xfs_iunlink_update_dinode( 
struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_buf *ibp, struct xfs_dinode *dip, @@ -1980,9 +1985,9 @@ xfs_iunlink_update_dinode( struct xfs_mount *mp = tp->t_mountp; int offset; - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - trace_xfs_iunlink_update_dinode(mp, agno, agino, + trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, be32_to_cpu(dip->di_next_unlinked), next_agino); dip->di_next_unlinked = cpu_to_be32(next_agino); @@ -2000,7 +2005,7 @@ STATIC int xfs_iunlink_update_inode( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t next_agino, xfs_agino_t *old_next_agino) { @@ -2010,7 +2015,7 @@ xfs_iunlink_update_inode( xfs_agino_t old_value; int error; - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); if (error) @@ -2019,7 +2024,7 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; @@ -2042,7 +2047,7 @@ xfs_iunlink_update_inode( } /* Ok, update the new pointer. 
*/ - xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino), ibp, dip, &ip->i_imap, next_agino); return 0; out: @@ -2063,10 +2068,10 @@ xfs_iunlink( struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_agi *agi; struct xfs_buf *agibp; xfs_agino_t next_agino; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; @@ -2075,10 +2080,12 @@ xfs_iunlink( ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); if (error) - return error; + goto out; agi = agibp->b_addr; /* @@ -2088,9 +2095,10 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, agno, next_agino)) { + !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) { xfs_buf_mark_corrupt(agibp); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + goto out; } if (next_agino != NULLAGINO) { @@ -2100,23 +2108,26 @@ xfs_iunlink( * There is already another inode in the bucket, so point this * inode to the current head of the list. */ - error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, &old_agino); if (error) - return error; + goto out; ASSERT(old_agino == NULLAGINO); /* * agino has been unlinked, add a backref from the next inode * back to agino. */ - error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); + error = xfs_iunlink_add_backref(pag, agino, next_agino); if (error) - return error; + goto out; } /* Point the head of the list to point to this inode. 
*/ - return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +out: + xfs_perag_put(pag); + return error; } /* Return the imap, dinode pointer, and buffer for an inode. */ @@ -2164,14 +2175,13 @@ xfs_iunlink_map_ino( STATIC int xfs_iunlink_map_prev( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t head_agino, xfs_agino_t target_agino, xfs_agino_t *agino, struct xfs_imap *imap, struct xfs_dinode **dipp, - struct xfs_buf **bpp, - struct xfs_perag *pag) + struct xfs_buf **bpp) { struct xfs_mount *mp = tp->t_mountp; xfs_agino_t next_agino; @@ -2183,7 +2193,8 @@ xfs_iunlink_map_prev( /* See if our backref cache can find it faster. */ *agino = xfs_iunlink_lookup_backref(pag, target_agino); if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap, + dipp, bpp); if (error) return error; @@ -2199,7 +2210,7 @@ xfs_iunlink_map_prev( WARN_ON_ONCE(1); } - trace_xfs_iunlink_map_prev_fallback(mp, agno); + trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno); /* Otherwise, walk the entire bucket until we find it. */ next_agino = head_agino; @@ -2210,8 +2221,8 @@ xfs_iunlink_map_prev( xfs_trans_brelse(tp, *bpp); *agino = next_agino; - error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, - bpp); + error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap, + dipp, bpp); if (error) return error; @@ -2220,7 +2231,7 @@ xfs_iunlink_map_prev( * Make sure this pointer is valid and isn't an obvious * infinite loop. 
*/ - if (!xfs_verify_agino(mp, agno, unlinked_agino) || + if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) || next_agino == unlinked_agino) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -2240,6 +2251,7 @@ xfs_iunlink_map_prev( STATIC int xfs_iunlink_remove( struct xfs_trans *tp, + struct xfs_perag *pag, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; @@ -2247,7 +2259,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; xfs_agino_t head_agino; @@ -2257,7 +2268,7 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); if (error) return error; agi = agibp->b_addr; @@ -2267,7 +2278,7 @@ xfs_iunlink_remove( * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, agno, head_agino)) { + if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; @@ -2278,7 +2289,7 @@ xfs_iunlink_remove( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); if (error) return error; @@ -2290,8 +2301,7 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. 
*/ if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, - NULLAGINO); + error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); if (error) return error; } @@ -2301,14 +2311,13 @@ xfs_iunlink_remove( xfs_agino_t prev_agino; /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp, - agibp->b_pag); + error = xfs_iunlink_map_prev(tp, pag, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp); if (error) return error; /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp, last_dip, &imap, next_agino); /* @@ -2324,7 +2333,7 @@ xfs_iunlink_remove( } /* Point the head of the list to the next unlinked inode. */ - return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, next_agino); } @@ -2335,12 +2344,11 @@ xfs_iunlink_remove( */ static void xfs_ifree_mark_inode_stale( - struct xfs_buf *bp, + struct xfs_perag *pag, struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_perag *pag = bp->b_pag; + struct xfs_mount *mp = pag->pag_mount; struct xfs_inode_log_item *iip; struct xfs_inode *ip; @@ -2430,10 +2438,11 @@ out_iflags_unlock: * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC int +static int xfs_ifree_cluster( - struct xfs_inode *free_ip, struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *free_ip, struct xfs_icluster *xic) { struct xfs_mount *mp = free_ip->i_mount; @@ -2495,7 +2504,7 @@ xfs_ifree_cluster( * already marked XFS_ISTALE. 
*/ for (i = 0; i < igeo->inodes_per_cluster; i++) - xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); + xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); @@ -2518,9 +2527,11 @@ xfs_ifree( struct xfs_trans *tp, struct xfs_inode *ip) { - int error; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; struct xfs_icluster xic = { 0 }; struct xfs_inode_log_item *iip = ip->i_itemp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2528,16 +2539,18 @@ xfs_ifree( ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_nblocks == 0); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + /* * Pull the on-disk inode from the AGI unlinked list. */ - error = xfs_iunlink_remove(tp, ip); + error = xfs_iunlink_remove(tp, pag, ip); if (error) - return error; + goto out; - error = xfs_difree(tp, ip->i_ino, &xic); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; /* * Free any local-format data sitting around before we reset the @@ -2552,7 +2565,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_diflags = 0; - ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2; + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; ip->i_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) @@ -2571,8 +2584,9 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xic.deleted) - error = xfs_ifree_cluster(ip, tp, &xic); - + error = xfs_ifree_cluster(tp, pag, ip, &xic); +out: + xfs_perag_put(pag); return error; } @@ -3176,8 +3190,13 @@ xfs_rename( * in future. 
*/ if (wip) { + struct xfs_perag *pag; + ASSERT(VFS_I(wip)->i_nlink == 0); - error = xfs_iunlink_remove(tp, wip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); + error = xfs_iunlink_remove(tp, pag, wip); + xfs_perag_put(pag); if (error) goto out_trans_cancel; From 509201163fca3d4d906bd50a5320115d42818748 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 10:48:51 +1000 Subject: [PATCH 042/102] xfs: remove xfs_perag_t Almost unused, gets rid of another typedef. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 24 +++++++++++----------- fs/xfs/libxfs/xfs_ag.h | 4 ++-- fs/xfs/libxfs/xfs_alloc.c | 42 +++++++++++++++++++-------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 0e0819f6fb89..29c42698aa90 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -104,19 +104,19 @@ xfs_perag_put( */ int xfs_initialize_perag_data( - struct xfs_mount *mp, - xfs_agnumber_t agcount) + struct xfs_mount *mp, + xfs_agnumber_t agcount) { - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - uint64_t fdblocks; - int error = 0; + xfs_agnumber_t index; + struct xfs_perag *pag; + struct xfs_sb *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + uint64_t fdblocks; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index ebf997a8684e..6006b4329026 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -29,7 +29,7 @@ struct xfs_ag_resv { * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. 
*/ -typedef struct xfs_perag { +struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ xfs_agnumber_t pag_agno; /* AG this structure belongs to */ atomic_t pag_ref; /* perag reference count */ @@ -102,7 +102,7 @@ typedef struct xfs_perag { * or have some other means to control concurrency. */ struct rhashtable pagi_unlinked_hash; -} xfs_perag_t; +}; int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f7864f33c1f0..00bb34251829 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2694,21 +2694,21 @@ out_no_agbp: * Get a block from the freelist. * Returns with the buffer for the block gotten. */ -int /* error */ +int xfs_alloc_get_freelist( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk) /* destination is a AGF btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agblock_t *bnop, + int btreeblk) { - struct xfs_agf *agf = agbp->b_addr; - struct xfs_buf *agflbp;/* buffer for a.g. freelist structure */ - xfs_agblock_t bno; /* block number returned */ - __be32 *agfl_bno; - int error; - int logflags; - xfs_mount_t *mp = tp->t_mountp; - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agflbp; + xfs_agblock_t bno; + __be32 *agfl_bno; + int error; + int logflags; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; /* * Freelist is empty, give up. @@ -2818,20 +2818,20 @@ xfs_alloc_pagf_init( /* * Put the block on the freelist for the allocation group. */ -int /* error */ +int xfs_alloc_put_freelist( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. 
free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk) /* block came from a AGF btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_buf *agflbp, + xfs_agblock_t bno, + int btreeblk) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - __be32 *blockp;/* pointer to array entry */ + struct xfs_perag *pag; + __be32 *blockp; int error; int logflags; - xfs_perag_t *pag; /* per allocation group data */ __be32 *agfl_bno; int startoff; From 5f7fd75086203a8a4dd3e518976e52bcf24e8b22 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 2 Jun 2021 14:54:09 -0700 Subject: [PATCH 043/102] xfs: sort variable alphabetically to avoid repeated declaration Variable 'xfs_agf_buf_ops', 'xfs_agi_buf_ops', 'xfs_dquot_buf_ops' and 'xfs_symlink_buf_ops' are declared twice, so sort these variables alphabetically and remove the repeated declaration. Cc: "Darrick J. Wong" Signed-off-by: Shaokun Zhang Reviewed-by: Carlos Maiolino Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_shared.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 782fdd08f759..25c4cab58851 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -22,30 +22,26 @@ struct xfs_inode; * Buffer verifier operations are widely used, including userspace tools */ extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_bnobt_buf_ops; -extern const struct xfs_buf_ops xfs_cntbt_buf_ops; -extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; -extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_da3_node_buf_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_finobt_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops 
xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_rtbuf_ops; /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); From 9673261c32dc2f30863b803374b726a72d16b07c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 2 Jun 2021 14:56:29 -0700 Subject: [PATCH 044/102] xfs: Remove redundant assignment to busy Variable busy is set to false, but this value is never read as it is overwritten or not used later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: fs/xfs/libxfs/xfs_alloc.c:1679:2: warning: Value stored to 'busy' is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 82b7cbb1f24f..ae46fe64cc4f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1676,7 +1676,6 @@ restart: cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; - busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. From 5a981e4ea8ff8062e7c7ea8fc4a1565e4820a08b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 14:58:59 -0700 Subject: [PATCH 045/102] xfs: mark xfs_bmap_set_attrforkoff static xfs_bmap_set_attrforkoff is only used inside of xfs_bmap.c, so mark it static. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_bmap.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a3e0e6f672d6..7eb6b28a4c30 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1028,7 +1028,7 @@ xfs_bmap_add_attrfork_local( /* * Set an inode attr fork offset based on the format of the data fork. */ -int +static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, int size, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f9a390ecfb1d..67641f669918 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -187,7 +187,6 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); -int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, From 977ec4ddf0b75b30afa443cf71ae80e20f501b15 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 15:00:38 -0700 Subject: [PATCH 046/102] xfs: don't take a spinlock unconditionally in the DIO fastpath Because this happens at high thread counts on high IOPS devices doing mixed read/write AIO-DIO to a single file at about a million iops: 64.09% 0.21% [kernel] [k] io_submit_one - 63.87% io_submit_one - 44.33% aio_write - 42.70% xfs_file_write_iter - 41.32% xfs_file_dio_write_aligned - 25.51% xfs_file_write_checks - 21.60% _raw_spin_lock - 21.59% do_raw_spin_lock - 19.70% __pv_queued_spin_lock_slowpath This also happens on the IO completion path: 22.89% 0.69% [kernel] [k] xfs_dio_write_end_io - 22.49% xfs_dio_write_end_io - 21.79% _raw_spin_lock - 20.97% do_raw_spin_lock - 20.10% __pv_queued_spin_lock_slowpath IOWs, fio is burning ~14 
whole CPUs on this spin lock. So, do an unlocked check against inode size first, then if we are at/beyond EOF, take the spinlock and recheck. This makes the spinlock disappear from the overwrite fastpath. I'd like to report that fixing this makes things go faster. It doesn't - it just exposes the XFS_ILOCK as the next severe contention point doing extent mapping lookups, and that now burns all the 14 CPUs this spinlock was burning. Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 396ef36dcd0a..c068dcd414f4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -384,21 +384,30 @@ restart: } goto restart; } + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. + * write. If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies having to + * redo all checks before. * - * We need to serialise against EOF updates that occur in IO - * completions here. We want to make sure that nobody is changing the - * size while we do this check until we have placed an IO barrier (i.e. - * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. - * The spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value - * and hence be able to correctly determine if we need to run zeroing. + * We need to serialise against EOF updates that occur in IO completions + * here. 
We want to make sure that nobody is changing the size while we + * do this check until we have placed an IO barrier (i.e. hold the + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + * + * We can do an unlocked check here safely as IO completion can only + * extend EOF. Truncate is locked out at this point, so the EOF can + * not move backwards, only forwards. Hence we only need to take the + * slow path and spin locks when we are at or beyond the current EOF. */ + if (iocb->ki_pos <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); isize = i_size_read(inode); if (iocb->ki_pos > isize) { @@ -426,7 +435,7 @@ restart: drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -435,6 +444,7 @@ restart: } else spin_unlock(&ip->i_flags_lock); +out: return file_modified(file); } @@ -500,7 +510,17 @@ xfs_dio_write_end_io( * other IO completions here to update the EOF. Failing to serialise * here can result in EOF moving backwards and Bad Things Happen when * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. 
*/ + if (offset + size <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); From 1ad2cfe0a57031505df682dc1e26922d9d43737f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:57 -0700 Subject: [PATCH 047/102] xfs: move the quotaoff dqrele inode walk into xfs_icache.c The only external caller of xfs_inode_walk* happens in quotaoff, when we want to walk all the incore inodes to detach the dquots. Move this code to xfs_icache.c so that we can hide xfs_inode_walk as the starting step in more cleanups of inode walks. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 65 +++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 8 +++-- fs/xfs/xfs_qm.h | 1 - fs/xfs/xfs_qm_syscalls.c | 54 ++------------------------------- 4 files changed, 71 insertions(+), 57 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3c81daca0e9a..e2edbcf7a528 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,6 +26,18 @@ #include +/* + * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide + * with XFS_EOF_FLAGS_*. + */ +#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) +#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) +#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) + +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ + XFS_ICWALK_FLAG_DROP_GDQUOT | \ + XFS_ICWALK_FLAG_DROP_PDQUOT) + /* * Allocate and initialise an xfs_inode. */ @@ -890,7 +902,7 @@ xfs_inode_walk_get_perag( * Call the @execute function on all incore inodes matching the radix tree * @tag. */ -int +static int xfs_inode_walk( struct xfs_mount *mp, int iter_flags, @@ -915,8 +927,59 @@ xfs_inode_walk( } } return last_error; + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); } +#ifdef CONFIG_XFS_QUOTA +/* Drop this inode's dquots. 
*/ +static int +xfs_dqrele_inode( + struct xfs_inode *ip, + void *priv) +{ + struct xfs_eofblocks *eofb = priv; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Detach all dquots from incore inodes if we can. The caller must already + * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will + * not get reattached. + */ +int +xfs_dqrele_all_inodes( + struct xfs_mount *mp, + unsigned int qflags) +{ + struct xfs_eofblocks eofb = { .eof_flags = 0 }; + + if (qflags & XFS_UQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; + if (qflags & XFS_GQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; + if (qflags & XFS_PQUOTA_ACCT) + eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; + + return xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + &eofb, XFS_ICI_NO_TAG); +} +#endif /* CONFIG_XFS_QUOTA */ + /* * Grab the inode for reclaim exclusively. 
* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d1fddb152420..d9baa6df1121 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,9 +68,11 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); -int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); +#ifdef CONFIG_XFS_QUOTA +int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); +#else +# define xfs_dqrele_all_inodes(mp, qflags) (0) +#endif int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e3dabab44097..ebbb484c49dc 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -142,7 +142,6 @@ extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* dquot stuff */ extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); -extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 11f1e2fbf22f..13a56e1ea15c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -201,7 +201,8 @@ xfs_qm_scall_quotaoff( * depend on the quota inodes (and other things) being valid as long as * we keep the lock(s). */ - xfs_qm_dqrele_all_inodes(mp, flags); + error = xfs_dqrele_all_inodes(mp, flags); + ASSERT(!error); /* * Next we make the changes in the quota flag in the mount struct. 
@@ -747,54 +748,3 @@ xfs_qm_scall_getquota_next( xfs_qm_dqput(dqp); return error; } - -STATIC int -xfs_dqrele_inode( - struct xfs_inode *ip, - void *args) -{ - uint *flags = args; - - /* skip quota inodes */ - if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || - ip == ip->i_mount->m_quotainfo->qi_gquotaip || - ip == ip->i_mount->m_quotainfo->qi_pquotaip) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_pdquot == NULL); - return 0; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - - -/* - * Go thru all the inodes in the file system, releasing their dquots. - * - * Note that the mount structure gets modified to indicate that quotas are off - * AFTER this, in the case of quotaoff. - */ -void -xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) -{ - ASSERT(mp->m_quotainfo); - xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &flags, XFS_ICI_NO_TAG); -} From 3ea06d73e3c02ee2952a62bf92abc18f9c98aba1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:57 -0700 Subject: [PATCH 048/102] xfs: detach inode dquots at the end of inactivation Once we're done with inactivating an inode, we're finished updating metadata for that inode. This means that we can detach the dquots at the end and not have to wait for reclaim to do it for us. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e2edbcf7a528..dfa0ec7d02b8 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1093,7 +1093,7 @@ reclaim: * unlocked after the lookup before we go ahead and free it. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_qm_dqdetach(ip); + ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_clean(ip)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e4c2da4566f1..51972549e73c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1716,7 +1716,7 @@ xfs_inactive( */ if (VFS_I(ip)->i_mode == 0) { ASSERT(ip->i_df.if_broot_bytes == 0); - return; + goto out; } mp = ip->i_mount; @@ -1724,11 +1724,11 @@ xfs_inactive( /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) - return; + goto out; /* Metadata inodes require explicit resource cleanup. */ if (xfs_is_metadata_inode(ip)) - return; + goto out; /* Try to clean out the cow blocks if there are any. */ if (xfs_inode_has_cow_data(ip)) @@ -1747,7 +1747,7 @@ xfs_inactive( if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - return; + goto out; } if (S_ISREG(VFS_I(ip)->i_mode) && @@ -1757,14 +1757,14 @@ xfs_inactive( error = xfs_qm_dqattach(ip); if (error) - return; + goto out; if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) error = xfs_inactive_truncate(ip); if (error) - return; + goto out; /* * If there are attributes associated with the file then blow them away @@ -1774,7 +1774,7 @@ xfs_inactive( if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) - return; + goto out; } ASSERT(!ip->i_afp); @@ -1783,12 +1783,12 @@ xfs_inactive( /* * Free the inode. 
*/ - error = xfs_inactive_ifree(ip); - if (error) - return; + xfs_inactive_ifree(ip); +out: /* - * Release the dquots held by inode, if any. + * We're done making metadata updates for this inode, so we can release + * the attached dquots. */ xfs_qm_dqdetach(ip); } From df60019739d8850b865d313053d30aa93dc38a65 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 1 Jun 2021 13:29:41 -0700 Subject: [PATCH 049/102] xfs: move the inode walk functions further down Move the inode walk functions further down in the file to limit the forward declarations to the two walk functions as we add new code that uses the inode walks. We'll clean them out later (i.e. after the deferred inode inactivation series). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 401 +++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 195 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index dfa0ec7d02b8..55c55e449cab 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,6 +26,13 @@ #include +static int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); +static int xfs_inode_walk_ag(struct xfs_perag *pag, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); + /* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide * with XFS_EOF_FLAGS_*. @@ -732,204 +739,12 @@ xfs_icache_inode_is_allocated( * radix tree lookups to a minimum. The batch size is a trade off between * lookup reduction and stack usage. This is in the reclaim path, so we can't * be too greedy. + * + * XXX: This will be moved closer to xfs_inode_walk* once we get rid of the + * separate reclaim walk functions. */ #define XFS_LOOKUP_BATCH 32 -/* - * Decide if the given @ip is eligible to be a part of the inode walk, and - * grab it if so. 
Returns true if it's ready to go or false if we should just - * ignore it. - */ -STATIC bool -xfs_inode_walk_ag_grab( - struct xfs_inode *ip, - int flags) -{ - struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); - - ASSERT(rcu_read_lock_held()); - - /* Check for stale RCU freed inode */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || - __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return false; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - return false; - - /* inode is valid */ - return true; - -out_unlock_noent: - spin_unlock(&ip->i_flags_lock); - return false; -} - -/* - * For a given per-AG structure @pag, grab, @execute, and rele all incore - * inodes with the given radix tree @tag. - */ -STATIC int -xfs_inode_walk_ag( - struct xfs_perag *pag, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) -{ - struct xfs_mount *mp = pag->pag_mount; - uint32_t first_index; - int last_error = 0; - int skipped; - bool done; - int nr_found; - -restart: - done = false; - skipped = 0; - first_index = 0; - nr_found = 0; - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int error = 0; - int i; - - rcu_read_lock(); - - if (tag == XFS_ICI_NO_TAG) - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - else - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **) batch, first_index, - XFS_LOOKUP_BATCH, tag); - - if (!nr_found) { - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. 
- */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = true; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && - xfs_iflags_test(batch[i], XFS_INEW)) - xfs_inew_wait(batch[i]); - error = execute(batch[i], args); - xfs_irele(batch[i]); - if (error == -EAGAIN) { - skipped++; - continue; - } - if (error && last_error != -EFSCORRUPTED) - last_error = error; - } - - /* bail out if the filesystem is corrupted. */ - if (error == -EFSCORRUPTED) - break; - - cond_resched(); - - } while (nr_found && !done); - - if (skipped) { - delay(1); - goto restart; - } - return last_error; -} - -/* Fetch the next (possibly tagged) per-AG structure. */ -static inline struct xfs_perag * -xfs_inode_walk_get_perag( - struct xfs_mount *mp, - xfs_agnumber_t agno, - int tag) -{ - if (tag == XFS_ICI_NO_TAG) - return xfs_perag_get(mp, agno); - return xfs_perag_get_tag(mp, agno, tag); -} - -/* - * Call the @execute function on all incore inodes matching the radix tree - * @tag. 
- */ -static int -xfs_inode_walk( - struct xfs_mount *mp, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; - BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); -} - #ifdef CONFIG_XFS_QUOTA /* Drop this inode's dquots. */ static int @@ -1649,6 +1464,48 @@ xfs_blockgc_start( xfs_blockgc_queue(pag); } +/* + * Decide if the given @ip is eligible to be a part of the inode walk, and + * grab it if so. Returns true if it's ready to go or false if we should just + * ignore it. + */ +static bool +xfs_inode_walk_ag_grab( + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); + + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return false; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return false; + + /* inode is valid */ + return true; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return false; +} + /* Scan one incore inode for block preallocations that we can remove. 
*/ static int xfs_blockgc_scan_inode( @@ -1769,3 +1626,157 @@ xfs_blockgc_free_quota( xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); } + +/* XFS Inode Cache Walking Code */ + +/* + * For a given per-AG structure @pag, grab, @execute, and rele all incore + * inodes with the given radix tree @tag. + */ +static int +xfs_inode_walk_ag( + struct xfs_perag *pag, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index; + int last_error = 0; + int skipped; + bool done; + int nr_found; + +restart: + done = false; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == XFS_ICI_NO_TAG) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. 
+ */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); + error = execute(batch[i], args); + xfs_irele(batch[i]); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == -EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_inode_walk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int tag) +{ + if (tag == XFS_ICI_NO_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* + * Call the @execute function on all incore inodes matching the radix tree + * @tag. + */ +static int +xfs_inode_walk( + struct xfs_mount *mp, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t agno = 0; + + while ((pag = xfs_inode_walk_get_perag(mp, agno, tag))) { + agno = pag->pag_agno + 1; + error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); +} From c1115c0cba2b82e71ec77e794c684ac87160fcf6 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 1 Jun 2021 22:41:25 -0700 Subject: [PATCH 050/102] xfs: rename xfs_inode_walk functions to xfs_icwalk Shorten the prefix so that all the incore inode cache walk code has "xfs_icwalk" in the name somewhere. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 55c55e449cab..d5ecd4cd3ef5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,10 +26,10 @@ #include -static int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, +static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, int tag); -static int xfs_inode_walk_ag(struct xfs_perag *pag, int iter_flags, +static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, int tag); @@ -740,7 +740,7 @@ xfs_icache_inode_is_allocated( * lookup reduction and stack usage. This is in the reclaim path, so we can't * be too greedy. * - * XXX: This will be moved closer to xfs_inode_walk* once we get rid of the + * XXX: This will be moved closer to xfs_icwalk* once we get rid of the * separate reclaim walk functions. 
*/ #define XFS_LOOKUP_BATCH 32 @@ -790,7 +790,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - return xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, &eofb, XFS_ICI_NO_TAG); } #endif /* CONFIG_XFS_QUOTA */ @@ -1538,7 +1538,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, XFS_ICI_BLOCKGC_TAG); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", @@ -1557,7 +1557,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); - return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, + return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, XFS_ICI_BLOCKGC_TAG); } @@ -1634,7 +1634,7 @@ xfs_blockgc_free_quota( * inodes with the given radix tree @tag. */ static int -xfs_inode_walk_ag( +xfs_icwalk_ag( struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), @@ -1740,7 +1740,7 @@ restart: /* Fetch the next (possibly tagged) per-AG structure. */ static inline struct xfs_perag * -xfs_inode_walk_get_perag( +xfs_icwalk_get_perag( struct xfs_mount *mp, xfs_agnumber_t agno, int tag) @@ -1755,7 +1755,7 @@ xfs_inode_walk_get_perag( * @tag. 
*/ static int -xfs_inode_walk( +xfs_icwalk( struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), @@ -1767,9 +1767,9 @@ xfs_inode_walk( int last_error = 0; xfs_agnumber_t agno = 0; - while ((pag = xfs_inode_walk_get_perag(mp, agno, tag))) { + while ((pag = xfs_icwalk_get_perag(mp, agno, tag))) { agno = pag->pag_agno + 1; - error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + error = xfs_icwalk_ag(pag, iter_flags, execute, args, tag); xfs_perag_put(pag); if (error) { last_error = error; From c809d7e948a131cba8fdf9fbd0b50e1f59255f50 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 1 Jun 2021 13:49:52 -0700 Subject: [PATCH 051/102] xfs: pass the goal of the incore inode walk to xfs_inode_walk() As part of removing the indirect calls and radix tag implementation details from the incore inode walk loop, create an enum to represent the goal of the inode iteration. More immediately, this separation removes the need for the "XFS_ICI_NO_TAG" define which makes little sense. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 55 +++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_icache.h | 9 -------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d5ecd4cd3ef5..c6d956406033 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -26,12 +26,40 @@ #include +/* Radix tree tags for incore inode tree. */ + +/* inode is to be reclaimed */ +#define XFS_ICI_RECLAIM_TAG 0 +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1 + +/* + * The goal for walking incore inodes. These can correspond with incore inode + * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. + */ +enum xfs_icwalk_goal { + /* Goals that are not related to tags; these must be < 0. */ + XFS_ICWALK_DQRELE = -1, + + /* Goals directly associated with tagged inodes. 
*/ + XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, +}; + +#define XFS_ICWALK_NULL_TAG (-1U) + +/* Compute the inode radix tree tag for this goal. */ +static inline unsigned int +xfs_icwalk_tag(enum xfs_icwalk_goal goal) +{ + return goal < 0 ? XFS_ICWALK_NULL_TAG : goal; +} + static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); + void *args, enum xfs_icwalk_goal goal); static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); + void *args, enum xfs_icwalk_goal goal); /* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide @@ -791,7 +819,7 @@ xfs_dqrele_all_inodes( eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &eofb, XFS_ICI_NO_TAG); + &eofb, XFS_ICWALK_DQRELE); } #endif /* CONFIG_XFS_QUOTA */ @@ -1539,7 +1567,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, - XFS_ICI_BLOCKGC_TAG); + XFS_ICWALK_BLOCKGC); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); @@ -1558,7 +1586,7 @@ xfs_blockgc_free_space( trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, - XFS_ICI_BLOCKGC_TAG); + XFS_ICWALK_BLOCKGC); } /* @@ -1639,7 +1667,7 @@ xfs_icwalk_ag( int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag) + enum xfs_icwalk_goal goal) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1655,12 +1683,13 @@ restart: nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + unsigned int tag = xfs_icwalk_tag(goal); int error = 0; int i; rcu_read_lock(); - if (tag == XFS_ICI_NO_TAG) + if (tag == XFS_ICWALK_NULL_TAG) nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, 
XFS_LOOKUP_BATCH); @@ -1743,9 +1772,11 @@ static inline struct xfs_perag * xfs_icwalk_get_perag( struct xfs_mount *mp, xfs_agnumber_t agno, - int tag) + enum xfs_icwalk_goal goal) { - if (tag == XFS_ICI_NO_TAG) + unsigned int tag = xfs_icwalk_tag(goal); + + if (tag == XFS_ICWALK_NULL_TAG) return xfs_perag_get(mp, agno); return xfs_perag_get_tag(mp, agno, tag); } @@ -1760,16 +1791,16 @@ xfs_icwalk( int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag) + enum xfs_icwalk_goal goal) { struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t agno = 0; - while ((pag = xfs_icwalk_get_perag(mp, agno, tag))) { + while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, iter_flags, execute, args, tag); + error = xfs_icwalk_ag(pag, iter_flags, execute, args, goal); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d9baa6df1121..c4274c45d914 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -17,15 +17,6 @@ struct xfs_eofblocks { __u64 eof_min_file_size; }; -/* - * tags for inode radix tree - */ -#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_walk */ -#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -/* Inode has speculative preallocations (posteof or cow) to clean. */ -#define XFS_ICI_BLOCKGC_TAG 1 - /* * Flags for xfs_iget() */ From b9baaef42f764db7089a19c82d2b783aef836437 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:58 -0700 Subject: [PATCH 052/102] xfs: separate the dqrele_all inode grab logic from xfs_inode_walk_ag_grab Disentangle the dqrele_all inode grab code from the "generic" inode walk grabbing code, and use the opportunity to document why the dqrele grab function does what it does. Since xfs_inode_walk_ag_grab is now only used for blockgc, rename it to reflect that. 
Ultimately, there will be four reasons to perform a walk of incore inodes: quotaoff dquot releasing (dqrele), garbage collection of speculative preallocations (blockgc), reclamation of incore inodes (reclaim), and deferred inactivation (inodegc). Each of these four has its own slightly different criteria for deciding if they want to handle an inode, so it makes more sense to have four cohesive igrab functions than one confusing parametric grab function like we do now. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 71 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c6d956406033..45979791313f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -774,6 +774,44 @@ xfs_icache_inode_is_allocated( #define XFS_LOOKUP_BATCH 32 #ifdef CONFIG_XFS_QUOTA +/* Decide if we want to grab this inode to drop its dquots. */ +static bool +xfs_dqrele_igrab( + struct xfs_inode *ip) +{ + bool ret = false; + + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock; + + /* + * Skip inodes that are anywhere in the reclaim machinery because we + * drop dquots before tagging an inode for reclamation. + */ + if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) + goto out_unlock; + + /* + * The inode looks alive; try to grab a VFS reference so that it won't + * get destroyed. If we got the reference, return true to say that + * we grabbed the inode. + * + * If we can't get the reference, then we know the inode had its VFS + * state torn down and hasn't yet entered the reclaim machinery. Since + * we also know that dquots are detached from an inode before it enters + * reclaim, we can skip the inode. + */ + ret = igrab(VFS_I(ip)) != NULL; + +out_unlock: + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* Drop this inode's dquots. 
*/ static int xfs_dqrele_inode( @@ -821,6 +859,8 @@ xfs_dqrele_all_inodes( return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } +#else +# define xfs_dqrele_igrab(ip) (false) #endif /* CONFIG_XFS_QUOTA */ /* @@ -1493,12 +1533,12 @@ xfs_blockgc_start( } /* - * Decide if the given @ip is eligible to be a part of the inode walk, and - * grab it if so. Returns true if it's ready to go or false if we should just - * ignore it. + * Decide if the given @ip is eligible for garbage collection of speculative + * preallocations, and grab it if so. Returns true if it's ready to go or + * false if we should just ignore it. */ static bool -xfs_inode_walk_ag_grab( +xfs_blockgc_igrab( struct xfs_inode *ip, int flags) { @@ -1657,6 +1697,27 @@ xfs_blockgc_free_quota( /* XFS Inode Cache Walking Code */ +/* + * Decide if we want to grab this inode in anticipation of doing work towards + * the goal. If selected, the VFS must hold a reference to this inode, which + * will be released after processing. + */ +static inline bool +xfs_icwalk_igrab( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + int iter_flags) +{ + switch (goal) { + case XFS_ICWALK_DQRELE: + return xfs_dqrele_igrab(ip); + case XFS_ICWALK_BLOCKGC: + return xfs_blockgc_igrab(ip, iter_flags); + default: + return false; + } +} + /* * For a given per-AG structure @pag, grab, @execute, and rele all incore * inodes with the given radix tree @tag. @@ -1711,7 +1772,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) + if (done || !xfs_icwalk_igrab(goal, ip, iter_flags)) batch[i] = NULL; /* From 9d2793ceecb9fd711f70a860685b71129cac5dc9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:59 -0700 Subject: [PATCH 053/102] xfs: move xfs_inew_wait call into xfs_dqrele_inode Move the INEW wait into xfs_dqrele_inode so that we can drop the iter_flags parameter in the next patch. 
Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 45979791313f..5f52948f9cfa 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -820,6 +820,9 @@ xfs_dqrele_inode( { struct xfs_eofblocks *eofb = priv; + if (xfs_iflags_test(ip, XFS_INEW)) + xfs_inew_wait(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { xfs_qm_dqrele(ip->i_udquot); @@ -856,8 +859,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - return xfs_icwalk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, 0, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } #else # define xfs_dqrele_igrab(ip) (false) From 7fdff52623b4df9c9ae665fe8bb727978c29414e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:31:59 -0700 Subject: [PATCH 054/102] xfs: remove iter_flags parameter from xfs_inode_walk_* The sole iter_flags is XFS_INODE_WALK_INEW_WAIT, and there are no users. Remove the flag, and the parameter, and all the code that used it. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 33 ++++++++++++--------------------- fs/xfs/xfs_icache.h | 5 ----- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5f52948f9cfa..b5ce9580934f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -54,10 +54,10 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) return goal < 0 ? 
XFS_ICWALK_NULL_TAG : goal; } -static int xfs_icwalk(struct xfs_mount *mp, int iter_flags, +static int xfs_icwalk(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal); -static int xfs_icwalk_ag(struct xfs_perag *pag, int iter_flags, +static int xfs_icwalk_ag(struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal); @@ -282,7 +282,7 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } -static void +static inline void xfs_inew_wait( struct xfs_inode *ip) { @@ -859,7 +859,7 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - return xfs_icwalk(mp, 0, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); } #else # define xfs_dqrele_igrab(ip) (false) @@ -1541,11 +1541,9 @@ xfs_blockgc_start( */ static bool xfs_blockgc_igrab( - struct xfs_inode *ip, - int flags) + struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -1555,8 +1553,7 @@ xfs_blockgc_igrab( goto out_unlock_noent; /* avoid new or reclaimable inodes. 
Leave for reclaim code to flush */ - if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || - __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -1608,7 +1605,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_icwalk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_icwalk_ag(pag, xfs_blockgc_scan_inode, NULL, XFS_ICWALK_BLOCKGC); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", @@ -1627,7 +1624,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); - return xfs_icwalk(mp, 0, xfs_blockgc_scan_inode, eofb, + return xfs_icwalk(mp, xfs_blockgc_scan_inode, eofb, XFS_ICWALK_BLOCKGC); } @@ -1707,14 +1704,13 @@ xfs_blockgc_free_quota( static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, - struct xfs_inode *ip, - int iter_flags) + struct xfs_inode *ip) { switch (goal) { case XFS_ICWALK_DQRELE: return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: - return xfs_blockgc_igrab(ip, iter_flags); + return xfs_blockgc_igrab(ip); default: return false; } @@ -1727,7 +1723,6 @@ xfs_icwalk_igrab( static int xfs_icwalk_ag( struct xfs_perag *pag, - int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal) @@ -1774,7 +1769,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || !xfs_icwalk_igrab(goal, ip, iter_flags)) + if (done || !xfs_icwalk_igrab(goal, ip)) batch[i] = NULL; /* @@ -1802,9 +1797,6 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && - xfs_iflags_test(batch[i], XFS_INEW)) - xfs_inew_wait(batch[i]); error = execute(batch[i], args); xfs_irele(batch[i]); if (error == -EAGAIN) { @@ -1851,7 +1843,6 @@ xfs_icwalk_get_perag( static int xfs_icwalk( struct xfs_mount *mp, - int iter_flags, int 
(*execute)(struct xfs_inode *ip, void *args), void *args, enum xfs_icwalk_goal goal) @@ -1863,7 +1854,7 @@ xfs_icwalk( while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, iter_flags, execute, args, goal); + error = xfs_icwalk_ag(pag, execute, args, goal); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index c4274c45d914..3ec00f1fea86 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -25,11 +25,6 @@ struct xfs_eofblocks { #define XFS_IGET_DONTCACHE 0x4 #define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ -/* - * flags for AG inode iterator - */ -#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ - int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); From f427cf5c6236acdf72b4d8564b2e18937c4cc8d8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:32:00 -0700 Subject: [PATCH 055/102] xfs: remove indirect calls from xfs_inode_walk{,_ag} It turns out that there is a 1:1 mapping between the execute and goal parameters that are passed to xfs_inode_walk_ag: xfs_blockgc_scan_inode <=> XFS_ICWALK_BLOCKGC xfs_dqrele_inode <=> XFS_ICWALK_DQRELE Because of this exact correspondence, we don't need the execute function pointer and can replace it with a direct call. For the price of a forward static declaration, we can eliminate the indirect function call. This likely has a negligible impact on performance (since the execute function runs transactions), but it also simplifies the function signature. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b5ce9580934f..5ca5bd2ee5ae 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -55,11 +55,9 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) } static int xfs_icwalk(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, enum xfs_icwalk_goal goal); + enum xfs_icwalk_goal goal, void *args); static int xfs_icwalk_ag(struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, enum xfs_icwalk_goal goal); + enum xfs_icwalk_goal goal, void *args); /* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide @@ -859,10 +857,11 @@ xfs_dqrele_all_inodes( if (qflags & XFS_PQUOTA_ACCT) eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - return xfs_icwalk(mp, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE); + return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &eofb); } #else # define xfs_dqrele_igrab(ip) (false) +# define xfs_dqrele_inode(ip, priv) (0) #endif /* CONFIG_XFS_QUOTA */ /* @@ -1605,8 +1604,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_icwalk_ag(pag, xfs_blockgc_scan_inode, NULL, - XFS_ICWALK_BLOCKGC); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); @@ -1624,8 +1622,7 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); - return xfs_icwalk(mp, xfs_blockgc_scan_inode, eofb, - XFS_ICWALK_BLOCKGC); + return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, eofb); } /* @@ -1716,16 +1713,36 @@ xfs_icwalk_igrab( } } +/* Process an inode and release it. Return -EAGAIN to skip an inode. 
*/ +static inline int +xfs_icwalk_process_inode( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + void *args) +{ + int error; + + switch (goal) { + case XFS_ICWALK_DQRELE: + error = xfs_dqrele_inode(ip, args); + break; + case XFS_ICWALK_BLOCKGC: + error = xfs_blockgc_scan_inode(ip, args); + break; + } + xfs_irele(ip); + return error; +} + /* - * For a given per-AG structure @pag, grab, @execute, and rele all incore - * inodes with the given radix tree @tag. + * For a given per-AG structure @pag and a goal, grab qualifying inodes and + * process them in some manner. */ static int xfs_icwalk_ag( struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - enum xfs_icwalk_goal goal) + enum xfs_icwalk_goal goal, + void *args) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1797,8 +1814,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], args); - xfs_irele(batch[i]); + error = xfs_icwalk_process_inode(goal, batch[i], args); if (error == -EAGAIN) { skipped++; continue; @@ -1836,16 +1852,12 @@ xfs_icwalk_get_perag( return xfs_perag_get_tag(mp, agno, tag); } -/* - * Call the @execute function on all incore inodes matching the radix tree - * @tag. - */ +/* Walk all incore inodes to achieve a given goal. */ static int xfs_icwalk( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - enum xfs_icwalk_goal goal) + enum xfs_icwalk_goal goal, + void *args) { struct xfs_perag *pag; int error = 0; @@ -1854,7 +1866,7 @@ xfs_icwalk( while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, execute, args, goal); + error = xfs_icwalk_ag(pag, goal, args); xfs_perag_put(pag); if (error) { last_error = error; From d20d5edcf941e70e03cdbda2f8df93e3969c31a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 1 Jun 2021 23:01:44 -0700 Subject: [PATCH 056/102] xfs: clean up inode state flag tests in xfs_blockgc_igrab Clean up the definition of which inode states are not eligible for speculative preallocation garbage collecting by creating a private #define. The deferred inactivation patchset will add two new entries to the set of flags-to-ignore, so we want the definition not to end up a cluttered mess. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5ca5bd2ee5ae..94dba5c1b98d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1533,6 +1533,10 @@ xfs_blockgc_start( xfs_blockgc_queue(pag); } +/* Don't try to run block gc on an inode that's in any of these states. */ +#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_IRECLAIMABLE | \ + XFS_IRECLAIM) /* * Decide if the given @ip is eligible for garbage collection of speculative * preallocations, and grab it if so. Returns true if it's ready to go or @@ -1551,8 +1555,7 @@ xfs_blockgc_igrab( if (!ip->i_ino) goto out_unlock_noent; - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); From 594ab00b760f1722b800c45d37adc21eecf42dc1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:32:00 -0700 Subject: [PATCH 057/102] xfs: make the icwalk processing functions clean up the grab state Soon we're going to be adding two new callers to the incore inode walk code: reclaim of incore inodes, and (later) inactivation of inodes. Both states operate on inodes that no longer have any VFS state, so we need to move the xfs_irele calls into the processing functions. 
In other words, icwalk processing functions are responsible for cleaning up whatever state changes are made by the corresponding icwalk igrab function that picked the inode for processing. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 94dba5c1b98d..806faa8df7e9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -811,7 +811,7 @@ out_unlock: } /* Drop this inode's dquots. */ -static int +static void xfs_dqrele_inode( struct xfs_inode *ip, void *priv) @@ -835,7 +835,7 @@ xfs_dqrele_inode( ip->i_pdquot = NULL; } xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; + xfs_irele(ip); } /* @@ -861,7 +861,7 @@ xfs_dqrele_all_inodes( } #else # define xfs_dqrele_igrab(ip) (false) -# define xfs_dqrele_inode(ip, priv) (0) +# define xfs_dqrele_inode(ip, priv) ((void)0) #endif /* CONFIG_XFS_QUOTA */ /* @@ -1592,6 +1592,7 @@ xfs_blockgc_scan_inode( unlock: if (lockflags) xfs_iunlock(ip, lockflags); + xfs_irele(ip); return error; } @@ -1698,8 +1699,7 @@ xfs_blockgc_free_quota( /* * Decide if we want to grab this inode in anticipation of doing work towards - * the goal. If selected, the VFS must hold a reference to this inode, which - * will be released after processing. + * the goal. */ static inline bool xfs_icwalk_igrab( @@ -1716,24 +1716,26 @@ xfs_icwalk_igrab( } } -/* Process an inode and release it. Return -EAGAIN to skip an inode. */ +/* + * Process an inode. Each processing function must handle any state changes + * made by the icwalk igrab function. Return -EAGAIN to skip an inode. 
+ */ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, void *args) { - int error; + int error = 0; switch (goal) { case XFS_ICWALK_DQRELE: - error = xfs_dqrele_inode(ip, args); + xfs_dqrele_inode(ip, args); break; case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, args); break; } - xfs_irele(ip); return error; } From 919a4ddb68413056ecb7c71d9d5465bb54c8032b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:32:01 -0700 Subject: [PATCH 058/102] xfs: fix radix tree tag signs Radix tree tags are supposed to be unsigned ints, so fix the callers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 2 +- fs/xfs/libxfs/xfs_sb.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dfbbcbd448c1..300d0a1a8049 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -61,7 +61,7 @@ struct xfs_perag * xfs_perag_get_tag( struct xfs_mount *mp, xfs_agnumber_t first, - int tag) + unsigned int tag) { struct xfs_perag *pag; int found; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index f79f9dc632b6..e5f1c2d879eb 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -17,8 +17,8 @@ struct xfs_perag; * perag get/put wrappers for ref counting */ extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); -extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int tag); extern void xfs_perag_put(struct xfs_perag *pag); extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); From 9d5ee837595134f91bb2d66f571f498c3b8ab148 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 31 May 2021 11:32:01 -0700 Subject: [PATCH 059/102] xfs: pass struct xfs_eofblocks to the inode scan callback Pass a pointer to the actual eofb structure around the inode scanner functions instead of a void pointer, now that none of the functions is used as a callback. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 806faa8df7e9..0c40c39a5f9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -55,9 +55,9 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) } static int xfs_icwalk(struct xfs_mount *mp, - enum xfs_icwalk_goal goal, void *args); + enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); static int xfs_icwalk_ag(struct xfs_perag *pag, - enum xfs_icwalk_goal goal, void *args); + enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); /* * Private inode cache walk flags for struct xfs_eofblocks. 
Must not coincide @@ -814,10 +814,8 @@ out_unlock: static void xfs_dqrele_inode( struct xfs_inode *ip, - void *priv) + struct xfs_eofblocks *eofb) { - struct xfs_eofblocks *eofb = priv; - if (xfs_iflags_test(ip, XFS_INEW)) xfs_inew_wait(ip); @@ -1232,10 +1230,9 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args, + struct xfs_eofblocks *eofb, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait; wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); @@ -1439,10 +1436,9 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args, + struct xfs_eofblocks *eofb, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait; int ret = 0; @@ -1579,16 +1575,16 @@ out_unlock_noent: static int xfs_blockgc_scan_inode( struct xfs_inode *ip, - void *args) + struct xfs_eofblocks *eofb) { unsigned int lockflags = 0; int error; - error = xfs_inode_free_eofblocks(ip, args, &lockflags); + error = xfs_inode_free_eofblocks(ip, eofb, &lockflags); if (error) goto unlock; - error = xfs_inode_free_cowblocks(ip, args, &lockflags); + error = xfs_inode_free_cowblocks(ip, eofb, &lockflags); unlock: if (lockflags) xfs_iunlock(ip, lockflags); @@ -1724,16 +1720,16 @@ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, - void *args) + struct xfs_eofblocks *eofb) { int error = 0; switch (goal) { case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, args); + xfs_dqrele_inode(ip, eofb); break; case XFS_ICWALK_BLOCKGC: - error = xfs_blockgc_scan_inode(ip, args); + error = xfs_blockgc_scan_inode(ip, eofb); break; } return error; @@ -1747,7 +1743,7 @@ static int xfs_icwalk_ag( struct xfs_perag *pag, enum xfs_icwalk_goal goal, - void *args) + struct xfs_eofblocks *eofb) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1819,7 +1815,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = 
xfs_icwalk_process_inode(goal, batch[i], args); + error = xfs_icwalk_process_inode(goal, batch[i], eofb); if (error == -EAGAIN) { skipped++; continue; @@ -1862,7 +1858,7 @@ static int xfs_icwalk( struct xfs_mount *mp, enum xfs_icwalk_goal goal, - void *args) + struct xfs_eofblocks *eofb) { struct xfs_perag *pag; int error = 0; @@ -1871,7 +1867,7 @@ xfs_icwalk( while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, goal, args); + error = xfs_icwalk_ag(pag, goal, eofb); xfs_perag_put(pag); if (error) { last_error = error; From f1bc5c5630f90b83b339e8970dcf6d03abba5bd5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 31 May 2021 11:32:02 -0700 Subject: [PATCH 060/102] xfs: merge xfs_reclaim_inodes_ag into xfs_inode_walk_ag Merge these two inode walk loops together, since they're pretty similar now. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 162 +++++++++++++------------------------------- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 5 +- 3 files changed, 53 insertions(+), 115 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0c40c39a5f9f..1223921fb01c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -43,6 +43,7 @@ enum xfs_icwalk_goal { /* Goals directly associated with tagged inodes. */ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, + XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, }; #define XFS_ICWALK_NULL_TAG (-1U) @@ -67,9 +68,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) #define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) +/* Stop scanning after icw_scan_limit inodes. */ +#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) + #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ - XFS_ICWALK_FLAG_DROP_PDQUOT) + XFS_ICWALK_FLAG_DROP_PDQUOT | \ + XFS_ICWALK_FLAG_SCAN_LIMIT) /* * Allocate and initialise an xfs_inode. 
@@ -760,17 +765,6 @@ xfs_icache_inode_is_allocated( return 0; } -/* - * The inode lookup is done in batches to keep the amount of lock traffic and - * radix tree lookups to a minimum. The batch size is a trade off between - * lookup reduction and stack usage. This is in the reclaim path, so we can't - * be too greedy. - * - * XXX: This will be moved closer to xfs_icwalk* once we get rid of the - * separate reclaim walk functions. - */ -#define XFS_LOOKUP_BATCH 32 - #ifdef CONFIG_XFS_QUOTA /* Decide if we want to grab this inode to drop its dquots. */ static bool @@ -880,7 +874,7 @@ xfs_dqrele_all_inodes( * Return true if we grabbed it, false otherwise. */ static bool -xfs_reclaim_inode_grab( +xfs_reclaim_igrab( struct xfs_inode *ip) { ASSERT(rcu_read_lock_held()); @@ -990,108 +984,13 @@ out: xfs_iflags_clear(ip, XFS_IRECLAIM); } -/* - * Walk the AGs and reclaim the inodes in them. Even if the filesystem is - * corrupted, we still want to try to reclaim all the inodes. If we don't, - * then a shut down during filesystem unmount reclaim walk leak all the - * unreclaimed inodes. - * - * Returns non-zero if any AGs or inodes were skipped in the reclaim pass - * so that callers that want to block until all dirty inodes are written back - * and reclaimed can sanely loop. - */ -static void -xfs_reclaim_inodes_ag( - struct xfs_mount *mp, - int *nr_to_scan) -{ - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; - - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - unsigned long first_index = 0; - int done = 0; - int nr_found = 0; - - ag = pag->pag_agno + 1; - - first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH, - XFS_ICI_RECLAIM_TAG); - if (!nr_found) { - done = 1; - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. 
if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_reclaim_inode_grab(ip)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can - * occur if we have inodes in the last block of - * the AG and we are currently pointing to the - * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (batch[i]) - xfs_reclaim_inode(batch[i], pag); - } - - *nr_to_scan -= XFS_LOOKUP_BATCH; - cond_resched(); - } while (nr_found && !done && *nr_to_scan > 0); - - if (done) - first_index = 0; - WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); - xfs_perag_put(pag); - } -} - void xfs_reclaim_inodes( struct xfs_mount *mp) { - int nr_to_scan = INT_MAX; - while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); } } @@ -1107,11 +1006,16 @@ xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) { + struct xfs_eofblocks eofb = { + .eof_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, + .icw_scan_limit = nr_to_scan, + }; + /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); return 0; } @@ -1221,9 +1125,8 @@ xfs_reclaim_worker( { struct 
xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_reclaim_work); - int nr_to_scan = INT_MAX; - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); xfs_reclaim_work_queue(mp); } @@ -1693,6 +1596,15 @@ xfs_blockgc_free_quota( /* XFS Inode Cache Walking Code */ +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + + /* * Decide if we want to grab this inode in anticipation of doing work towards * the goal. @@ -1707,6 +1619,8 @@ xfs_icwalk_igrab( return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); + case XFS_ICWALK_RECLAIM: + return xfs_reclaim_igrab(ip); default: return false; } @@ -1720,6 +1634,7 @@ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, + struct xfs_perag *pag, struct xfs_eofblocks *eofb) { int error = 0; @@ -1731,6 +1646,9 @@ xfs_icwalk_process_inode( case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, eofb); break; + case XFS_ICWALK_RECLAIM: + xfs_reclaim_inode(ip, pag); + break; } return error; } @@ -1755,7 +1673,10 @@ xfs_icwalk_ag( restart: done = false; skipped = 0; - first_index = 0; + if (goal == XFS_ICWALK_RECLAIM) + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); + else + first_index = 0; nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; @@ -1776,6 +1697,7 @@ restart: XFS_LOOKUP_BATCH, tag); if (!nr_found) { + done = true; rcu_read_unlock(); break; } @@ -1815,7 +1737,8 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = xfs_icwalk_process_inode(goal, batch[i], eofb); + error = xfs_icwalk_process_inode(goal, batch[i], pag, + eofb); if (error == -EAGAIN) { skipped++; continue; @@ -1830,8 +1753,19 @@ restart: cond_resched(); + if (eofb 
&& (eofb->eof_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { + eofb->icw_scan_limit -= XFS_LOOKUP_BATCH; + if (eofb->icw_scan_limit <= 0) + break; + } } while (nr_found && !done); + if (goal == XFS_ICWALK_RECLAIM) { + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); + } + if (skipped) { delay(1); goto restart; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3ec00f1fea86..b6ab1067c52b 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -15,6 +15,7 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; + int icw_scan_limit; }; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 808ae337b222..1377b1e24e1d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3898,6 +3898,7 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) + __field(int, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -3909,15 +3910,17 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, eofb->eof_gid) : 0; __entry->prid = eofb ? eofb->eof_prid : 0; __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->scan_limit = eofb ? eofb->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid, __entry->gid, __entry->prid, __entry->min_file_size, + __entry->scan_limit, (char *)__entry->caller_ip) ); #define DEFINE_EOFBLOCKS_EVENT(name) \ From c076ae7a9361b87624900c722012a837fee0b1b3 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 31 May 2021 11:32:02 -0700 Subject: [PATCH 061/102] xfs: refactor per-AG inode tagging functions In preparation for adding another incore inode tree tag, refactor the code that sets and clears tags from the per-AG inode tree and the tree of per-AG structures, and remove the open-coded versions used by the blockgc code. Note: For reclaim, we now rely on the radix tree tags instead of the reclaimable inode count more heavily than we used to. The conversion should be fine, but the logic isn't 100% identical. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 182 +++++++++++++++++++++----------------------- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trace.h | 6 +- 4 files changed, 92 insertions(+), 100 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1223921fb01c..396cc54ca03f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -207,54 +207,102 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } -static void -xfs_perag_set_reclaim_tag( +/* + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). + */ +static inline void +xfs_blockgc_queue( struct xfs_perag *pag) +{ + rcu_read_lock(); + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_gc_workqueue, + &pag->pag_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); + rcu_read_unlock(); +} + +/* Set a tag on both the AG incore inode tree and the AG radix tree. 
*/ +static void +xfs_perag_set_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) +{ + struct xfs_mount *mp = pag->pag_mount; + bool was_tagged; + + lockdep_assert_held(&pag->pag_ici_lock); + + was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + radix_tree_tag_set(&pag->pag_ici_root, agino, tag); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable++; + + if (was_tagged) + return; + + /* propagate the tag up into the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); + spin_unlock(&mp->m_perag_lock); + + /* start background work */ + switch (tag) { + case XFS_ICI_RECLAIM_TAG: + xfs_reclaim_work_queue(mp); + break; + case XFS_ICI_BLOCKGC_TAG: + xfs_blockgc_queue(pag); + break; + } + + trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); +} + +/* Clear a tag on both the AG incore inode tree and the AG radix tree. */ +static void +xfs_perag_clear_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) { struct xfs_mount *mp = pag->pag_mount; lockdep_assert_held(&pag->pag_ici_lock); - if (pag->pag_ici_reclaimable++) + + /* + * Reclaim can signal (with a null agino) that it cleared its own tag + * by removing the inode from the radix tree. 
+ */ + if (agino != NULLAGINO) + radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); + else + ASSERT(tag == XFS_ICI_RECLAIM_TAG); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable--; + + if (radix_tree_tagged(&pag->pag_ici_root, tag)) return; - /* propagate the reclaim tag up into the perag radix tree */ + /* clear the tag from the perag radix tree */ spin_lock(&mp->m_perag_lock); - radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(mp); - - trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } -static void -xfs_perag_clear_reclaim_tag( - struct xfs_perag *pag) -{ - struct xfs_mount *mp = pag->pag_mount; - - lockdep_assert_held(&pag->pag_ici_lock); - if (--pag->pag_ici_reclaimable) - return; - - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&mp->m_perag_lock); - radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); - spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); -} - - /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag * can go away. 
*/ void -xfs_inode_set_reclaim_tag( +xfs_inode_mark_reclaimable( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; @@ -264,9 +312,8 @@ xfs_inode_set_reclaim_tag( spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_set_reclaim_tag(pag); + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); @@ -274,17 +321,6 @@ xfs_inode_set_reclaim_tag( xfs_perag_put(pag); } -STATIC void -xfs_inode_clear_reclaim_tag( - struct xfs_perag *pag, - xfs_ino_t ino) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(pag->pag_mount, ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_clear_reclaim_tag(pag); -} - static inline void xfs_inew_wait( struct xfs_inode *ip) @@ -483,7 +519,9 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - xfs_inode_clear_reclaim_tag(pag, ip->i_ino); + xfs_perag_clear_inode_tag(pag, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); inode->i_state = I_NEW; ip->i_sick = 0; ip->i_checked = 0; @@ -957,7 +995,7 @@ reclaim: if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - xfs_perag_clear_reclaim_tag(pag); + xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); spin_unlock(&pag->pag_ici_lock); /* @@ -1173,22 +1211,6 @@ xfs_inode_free_eofblocks( return 0; } -/* - * Background scanning to trim preallocated space. This is queued based on the - * 'speculative_prealloc_lifetime' tunable (5m by default). 
- */ -static inline void -xfs_blockgc_queue( - struct xfs_perag *pag) -{ - rcu_read_lock(); - if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, - &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); - rcu_read_unlock(); -} - static void xfs_blockgc_set_iflag( struct xfs_inode *ip, @@ -1196,7 +1218,6 @@ xfs_blockgc_set_iflag( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; - int tagged; ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); @@ -1213,24 +1234,8 @@ xfs_blockgc_set_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!tagged) { - /* propagate the blockgc tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* kick off background trimming */ - xfs_blockgc_queue(pag); - - trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); @@ -1266,19 +1271,8 @@ xfs_blockgc_clear_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { - /* clear the blockgc tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); 
- trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b6ab1067c52b..191620a069af 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -39,7 +39,7 @@ void xfs_reclaim_inodes(struct xfs_mount *mp); int xfs_reclaim_inodes_count(struct xfs_mount *mp); long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_mark_reclaimable(struct xfs_inode *ip); int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a2dab05332ac..db61e9cdc013 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -667,7 +667,7 @@ xfs_fs_destroy_inode( * reclaim path handles this more efficiently than we can here, so * simply let background reclaim tear down all inodes. 
*/ - xfs_inode_set_reclaim_tag(ip); + xfs_inode_mark_reclaimable(ip); } static void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 1377b1e24e1d..0171d93239a2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -153,10 +153,8 @@ DEFINE_EVENT(xfs_perag_class, name, \ DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), From 54cd3aa6f8102f4648190fc93eb5dd8603de9b52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Jun 2021 11:49:50 +1000 Subject: [PATCH 062/102] xfs: remove ->b_offset handling for page backed buffers ->b_offset can only be non-zero for _XBF_KMEM backed buffers, so remove all code dealing with it for page backed buffers. Signed-off-by: Christoph Hellwig [dgc: modified to fit this patchset] Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 8 +++----- fs/xfs/xfs_buf.h | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d02edb683cfd..f901a74d6176 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -79,7 +79,7 @@ static inline int xfs_buf_vmap_len( struct xfs_buf *bp) { - return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; + return (bp->b_page_count * PAGE_SIZE); } /* @@ -281,7 +281,7 @@ xfs_buf_free_pages( ASSERT(bp->b_flags & _XBF_PAGES); if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + vm_unmap_ram(bp->b_addr, bp->b_page_count); for (i = 0; i < bp->b_page_count; i++) { if (bp->b_pages[i]) @@ -442,7 +442,7 @@ _xfs_buf_map_pages( ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_addr = page_address(bp->b_pages[0]); } else if (flags & XBF_UNMAPPED) { bp->b_addr = NULL; } else { @@ -469,7 +469,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; - bp->b_addr += bp->b_offset; } return 0; @@ -1680,7 +1679,6 @@ xfs_buf_offset( if (bp->b_addr) return bp->b_addr + offset; - offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; return page_address(page) + (offset & (PAGE_SIZE-1)); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 459ca34f26f5..464dc548fa23 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -167,7 +167,8 @@ struct xfs_buf { atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset in first page */ + unsigned int b_offset; /* page offset of b_addr, + only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ /* From 934d1076bb2c5bbb3d5b0e3892b208d1f537949d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Jun 2021 11:50:00 +1000 Subject: [PATCH 063/102] xfs: 
simplify the b_page_count calculation Ever since we stopped using the Linux page cache to back XFS buffers there is no need to take the start sector into account for calculating the number of pages in a buffer, as the data always start from the beginning of the buffer. Signed-off-by: Christoph Hellwig [dgc: modified to suit this series] Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f901a74d6176..5c6a7140d28d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -348,14 +348,13 @@ xfs_buf_alloc_kmem( static int xfs_buf_alloc_pages( struct xfs_buf *bp, - uint page_count, xfs_buf_flags_t flags) { gfp_t gfp_mask = xb_to_gfp(flags); long filled = 0; /* Make sure that we have a page list */ - bp->b_page_count = page_count; + bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); if (bp->b_page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { @@ -409,7 +408,6 @@ xfs_buf_allocate_memory( uint flags) { size_t size; - xfs_off_t start, end; int error; /* @@ -424,11 +422,7 @@ xfs_buf_allocate_memory( if (!error) return 0; } - - start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) - >> PAGE_SHIFT; - return xfs_buf_alloc_pages(bp, end - start, flags); + return xfs_buf_alloc_pages(bp, flags); } /* @@ -922,7 +916,6 @@ xfs_buf_get_uncached( int flags, struct xfs_buf **bpp) { - unsigned long page_count; int error; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); @@ -934,8 +927,7 @@ xfs_buf_get_uncached( if (error) return error; - page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = xfs_buf_alloc_pages(bp, page_count, flags); + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; From 289ae7b48c2c4d9bec515e720c01146498109dee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 
Jun 2021 11:50:17 +1000 Subject: [PATCH 064/102] xfs: get rid of xb_to_gfp() Only used in one place, so just open code the logic in the macro. Based on a patch from Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5c6a7140d28d..0b0d66d31515 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -22,9 +22,6 @@ static kmem_zone_t *xfs_buf_zone; -#define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) - /* * Locking orders * @@ -350,9 +347,14 @@ xfs_buf_alloc_pages( struct xfs_buf *bp, xfs_buf_flags_t flags) { - gfp_t gfp_mask = xb_to_gfp(flags); + gfp_t gfp_mask = __GFP_NOWARN; long filled = 0; + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + else + gfp_mask |= GFP_NOFS; + /* Make sure that we have a page list */ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); if (bp->b_page_count <= XB_PAGES) { From 170041f71596dad3f34dea40ee0ef0c848d3f906 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Jun 2021 11:50:47 +1000 Subject: [PATCH 065/102] xfs: cleanup error handling in xfs_buf_get_map Use a single goto label for freeing the buffer and returning an error. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0b0d66d31515..7dea73535959 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -689,16 +689,12 @@ xfs_buf_get_map( return error; error = xfs_buf_allocate_memory(new_bp, flags); - if (error) { - xfs_buf_free(new_bp); - return error; - } + if (error) + goto out_free_buf; error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); - if (error) { - xfs_buf_free(new_bp); - return error; - } + if (error) + goto out_free_buf; if (bp != new_bp) xfs_buf_free(new_bp); @@ -726,6 +722,9 @@ found: trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; +out_free_buf: + xfs_buf_free(new_bp); + return error; } int From 8bcac7448a942fa4662441a310c97d47cec24310 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 7 Jun 2021 11:50:48 +1000 Subject: [PATCH 066/102] xfs: merge xfs_buf_allocate_memory It only has one caller and is now a simple function, so merge it into the caller. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7dea73535959..a55471612150 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -313,11 +313,11 @@ xfs_buf_free( static int xfs_buf_alloc_kmem( struct xfs_buf *bp, - size_t size, xfs_buf_flags_t flags) { int align_mask = xfs_buftarg_dma_alignment(bp->b_target); xfs_km_flags_t kmflag_mask = KM_NOFS; + size_t size = BBTOB(bp->b_length); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) @@ -400,33 +400,6 @@ xfs_buf_alloc_pages( return 0; } - -/* - * Allocates all the pages for buffer in question and builds it's page list. 
- */ -static int -xfs_buf_allocate_memory( - struct xfs_buf *bp, - uint flags) -{ - size_t size; - int error; - - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. - */ - size = BBTOB(bp->b_length); - if (size < PAGE_SIZE) { - error = xfs_buf_alloc_kmem(bp, size, flags); - if (!error) - return 0; - } - return xfs_buf_alloc_pages(bp, flags); -} - /* * Map buffer into kernel address-space if necessary. */ @@ -688,9 +661,18 @@ xfs_buf_get_map( if (error) return error; - error = xfs_buf_allocate_memory(new_bp, flags); - if (error) - goto out_free_buf; + /* + * For buffers that fit entirely within a single page, first attempt to + * allocate the memory from the heap to minimise memory usage. If we + * can't get heap memory for these small buffers, we fall back to using + * the page allocator. + */ + if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + error = xfs_buf_alloc_pages(new_bp, flags); + if (error) + goto out_free_buf; + } error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); if (error) From 9ba0889e2272294bfbb5589b1b180ad2e782b2a4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 8 Jun 2021 09:19:22 -0700 Subject: [PATCH 067/102] xfs: drop the AGI being passed to xfs_check_agi_freecount From: Dave Chinner Stephen Rothwell reported this compiler warning from linux-next: fs/xfs/libxfs/xfs_ialloc.c: In function 'xfs_difree_finobt': fs/xfs/libxfs/xfs_ialloc.c:2032:20: warning: unused variable 'agi' [-Wunused-variable] 2032 | struct xfs_agi *agi = agbp->b_addr; Which is fallout from agno -> perag conversions that were done in this function. xfs_check_agi_freecount() is the only user of "agi" in xfs_difree_finobt() now, and it only uses the agi to get the current free inode count. 
We hold that in the perag structure, so there's no need to directly reference the raw AGI to get this information. The btree cursor being passed to xfs_check_agi_freecount() has a reference to the perag being operated on, so use that directly in xfs_check_agi_freecount() rather than passing an AGI. Fixes: 7b13c5155182 ("xfs: use perag for ialloc btree cursors") Reported-by: Stephen Rothwell Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 2ed6de6faf8a..654a8d9681e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -214,10 +214,9 @@ xfs_inobt_insert( * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG -STATIC int +static int xfs_check_agi_freecount( - struct xfs_btree_cur *cur, - struct xfs_agi *agi) + struct xfs_btree_cur *cur) { if (cur->bc_nlevels == 1) { xfs_inobt_rec_incore_t rec; @@ -243,12 +242,12 @@ xfs_check_agi_freecount( } while (i == 1); if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) - ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; } #else -#define xfs_check_agi_freecount(cur, agi) 0 +#define xfs_check_agi_freecount(cur) 0 #endif /* @@ -1014,7 +1013,7 @@ xfs_dialloc_ag_inobt( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -1234,7 +1233,7 @@ alloc_inode: xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -1461,7 +1460,7 @@ xfs_dialloc_ag( cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); - error = xfs_check_agi_freecount(cur,
agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_cur; @@ -1504,7 +1503,7 @@ xfs_dialloc_ag( */ icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; @@ -1522,10 +1521,10 @@ xfs_dialloc_ag( xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_icur; @@ -1911,7 +1910,7 @@ xfs_difree_inobt( */ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2004,7 +2003,7 @@ xfs_difree_inobt( xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2029,7 +2028,6 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; @@ -2114,7 +2112,7 @@ xfs_difree_finobt( } out: - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error; From 255794c7ed7adb914e831f5e4905d783d31378d2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Jun 2021 09:34:49 -0700 Subject: [PATCH 068/102] xfs: only reset incore inode health state flags when reclaiming an inode While running some fuzz tests on inode metadata, I noticed that the filesystem health report (as provided by xfs_spaceman) failed to report the file corruption even when spaceman was run immediately after running xfs_scrub to detect the corruption. 
That isn't the intended behavior; one ought to be able to run scrub to detect errors in the ondisk metadata and be able to access those reports for some time after the scrub. After running the same sequence through an instrumented kernel, I discovered the reason why -- scrub igets the file, scans it, marks it sick, and ireleases the inode. When the VFS lets go of the incore inode, it moves to RECLAIMABLE state. If spaceman igets the incore inode before it moves to RECLAIM state, iget reinitializes the VFS state, clears the sick and checked masks, and hands back the inode. At this point, the caller has the exact same incore inode, but with all the health state erased. In other words, we're erasing the incore inode's health state flags when we've decided NOT to sever the link between the incore inode and the ondisk inode. This is wrong, so we need to remove the lines that zero the fields from xfs_iget_cache_hit. As a precaution, we add the same lines into xfs_reclaim_inode just after we sever the link between incore and ondisk inode. Strictly speaking this isn't necessary because once an inode has gone through reclaim it must go through xfs_inode_alloc (which also zeroes the state) and xfs_iget is careful to check for mismatches between the inode it pulls out of the radix tree and the one it wants. Fixes: 6772c1f11206 ("xfs: track metadata health status") Signed-off-by: Darrick J.
Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_icache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0cd29a2f9da5..c0d288e4d0fd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -523,9 +523,6 @@ xfs_iget_cache_hit( XFS_INO_TO_AGINO(pag->pag_mount, ino), XFS_ICI_RECLAIM_TAG); inode->i_state = I_NEW; - ip->i_sick = 0; - ip->i_checked = 0; - spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); } else { @@ -979,6 +976,8 @@ reclaim: spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; + ip->i_sick = 0; + ip->i_checked = 0; spin_unlock(&ip->i_flags_lock); xfs_iunlock(ip, XFS_ILOCK_EXCL); From 7975e465af6b46e9d0eaf94f764922dc92b28d9c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Jun 2021 09:34:50 -0700 Subject: [PATCH 069/102] xfs: drop IDONTCACHE on inodes when we mark them sick When we decide to mark an inode sick, clear the DONTCACHE flag so that the incore inode will be kept around until memory pressure forces it out of memory. This increases the chances that the sick status will be caught by someone compiling a health report later on. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_health.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 5de3195f6cb2..eb10eacabc8f 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -229,6 +229,15 @@ xfs_inode_mark_sick( ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); + + /* + * Keep this inode around so we don't lose the sickness report. Scrub + * grabs inodes with DONTCACHE assuming that most inode are ok, which + * is not the case here. + */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); } /* Mark parts of an inode healed. 
*/ From 2d53f66baffde66fe72c360e3b9b0c8a2d7ce7c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Jun 2021 09:34:51 -0700 Subject: [PATCH 070/102] xfs: change the prefix of XFS_EOF_FLAGS_* to XFS_ICWALK_FLAG_ In preparation for renaming struct xfs_eofblocks to struct xfs_icwalk, change the prefix of the existing XFS_EOF_FLAGS_* flags to XFS_ICWALK_FLAG_ and convert all the existing users. This adds a degree of interface separation between the ioctl definitions and the incore parameters. Since FLAGS_UNION is only used in xfs_icache.c, move it there as a private flag. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_icache.c | 44 +++++++++++++++++++++++--------------------- fs/xfs/xfs_icache.h | 17 +++++++++++++++-- fs/xfs/xfs_ioctl.c | 13 ++++++++++++- 4 files changed, 52 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c068dcd414f4..eb39c3777491 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -769,7 +769,7 @@ write_retry: */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); cleared_space = true; goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { @@ -779,7 +779,7 @@ write_retry: xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_ICWALK_FLAG_SYNC; xfs_blockgc_free_space(ip->i_mount, &eofb); goto write_retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6f1383bf706a..cbfb5cec7f18 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -62,7 +62,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, /* * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide - * with XFS_EOF_FLAGS_*. + * with XFS_ICWALK_FLAGS_VALID. 
*/ #define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) @@ -72,12 +72,14 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) +#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ XFS_ICWALK_FLAG_DROP_PDQUOT | \ XFS_ICWALK_FLAG_SCAN_LIMIT | \ - XFS_ICWALK_FLAG_RECLAIM_SICK) + XFS_ICWALK_FLAG_RECLAIM_SICK | \ + XFS_ICWALK_FLAG_UNION) /* * Allocate and initialise an xfs_inode. @@ -1113,15 +1115,15 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return false; - if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return false; - if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && ip->i_projid != eofb->eof_prid) return false; @@ -1137,15 +1139,15 @@ xfs_inode_match_id_union( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return true; - if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return true; - if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && ip->i_projid == eofb->eof_prid) return true; @@ -1167,7 +1169,7 @@ xfs_inode_matches_eofb( if (!eofb) return true; - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + if (eofb->eof_flags & XFS_ICWALK_FLAG_UNION) match = xfs_inode_match_id_union(ip, eofb); else match = xfs_inode_match_id(ip, eofb); @@ -1175,7 +1177,7 @@ 
xfs_inode_matches_eofb( return false; /* skip the inode if the file size is too small */ - if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && + if ((eofb->eof_flags & XFS_ICWALK_FLAG_MINFILESIZE) && XFS_ISIZE(ip) < eofb->eof_min_file_size) return false; @@ -1207,7 +1209,7 @@ xfs_inode_free_eofblocks( { bool wait; - wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; @@ -1370,7 +1372,7 @@ xfs_inode_free_cowblocks( bool wait; int ret = 0; - wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; @@ -1552,7 +1554,7 @@ xfs_blockgc_free_space( * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan - * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. */ int @@ -1561,7 +1563,7 @@ xfs_blockgc_free_dquots( struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags) + unsigned int iwalk_flags) { struct xfs_eofblocks eofb = {0}; bool do_work = false; @@ -1573,23 +1575,23 @@ xfs_blockgc_free_dquots( * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. 
*/ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; + eofb.eof_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_UID; + eofb.eof_flags |= XFS_ICWALK_FLAG_UID; do_work = true; } if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_GID; + eofb.eof_flags |= XFS_ICWALK_FLAG_GID; do_work = true; } if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { eofb.eof_prid = pdqp->q_id; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + eofb.eof_flags |= XFS_ICWALK_FLAG_PRID; do_work = true; } @@ -1603,12 +1605,12 @@ xfs_blockgc_free_dquots( int xfs_blockgc_free_quota( struct xfs_inode *ip, - unsigned int eof_flags) + unsigned int iwalk_flags) { return xfs_blockgc_free_dquots(ip->i_mount, xfs_inode_dquot(ip, XFS_DQTYPE_USER), xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), - xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); } /* XFS Inode Cache Walking Code */ @@ -1828,5 +1830,5 @@ xfs_icwalk( } } return last_error; - BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID); + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 191620a069af..b29048c493b6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -18,6 +18,19 @@ struct xfs_eofblocks { int icw_scan_limit; }; +/* Flags that reflect xfs_fs_eofblocks functionality. 
*/ +#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */ +#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */ +#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */ +#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */ +#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */ + +#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \ + XFS_ICWALK_FLAG_UID | \ + XFS_ICWALK_FLAG_GID | \ + XFS_ICWALK_FLAG_PRID | \ + XFS_ICWALK_FLAG_MINFILESIZE) + /* * Flags for xfs_iget() */ @@ -43,8 +56,8 @@ void xfs_inode_mark_reclaimable(struct xfs_inode *ip); int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags); -int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); + unsigned int iwalk_flags); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1fe4c1fc0aea..c6450fd059f1 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1887,7 +1887,18 @@ xfs_fs_eofblocks_from_user( memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL; - dst->eof_flags = src->eof_flags; + dst->eof_flags = 0; + if (src->eof_flags & XFS_EOF_FLAGS_SYNC) + dst->eof_flags |= XFS_ICWALK_FLAG_SYNC; + if (src->eof_flags & XFS_EOF_FLAGS_UID) + dst->eof_flags |= XFS_ICWALK_FLAG_UID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) + dst->eof_flags |= XFS_ICWALK_FLAG_GID; + if (src->eof_flags & XFS_EOF_FLAGS_PRID) + dst->eof_flags |= XFS_ICWALK_FLAG_PRID; + if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) + dst->eof_flags |= XFS_ICWALK_FLAG_MINFILESIZE; + dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; From 9492750a8b18f02a8dec2aab594c59aabe2e4d0d Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 7 Jun 2021 09:34:50 -0700 Subject: [PATCH 071/102] xfs: selectively keep sick inodes in memory It's important that the filesystem retain its memory of sick inodes for a little while after problems are found so that reports can be collected about what was wrong. Don't let inode reclamation free sick inodes unless we're unmounting or the fs already went down. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_icache.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c0d288e4d0fd..6f1383bf706a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -71,10 +71,13 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, /* Stop scanning after icw_scan_limit inodes. */ #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) +#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) + #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ XFS_ICWALK_FLAG_DROP_GDQUOT | \ XFS_ICWALK_FLAG_DROP_PDQUOT | \ - XFS_ICWALK_FLAG_SCAN_LIMIT) + XFS_ICWALK_FLAG_SCAN_LIMIT | \ + XFS_ICWALK_FLAG_RECLAIM_SICK) /* * Allocate and initialise an xfs_inode. @@ -910,7 +913,8 @@ xfs_dqrele_all_inodes( */ static bool xfs_reclaim_igrab( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) { ASSERT(rcu_read_lock_held()); @@ -921,6 +925,14 @@ xfs_reclaim_igrab( spin_unlock(&ip->i_flags_lock); return false; } + + /* Don't reclaim a sick inode unless the caller asked for it. */ + if (ip->i_sick && + (!eofb || !(eofb->eof_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { + spin_unlock(&ip->i_flags_lock); + return false; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); return true; @@ -1021,13 +1033,30 @@ out: xfs_iflags_clear(ip, XFS_IRECLAIM); } +/* Reclaim sick inodes if we're unmounting or the fs went down. 
*/ +static inline bool +xfs_want_reclaim_sick( + struct xfs_mount *mp) +{ + return (mp->m_flags & XFS_MOUNT_UNMOUNTING) || + (mp->m_flags & XFS_MOUNT_NORECOVERY) || + XFS_FORCED_SHUTDOWN(mp); +} + void xfs_reclaim_inodes( struct xfs_mount *mp) { + struct xfs_eofblocks eofb = { + .eof_flags = 0, + }; + + if (xfs_want_reclaim_sick(mp)) + eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); } } @@ -1048,6 +1077,9 @@ xfs_reclaim_inodes_nr( .icw_scan_limit = nr_to_scan, }; + if (xfs_want_reclaim_sick(mp)) + eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); @@ -1597,7 +1629,8 @@ xfs_blockgc_free_quota( static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) { switch (goal) { case XFS_ICWALK_DQRELE: @@ -1605,7 +1638,7 @@ xfs_icwalk_igrab( case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: - return xfs_reclaim_igrab(ip); + return xfs_reclaim_igrab(ip, eofb); default: return false; } @@ -1694,7 +1727,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || !xfs_icwalk_igrab(goal, ip)) + if (done || !xfs_icwalk_igrab(goal, ip, eofb)) batch[i] = NULL; /* From b26b2bf14f823e9597118c01993aeba9aeb9a701 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 7 Jun 2021 09:34:51 -0700 Subject: [PATCH 072/102] xfs: rename struct xfs_eofblocks to xfs_icwalk The xfs_eofblocks structure is no longer well-named -- nowadays it provides optional filtering criteria to any walk of the incore inode cache. 
Only one of the cache walk goals has anything to do with clearing of speculative post-EOF preallocations, so change the name to be more appropriate. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_file.c | 6 +- fs/xfs/xfs_icache.c | 164 ++++++++++++++++++++++---------------------- fs/xfs/xfs_icache.h | 14 ++-- fs/xfs/xfs_ioctl.c | 40 +++++------ fs/xfs/xfs_trace.h | 36 +++++----- 5 files changed, 130 insertions(+), 130 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index eb39c3777491..9fd5a82a814c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -773,14 +773,14 @@ write_retry: cleared_space = true; goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0}; cleared_space = true; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_ICWALK_FLAG_SYNC; - xfs_blockgc_free_space(ip->i_mount, &eofb); + icw.icw_flags = XFS_ICWALK_FLAG_SYNC; + xfs_blockgc_free_space(ip->i_mount, &icw); goto write_retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index cbfb5cec7f18..4e4682879bbd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -56,13 +56,13 @@ xfs_icwalk_tag(enum xfs_icwalk_goal goal) } static int xfs_icwalk(struct xfs_mount *mp, - enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); static int xfs_icwalk_ag(struct xfs_perag *pag, - enum xfs_icwalk_goal goal, struct xfs_eofblocks *eofb); + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); /* - * Private inode cache walk flags for struct xfs_eofblocks. Must not coincide - * with XFS_ICWALK_FLAGS_VALID. + * Private inode cache walk flags for struct xfs_icwalk. Must not + * coincide with XFS_ICWALK_FLAGS_VALID. 
*/ #define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) #define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) @@ -848,21 +848,21 @@ out_unlock: static void xfs_dqrele_inode( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { if (xfs_iflags_test(ip, XFS_INEW)) xfs_inew_wait(ip); xfs_ilock(ip, XFS_ILOCK_EXCL); - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } @@ -880,16 +880,16 @@ xfs_dqrele_all_inodes( struct xfs_mount *mp, unsigned int qflags) { - struct xfs_eofblocks eofb = { .eof_flags = 0 }; + struct xfs_icwalk icw = { .icw_flags = 0 }; if (qflags & XFS_UQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; if (qflags & XFS_GQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; if (qflags & XFS_PQUOTA_ACCT) - eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &eofb); + return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); } #else # define xfs_dqrele_igrab(ip) (false) @@ -916,7 +916,7 @@ xfs_dqrele_all_inodes( static bool xfs_reclaim_igrab( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { ASSERT(rcu_read_lock_held()); @@ -930,7 +930,7 @@ xfs_reclaim_igrab( /* Don't reclaim a sick inode unless the caller asked for it. 
*/ if (ip->i_sick && - (!eofb || !(eofb->eof_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { + (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { spin_unlock(&ip->i_flags_lock); return false; } @@ -1049,16 +1049,16 @@ void xfs_reclaim_inodes( struct xfs_mount *mp) { - struct xfs_eofblocks eofb = { - .eof_flags = 0, + struct xfs_icwalk icw = { + .icw_flags = 0, }; if (xfs_want_reclaim_sick(mp)) - eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } } @@ -1074,19 +1074,19 @@ xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) { - struct xfs_eofblocks eofb = { - .eof_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, + struct xfs_icwalk icw = { + .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, .icw_scan_limit = nr_to_scan, }; if (xfs_want_reclaim_sick(mp)) - eofb.eof_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &eofb); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); return 0; } @@ -1111,20 +1111,20 @@ xfs_reclaim_inodes_count( } STATIC bool -xfs_inode_match_id( +xfs_icwalk_match_id( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && - !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return false; - if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && - !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return false; - if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && - ip->i_projid != eofb->eof_prid) + if ((icw->icw_flags & 
XFS_ICWALK_FLAG_PRID) && + ip->i_projid != icw->icw_prid) return false; return true; @@ -1135,20 +1135,20 @@ xfs_inode_match_id( * criteria match. This is for global/internal scans only. */ STATIC bool -xfs_inode_match_id_union( +xfs_icwalk_match_id_union( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_ICWALK_FLAG_UID) && - uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return true; - if ((eofb->eof_flags & XFS_ICWALK_FLAG_GID) && - gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return true; - if ((eofb->eof_flags & XFS_ICWALK_FLAG_PRID) && - ip->i_projid == eofb->eof_prid) + if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && + ip->i_projid == icw->icw_prid) return true; return false; @@ -1156,29 +1156,29 @@ xfs_inode_match_id_union( /* * Is this inode @ip eligible for eof/cow block reclamation, given some - * filtering parameters @eofb? The inode is eligible if @eofb is null or + * filtering parameters @icw? The inode is eligible if @icw is null or * if the predicate functions match. 
*/ static bool -xfs_inode_matches_eofb( +xfs_icwalk_match( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { bool match; - if (!eofb) + if (!icw) return true; - if (eofb->eof_flags & XFS_ICWALK_FLAG_UNION) - match = xfs_inode_match_id_union(ip, eofb); + if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) + match = xfs_icwalk_match_id_union(ip, icw); else - match = xfs_inode_match_id(ip, eofb); + match = xfs_icwalk_match_id(ip, icw); if (!match) return false; /* skip the inode if the file size is too small */ - if ((eofb->eof_flags & XFS_ICWALK_FLAG_MINFILESIZE) && - XFS_ISIZE(ip) < eofb->eof_min_file_size) + if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && + XFS_ISIZE(ip) < icw->icw_min_file_size) return false; return true; @@ -1204,12 +1204,12 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - struct xfs_eofblocks *eofb, + struct xfs_icwalk *icw, unsigned int *lockflags) { bool wait; - wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; @@ -1221,7 +1221,7 @@ xfs_inode_free_eofblocks( if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0; /* @@ -1366,13 +1366,13 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - struct xfs_eofblocks *eofb, + struct xfs_icwalk *icw, unsigned int *lockflags) { bool wait; int ret = 0; - wait = eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; @@ -1380,7 +1380,7 @@ xfs_inode_free_cowblocks( if (!xfs_prep_free_cowblocks(ip)) return 0; - if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0; /* @@ -1497,16 +1497,16 @@ out_unlock_noent: static int xfs_blockgc_scan_inode( struct 
xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { unsigned int lockflags = 0; int error; - error = xfs_inode_free_eofblocks(ip, eofb, &lockflags); + error = xfs_inode_free_eofblocks(ip, icw, &lockflags); if (error) goto unlock; - error = xfs_inode_free_cowblocks(ip, eofb, &lockflags); + error = xfs_inode_free_cowblocks(ip, icw, &lockflags); unlock: if (lockflags) xfs_iunlock(ip, lockflags); @@ -1540,11 +1540,11 @@ xfs_blockgc_worker( int xfs_blockgc_free_space( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); - return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, eofb); + return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); } /* @@ -1565,7 +1565,7 @@ xfs_blockgc_free_dquots( struct xfs_dquot *pdqp, unsigned int iwalk_flags) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0}; bool do_work = false; if (!udqp && !gdqp && !pdqp) @@ -1575,30 +1575,30 @@ xfs_blockgc_free_dquots( * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. 
*/ - eofb.eof_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; + icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { - eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); - eofb.eof_flags |= XFS_ICWALK_FLAG_UID; + icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_UID; do_work = true; } if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { - eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); - eofb.eof_flags |= XFS_ICWALK_FLAG_GID; + icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_GID; do_work = true; } if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { - eofb.eof_prid = pdqp->q_id; - eofb.eof_flags |= XFS_ICWALK_FLAG_PRID; + icw.icw_prid = pdqp->q_id; + icw.icw_flags |= XFS_ICWALK_FLAG_PRID; do_work = true; } if (!do_work) return 0; - return xfs_blockgc_free_space(mp, &eofb); + return xfs_blockgc_free_space(mp, &icw); } /* Run cow/eofblocks scans on the quotas attached to the inode. 
*/ @@ -1632,7 +1632,7 @@ static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { switch (goal) { case XFS_ICWALK_DQRELE: @@ -1640,7 +1640,7 @@ xfs_icwalk_igrab( case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: - return xfs_reclaim_igrab(ip, eofb); + return xfs_reclaim_igrab(ip, icw); default: return false; } @@ -1655,16 +1655,16 @@ xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, struct xfs_perag *pag, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { int error = 0; switch (goal) { case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, eofb); + xfs_dqrele_inode(ip, icw); break; case XFS_ICWALK_BLOCKGC: - error = xfs_blockgc_scan_inode(ip, eofb); + error = xfs_blockgc_scan_inode(ip, icw); break; case XFS_ICWALK_RECLAIM: xfs_reclaim_inode(ip, pag); @@ -1681,7 +1681,7 @@ static int xfs_icwalk_ag( struct xfs_perag *pag, enum xfs_icwalk_goal goal, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; @@ -1729,7 +1729,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || !xfs_icwalk_igrab(goal, ip, eofb)) + if (done || !xfs_icwalk_igrab(goal, ip, icw)) batch[i] = NULL; /* @@ -1758,7 +1758,7 @@ restart: if (!batch[i]) continue; error = xfs_icwalk_process_inode(goal, batch[i], pag, - eofb); + icw); if (error == -EAGAIN) { skipped++; continue; @@ -1773,9 +1773,9 @@ restart: cond_resched(); - if (eofb && (eofb->eof_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { - eofb->icw_scan_limit -= XFS_LOOKUP_BATCH; - if (eofb->icw_scan_limit <= 0) + if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { + icw->icw_scan_limit -= XFS_LOOKUP_BATCH; + if (icw->icw_scan_limit <= 0) break; } } while (nr_found && !done); @@ -1812,7 +1812,7 @@ static int xfs_icwalk( struct xfs_mount *mp, enum xfs_icwalk_goal goal, - struct xfs_eofblocks *eofb) + 
struct xfs_icwalk *icw) { struct xfs_perag *pag; int error = 0; @@ -1821,7 +1821,7 @@ xfs_icwalk( while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { agno = pag->pag_agno + 1; - error = xfs_icwalk_ag(pag, goal, eofb); + error = xfs_icwalk_ag(pag, goal, icw); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b29048c493b6..00dc98a92835 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -9,12 +9,12 @@ struct xfs_mount; struct xfs_perag; -struct xfs_eofblocks { - __u32 eof_flags; - kuid_t eof_uid; - kgid_t eof_gid; - prid_t eof_prid; - __u64 eof_min_file_size; +struct xfs_icwalk { + __u32 icw_flags; + kuid_t icw_uid; + kgid_t icw_gid; + prid_t icw_prid; + __u64 icw_min_file_size; int icw_scan_limit; }; @@ -58,7 +58,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); -int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c6450fd059f1..0f6794333b01 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1875,7 +1875,7 @@ out: static inline int xfs_fs_eofblocks_from_user( struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) + struct xfs_icwalk *dst) { if (src->eof_version != XFS_EOFBLOCKS_VERSION) return -EINVAL; @@ -1887,32 +1887,32 @@ xfs_fs_eofblocks_from_user( memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL; - dst->eof_flags = 0; + dst->icw_flags = 0; if (src->eof_flags & XFS_EOF_FLAGS_SYNC) - dst->eof_flags |= XFS_ICWALK_FLAG_SYNC; + dst->icw_flags |= XFS_ICWALK_FLAG_SYNC; if (src->eof_flags & XFS_EOF_FLAGS_UID) - 
dst->eof_flags |= XFS_ICWALK_FLAG_UID; + dst->icw_flags |= XFS_ICWALK_FLAG_UID; if (src->eof_flags & XFS_EOF_FLAGS_GID) - dst->eof_flags |= XFS_ICWALK_FLAG_GID; + dst->icw_flags |= XFS_ICWALK_FLAG_GID; if (src->eof_flags & XFS_EOF_FLAGS_PRID) - dst->eof_flags |= XFS_ICWALK_FLAG_PRID; + dst->icw_flags |= XFS_ICWALK_FLAG_PRID; if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) - dst->eof_flags |= XFS_ICWALK_FLAG_MINFILESIZE; + dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE; - dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; + dst->icw_prid = src->eof_prid; + dst->icw_min_file_size = src->eof_min_file_size; - dst->eof_uid = INVALID_UID; + dst->icw_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) + dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->icw_uid)) return -EINVAL; } - dst->eof_gid = INVALID_GID; + dst->icw_gid = INVALID_GID; if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) + dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->icw_gid)) return -EINVAL; } return 0; @@ -2175,8 +2175,8 @@ xfs_file_ioctl( return xfs_errortag_clearall(mp); case XFS_IOC_FREE_EOFBLOCKS: { - struct xfs_fs_eofblocks eofb; - struct xfs_eofblocks keofb; + struct xfs_fs_eofblocks eofb; + struct xfs_icwalk icw; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2187,14 +2187,14 @@ xfs_file_ioctl( if (copy_from_user(&eofb, arg, sizeof(eofb))) return -EFAULT; - error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + error = xfs_fs_eofblocks_from_user(&eofb, &icw); if (error) return error; - trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &keofb); + error = xfs_blockgc_free_space(mp, &icw); 
sb_end_write(mp->m_super); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index abe98aca384f..a10612155377 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,7 +37,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; -struct xfs_eofblocks; +struct xfs_icwalk; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -3885,10 +3885,10 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); -DECLARE_EVENT_CLASS(xfs_eofblocks_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, +DECLARE_EVENT_CLASS(xfs_icwalk_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, unsigned long caller_ip), - TP_ARGS(mp, eofb, caller_ip), + TP_ARGS(mp, icw, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(__u32, flags) @@ -3901,14 +3901,14 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->flags = eofb ? eofb->eof_flags : 0; - __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, - eofb->eof_uid) : 0; - __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, - eofb->eof_gid) : 0; - __entry->prid = eofb ? eofb->eof_prid : 0; - __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; - __entry->scan_limit = eofb ? eofb->icw_scan_limit : 0; + __entry->flags = icw ? icw->icw_flags : 0; + __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns, + icw->icw_uid) : 0; + __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns, + icw->icw_gid) : 0; + __entry->prid = icw ? icw->icw_prid : 0; + __entry->min_file_size = icw ? icw->icw_min_file_size : 0; + __entry->scan_limit = icw ? 
icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", @@ -3921,13 +3921,13 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, __entry->scan_limit, (char *)__entry->caller_ip) ); -#define DEFINE_EOFBLOCKS_EVENT(name) \ -DEFINE_EVENT(xfs_eofblocks_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ +#define DEFINE_ICWALK_EVENT(name) \ +DEFINE_EVENT(xfs_icwalk_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \ unsigned long caller_ip), \ - TP_ARGS(mp, eofb, caller_ip)) -DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); -DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); + TP_ARGS(mp, icw, caller_ip)) +DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); +DEFINE_ICWALK_EVENT(xfs_blockgc_free_space); #endif /* _TRACE_XFS_H */ From 4a4957c16dc674d1306a3b43d6b07ed93a7b7a14 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 21 May 2021 00:57:15 -0700 Subject: [PATCH 073/102] xfs: Fix default ASSERT in xfs_attr_set_iter This ASSERT checks for the state value of RM_SHRINK in the set path which should never happen. Change to ASSERT(0); Suggested-by: Darrick J. Wong Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2387a41b705e..a0edebc098ea 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -612,7 +612,7 @@ xfs_attr_set_iter( error = xfs_attr_node_addname_clear_incomplete(dac); break; default: - ASSERT(dac->dela_state != XFS_DAS_RM_SHRINK); + ASSERT(0); break; } out: From 816c8e39b7ea0875640312c9ed3be0d5a68d7183 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 28 May 2021 15:15:05 -0700 Subject: [PATCH 074/102] xfs: Make attr name schemes consistent This patch renames the following functions to make the naming scheme more consistent: xfs_attr_shortform_remove -> xfs_attr_sf_removename xfs_attr_node_remove_name -> xfs_attr_node_removename xfs_attr_set_fmt -> xfs_attr_sf_addname Suggested-by: Darrick J. Wong Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 18 +++++++++--------- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index a0edebc098ea..611dc67234a6 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -63,8 +63,8 @@ STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, struct xfs_buf **leaf_bp); -STATIC int xfs_attr_node_remove_name(struct xfs_da_args *args, - struct xfs_da_state *state); +STATIC int xfs_attr_node_removename(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -298,7 +298,7 @@ xfs_attr_set_args( } STATIC int -xfs_attr_set_fmt( +xfs_attr_sf_addname( struct xfs_delattr_context *dac, struct xfs_buf **leaf_bp) { @@ -367,7 +367,7 @@ xfs_attr_set_iter( * release the hold once we return with a
clean transaction. */ if (xfs_attr_is_shortform(dp)) - return xfs_attr_set_fmt(dac, leaf_bp); + return xfs_attr_sf_addname(dac, leaf_bp); if (*leaf_bp != NULL) { xfs_trans_bhold_release(args->trans, *leaf_bp); *leaf_bp = NULL; @@ -840,7 +840,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (retval == -EEXIST) { if (args->attr_flags & XATTR_CREATE) return retval; - retval = xfs_attr_shortform_remove(args); + retval = xfs_attr_sf_removename(args); if (retval) return retval; /* @@ -1223,7 +1223,7 @@ xfs_attr_node_addname_clear_incomplete( if (error) goto out; - error = xfs_attr_node_remove_name(args, state); + error = xfs_attr_node_removename(args, state); /* * Check to see if the tree needs to be collapsed. @@ -1339,7 +1339,7 @@ out: } STATIC int -xfs_attr_node_remove_name( +xfs_attr_node_removename( struct xfs_da_args *args, struct xfs_da_state *state) { @@ -1390,7 +1390,7 @@ xfs_attr_remove_iter( * thus state transitions. Call the right helper and return. */ if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_shortform_remove(args); + return xfs_attr_sf_removename(args); if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_removename(args); @@ -1453,7 +1453,7 @@ xfs_attr_remove_iter( goto out; } - retval = xfs_attr_node_remove_name(args, state); + retval = xfs_attr_node_removename(args, state); /* * Check to see if the tree needs to be collapsed. If so, roll diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index d97de2083feb..5a3d261d901f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -773,7 +773,7 @@ xfs_attr_fork_remove( * Remove an attribute from the shortform attribute list structure. 
*/ int -xfs_attr_shortform_remove( +xfs_attr_sf_removename( struct xfs_da_args *args) { struct xfs_attr_shortform *sf; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 9b1c59f40a26..efa757f1e912 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, struct xfs_buf **leaf_bp); -int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, unsigned int *basep); From 90e2c1c20ac672756a2835b5a92a606dd48a4aa3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:14:20 -0700 Subject: [PATCH 075/102] xfs: perag may be null in xfs_imap() Dan Carpenter's static checker reported: The patch 7b13c5155182: "xfs: use perag for ialloc btree cursors" from Jun 2, 2021, leads to the following Smatch complaint: fs/xfs/libxfs/xfs_ialloc.c:2403 xfs_imap() error: we previously assumed 'pag' could be null (see line 2294) And it's right. Fix it. Fixes: 7b13c5155182 ("xfs: use perag for ialloc btree cursors") Reported-by: Dan Carpenter Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson --- fs/xfs/libxfs/xfs_ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 654a8d9681e1..57d9cb632983 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2398,7 +2398,8 @@ out_map: } error = 0; out_drop: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } From 9bb38aa080394fb327c90eff75388e0598f266f0 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 18 Jun 2021 08:14:31 -0700 Subject: [PATCH 076/102] xfs: remove redundant initialization of variable error 'error' will be initialized, so clean up the redundant initialization. Cc: "Darrick J. Wong" Signed-off-by: Shaokun Zhang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b4ee9d3532f0..11edd4ad8151 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -648,7 +648,7 @@ xfs_buf_get_map( { struct xfs_buf *bp; struct xfs_buf *new_bp; - int error = 0; + int error; *bpp = NULL; error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); From a6a65fef5ef8d0a6a0ce514eb66b2f3dfa777b48 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:48 -0700 Subject: [PATCH 077/102] xfs: log stripe roundoff is a property of the log We don't need to look at the xfs_mount and superblock every time we need to do an iclog roundoff calculation. The property is fixed for the life of the log, so store the roundoff in the log at mount time and use that everywhere. On a debug build: $ size fs/xfs/xfs_log.o.* text data bss dec hex filename 27360 560 8 27928 6d18 fs/xfs/xfs_log.o.orig 27219 560 8 27787 6c8b fs/xfs/xfs_log.o.patched Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson --- fs/xfs/libxfs/xfs_log_format.h | 3 -- fs/xfs/xfs_log.c | 59 ++++++++++++++-------------------- fs/xfs/xfs_log_priv.h | 2 ++ 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 3e15ea29fb8d..d548ea4b6aab 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t; #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) #define XLOG_HEADER_SIZE 512 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c19a82adea1e..0e563ff8cd3b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1401,6 +1401,11 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + log->l_iclog_roundoff = mp->m_sb.sb_logsunit; + else + log->l_iclog_roundoff = BBSIZE; + xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head); @@ -1854,29 +1859,15 @@ xlog_calc_iclog_size( uint32_t *roundoff) { uint32_t count_init, count; - bool use_lsunit; - - use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1; /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; + count = roundup(count_init, log->l_iclog_roundoff); - /* Round out the log write size */ - if (use_lsunit) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - - ASSERT(count >= count_init); *roundoff = 
count - count_init; - if (use_lsunit) - ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); - else - ASSERT(*roundoff < BBTOB(1)); + ASSERT(count >= count_init); + ASSERT(*roundoff < log->l_iclog_roundoff); return count; } @@ -3151,10 +3142,9 @@ xlog_state_switch_iclogs( log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); /* Round up to next log-sunit */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { - uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); - log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + if (log->l_iclog_roundoff > BBSIZE) { + log->l_curr_block = roundup(log->l_curr_block, + BTOBB(log->l_iclog_roundoff)); } if (log->l_curr_block >= log->l_logBBsize) { @@ -3406,12 +3396,11 @@ xfs_log_ticket_get( * Figure out the total log space unit (in bytes) that would be * required for a log ticket. */ -int -xfs_log_calc_unit_res( - struct xfs_mount *mp, +static int +xlog_calc_unit_res( + struct xlog *log, int unit_bytes) { - struct xlog *log = mp->m_log; int iclog_space; uint num_headers; @@ -3487,18 +3476,20 @@ xfs_log_calc_unit_res( /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { - /* log su roundoff */ - unit_bytes += 2 * mp->m_sb.sb_logsunit; - } else { - /* BB roundoff */ - unit_bytes += 2 * BBSIZE; - } + /* roundoff padding for transaction data and one for commit record */ + unit_bytes += 2 * log->l_iclog_roundoff; return unit_bytes; } +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + return xlog_calc_unit_res(mp->m_log, unit_bytes); +} + /* * Allocate and initialise a new log ticket. 
*/ @@ -3515,7 +3506,7 @@ xlog_ticket_alloc( tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); - unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes); atomic_set(&tic->t_ref, 1); tic->t_task = current; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c6fdbf3d506..037950cf1061 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -436,6 +436,8 @@ struct xlog { #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; + + uint32_t l_iclog_roundoff;/* padding roundoff */ }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ From 18842e0a4f48564bbed541947abd8131fd0e9734 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 18 Jun 2021 08:24:04 -0700 Subject: [PATCH 078/102] xfs: Fix 64-bit division on 32-bit in xlog_state_switch_iclogs() On 32-bit (e.g. m68k): ERROR: modpost: "__udivdi3" [fs/xfs/xfs.ko] undefined! Fix this by using a uint32_t intermediate, like before. Reported-by: noreply@ellerman.id.au Fixes: 7660a5b48fbef958 ("xfs: log stripe roundoff is a property of the log") Signed-off-by: Geert Uytterhoeven Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0e563ff8cd3b..0c91da5defee 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3143,8 +3143,8 @@ xlog_state_switch_iclogs( /* Round up to next log-sunit */ if (log->l_iclog_roundoff > BBSIZE) { - log->l_curr_block = roundup(log->l_curr_block, - BTOBB(log->l_iclog_roundoff)); + uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); } if (log->l_curr_block >= log->l_logBBsize) { From a79b28c284fd910bb291dbf307a26f4d432e88f3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:48 -0700 Subject: [PATCH 079/102] xfs: separate CIL commit record IO To allow for iclog IO device cache flush behaviour to be optimised, we first need to separate out the commit record iclog IO from the rest of the checkpoint so we can wait for the checkpoint IO to complete before we issue the commit record. This separation is only necessary if the commit record is being written into a different iclog to the start of the checkpoint as the upcoming cache flushing changes requires completion ordering against the other iclogs submitted by the checkpoint. If the entire checkpoint and commit is in the one iclog, then they are both covered by the one set of cache flush primitives on the iclog and hence there is no need to separate them for ordering. Otherwise, we need to wait for all the previous iclogs to complete so they are ordered correctly and made stable by the REQ_PREFLUSH that the commit record iclog IO issues. This guarantees that if a reader sees the commit record in the journal, they will also see the entire checkpoint that commit record closes off. This also provides the guarantee that when the commit record IO completes, we can safely unpin all the log items in the checkpoint so they can be written back because the entire checkpoint is stable in the journal. 
Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Brian Foster Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 8 +++++--- fs/xfs/xfs_log_cil.c | 9 +++++++++ fs/xfs/xfs_log_priv.h | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0c91da5defee..17ece07de439 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -786,10 +786,12 @@ xfs_log_mount_cancel( } /* - * Wait for the iclog to be written disk, or return an error if the log has been - * shut down. + * Wait for the iclog and all prior iclogs to be written disk as required by the + * log force state machine. Waiting on ic_force_wait ensures iclog completions + * have been ordered and callbacks run before we are woken here, hence + * guaranteeing that all the iclogs up to this one are on stable storage. */ -static int +int xlog_wait_on_iclog( struct xlog_in_core *iclog) __releases(iclog->ic_log->l_icloglock) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b0ef071b3cb5..1e5fd6f268c2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -870,6 +870,15 @@ restart: wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_push_lock); + /* + * If the checkpoint spans multiple iclogs, wait for all previous + * iclogs to complete before we submit the commit_iclog. + */ + if (ctx->start_lsn != commit_lsn) { + spin_lock(&log->l_icloglock); + xlog_wait_on_iclog(commit_iclog->ic_prev); + } + /* release the hounds! */ xfs_log_release_iclog(commit_iclog); return; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 037950cf1061..ee7786b33da9 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -584,6 +584,8 @@ xlog_wait( remove_wait_queue(wq, &wait); } +int xlog_wait_on_iclog(struct xlog_in_core *iclog); + /* * The LSN is valid so long as it is behind the current LSN. 
If it isn't, this * means that the next log record that includes this metadata could have a From b5071ada510a76eac0d02912bf66297b9e30ca59 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:49 -0700 Subject: [PATCH 080/102] xfs: remove xfs_blkdev_issue_flush It's a one line wrapper around blkdev_issue_flush(). Just replace it with direct calls to blkdev_issue_flush(). Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_file.c | 6 +++--- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_super.c | 7 ------- fs/xfs/xfs_super.h | 1 - 5 files changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 11edd4ad8151..8ff42b3585e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1889,7 +1889,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - xfs_blkdev_issue_flush(btp); + blkdev_issue_flush(btp->bt_bdev); kmem_free(btp); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9fd5a82a814c..62262d69e39d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -197,9 +197,9 @@ xfs_file_fsync( * inode size in case of an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* * Any inode that has dirty modifications in the log is pinned. 
The @@ -219,7 +219,7 @@ xfs_file_fsync( */ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && mp->m_logdev_targp == mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); return error; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 17ece07de439..b7f858ade134 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1964,7 +1964,7 @@ xlog_sync( * layer state machine for preflushes. */ if (log->l_targ != log->l_mp->m_ddev_targp || split) { - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev); need_flush = false; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3a7fd4f02aa7..2c9e26a44546 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -340,13 +340,6 @@ xfs_blkdev_put( blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -void -xfs_blkdev_issue_flush( - xfs_buftarg_t *buftarg) -{ - blkdev_issue_flush(buftarg->bt_bdev); -} - STATIC void xfs_close_devices( struct xfs_mount *mp) diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index d2b40dc60dfc..167d23f92ffe 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -87,7 +87,6 @@ struct xfs_buftarg; struct block_device; extern void xfs_flush_inodes(struct xfs_mount *mp); -extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); From 0431d926b399d74f1cde2c355d48289c6d7fa882 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:49 -0700 Subject: [PATCH 081/102] xfs: async blkdev cache flush The new checkpoint cache flush mechanism requires us to issue an unconditional cache flush before we start a new checkpoint. We don't want to block for this if we can help it, and we have a fair chunk of CPU work to do between starting the checkpoint and issuing the first journal IO. 
Hence it makes sense to amortise the latency cost of the cache flush by issuing it asynchronously and then waiting for it only when we need to issue the first IO in the transaction. To do this, we need async cache flush primitives to submit the cache flush bio and to wait on it. The block layer has no such primitives for filesystems, so roll our own for the moment. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bio_io.c | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 2 ++ 2 files changed, 37 insertions(+) diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 17f36db2f792..667e297f59b1 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,6 +9,41 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } +static void +xfs_flush_bdev_async_endio( + struct bio *bio) +{ + complete(bio->bi_private); +} + +/* + * Submit a request for an async cache flush to run. If the request queue does + * not require flush operations, just skip it altogether. If the caller needs + * to wait for the flush completion at a later point in time, they must supply a + * valid completion. This will be signalled when the flush completes. The + * caller never sees the bio that is issued here. 
+ */ +void +xfs_flush_bdev_async( + struct bio *bio, + struct block_device *bdev, + struct completion *done) +{ + struct request_queue *q = bdev->bd_disk->queue; + + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + complete(done); + return; + } + + bio_init(bio, NULL, 0); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + bio->bi_private = done; + bio->bi_end_io = xfs_flush_bdev_async_endio; + + submit_bio(bio); +} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7688663b9773..c174262a074e 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -196,6 +196,8 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); +void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, + struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) From bad77c375e8de6c776c848e443f7dc2d0d909be5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:50 -0700 Subject: [PATCH 082/102] xfs: CIL checkpoint flushes caches unconditionally Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to guarantee the ordering requirements the journal has w.r.t. metadata writeback. THe two ordering constraints are: 1. we cannot overwrite metadata in the journal until we guarantee that the dirty metadata has been written back in place and is stable. 2. we cannot write back dirty metadata until it has been written to the journal and guaranteed to be stable (and hence recoverable) in the journal. These rules apply to the atomic transactions recorded in the journal, not to the journal IO itself. 
Hence we need to ensure metadata is stable before we start writing a new transaction to the journal (guarantee #1), and we need to ensure the entire transaction is stable in the journal before we start metadata writeback (guarantee #2). The ordering guarantees of #1 are currently provided by REQ_PREFLUSH being added to every iclog IO. This causes the journal IO to issue a cache flush and wait for it to complete before issuing the write IO to the journal. Hence all completed metadata IO is guaranteed to be stable before the journal overwrites the old metadata. However, for long running CIL checkpoints that might do a thousand journal IOs, we don't need every single one of these iclog IOs to issue a cache flush - the cache flush done before the first iclog is submitted is sufficient to cover the entire range in the log that the checkpoint will overwrite because the CIL space reservation guarantees the tail of the log (completed metadata) is already beyond the range of the checkpoint write. Hence we only need a full cache flush between closing off the CIL checkpoint context (i.e. when the push switches it out) and issuing the first journal IO. Rather than plumbing this through to the journal IO, we can start this cache flush the moment the CIL context is owned exclusively by the push worker. The cache flush can be in progress while we process the CIL ready for writing, hence reducing the latency of the initial iclog write. This is especially true for large checkpoints, where we might have to process hundreds of thousands of log vectors before we issue the first iclog write. In these cases, it is likely the cache flush has already been completed by the time we have built the CIL log vector chain. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Allison Henderson Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log_cil.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 1e5fd6f268c2..7b8b7ac85ea9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -656,6 +656,8 @@ xlog_cil_push_work( struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; + struct bio bio; + DECLARE_COMPLETION_ONSTACK(bdev_flush); new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -719,10 +721,19 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); /* - * pull all the log vectors off the items in the CIL, and - * remove the items from the CIL. We don't need the CIL lock - * here because it's only needed on the transaction commit - * side which is currently locked out by the flush lock. + * The CIL is stable at this point - nothing new will be added to it + * because we hold the flush lock exclusively. Hence we can now issue + * a cache flush to ensure all the completed metadata in the journal we + * are about to overwrite is on stable storage. + */ + xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, + &bdev_flush); + + /* + * Pull all the log vectors off the items in the CIL, and remove the + * items from the CIL. We don't need the CIL lock here because it's only + * needed on the transaction commit side which is currently locked out + * by the flush lock. */ lv = NULL; num_iovecs = 0; @@ -806,6 +817,12 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; + /* + * Before we format and submit the first iclog, we have to ensure that + * the metadata writeback ordering cache flush is complete. 
+ */ + wait_for_completion(&bdev_flush); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; From 3468bb1ca6e8840789e13c7b9d8b0c556b4fbe79 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:50 -0700 Subject: [PATCH 083/102] xfs: remove need_start_rec parameter from xlog_write() The CIL push is the only call to xlog_write that sets this variable to true. The other callers don't need a start rec, and they tell xlog_write what to do by passing the type of ophdr they need written in the flags field. The need_start_rec parameter essentially tells xlog_write to to write an extra ophdr with a XLOG_START_TRANS type, so get rid of the variable to do this and pass XLOG_START_TRANS as the flag value into xlog_write() from the CIL push. $ size fs/xfs/xfs_log.o* text data bss dec hex filename 27595 560 8 28163 6e03 fs/xfs/xfs_log.o.orig 27454 560 8 28022 6d76 fs/xfs/xfs_log.o.patched Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 44 +++++++++++++++++++++---------------------- fs/xfs/xfs_log_cil.c | 3 ++- fs/xfs/xfs_log_priv.h | 3 +-- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b7f858ade134..bd588a4cdddc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -820,9 +820,7 @@ xlog_wait_on_iclog( static int xlog_write_unmount_record( struct xlog *log, - struct xlog_ticket *ticket, - xfs_lsn_t *lsn, - uint flags) + struct xlog_ticket *ticket) { struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, @@ -839,7 +837,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); + return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); } /* @@ -853,15 +851,13 @@ xlog_unmount_write( struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; - xfs_lsn_t lsn; - uint flags = XLOG_UNMOUNT_TRANS; int error; error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (error) goto out_err; - error = xlog_write_unmount_record(log, tic, &lsn, flags); + error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -1553,8 +1549,7 @@ xlog_commit_record( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, - false); + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); if (error) xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -2151,13 +2146,16 @@ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, struct xfs_log_vec *log_vector, - bool need_start_rec) + uint optype) { struct xfs_log_vec *lv; - int headers = need_start_rec ? 
1 : 0; + int headers = 0; int len = 0; int i; + if (optype & XLOG_START_TRANS) + headers++; + for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2377,8 +2375,7 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags, - bool need_start_rec) + uint optype) { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; @@ -2406,8 +2403,9 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); - *start_lsn = 0; + len = xlog_write_calc_vec_length(ticket, log_vector, optype); + if (start_lsn) + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2421,7 +2419,7 @@ xlog_write( ptr = iclog->ic_datap + log_offset; /* start_lsn is the first lsn written to. That's all we need. */ - if (!*start_lsn) + if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); /* @@ -2434,6 +2432,7 @@ xlog_write( int copy_len; int copy_off; bool ordered = false; + bool wrote_start_rec = false; /* ordered log vectors have no regions to write */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { @@ -2451,13 +2450,15 @@ xlog_write( * write a start record. Only do this for the first * iclog we write to. 
*/ - if (need_start_rec) { + if (optype & XLOG_START_TRANS) { xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); + optype &= ~XLOG_START_TRANS; + wrote_start_rec = true; } - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); if (!ophdr) return -EIO; @@ -2488,14 +2489,13 @@ xlog_write( } copy_len += sizeof(struct xlog_op_header); record_cnt++; - if (need_start_rec) { + if (wrote_start_rec) { copy_len += sizeof(struct xlog_op_header); record_cnt++; - need_start_rec = false; } data_cnt += contwr ? copy_len : 0; - error = xlog_write_copy_finish(log, iclog, flags, + error = xlog_write_copy_finish(log, iclog, optype, &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, @@ -2539,7 +2539,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); + ASSERT(optype & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } else { error = xlog_state_release_iclog(log, iclog); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7b8b7ac85ea9..172bb3551d6b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -823,7 +823,8 @@ xlog_cil_push_work( */ wait_for_completion(&bdev_flush); - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, + XLOG_START_TRANS); if (error) goto out_abort_free_ticket; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ee7786b33da9..56e1942c47df 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -480,8 +480,7 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint flags, - bool need_start_rec); + 
struct xlog_in_core **commit_iclog, uint optype); int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *lsn); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); From eef983ffeae7a1cdde8c3338155ae2dd74b8621b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:51 -0700 Subject: [PATCH 084/102] xfs: journal IO cache flush reductions Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to guarantee the ordering requirements the journal has w.r.t. metadata writeback. The two ordering constraints are: 1. we cannot overwrite metadata in the journal until we guarantee that the dirty metadata has been written back in place and is stable. 2. we cannot write back dirty metadata until it has been written to the journal and guaranteed to be stable (and hence recoverable) in the journal. The ordering guarantees of #1 are provided by REQ_PREFLUSH. This causes the journal IO to issue a cache flush and wait for it to complete before issuing the write IO to the journal. Hence all completed metadata IO is guaranteed to be stable before the journal overwrites the old metadata. The ordering guarantees of #2 are provided by the REQ_FUA, which ensures the journal writes do not complete until they are on stable storage. Hence by the time the last journal IO in a checkpoint completes, we know that the entire checkpoint is on stable storage and we can unpin the dirty metadata and allow it to be written back. This is the mechanism by which ordering was first implemented in XFS way back in 2002 by commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96 ("Add support for drive write cache flushing") in the xfs-archive tree. A lot has changed since then, most notably we now use delayed logging to checkpoint the filesystem to the journal rather than write each individual transaction to the journal. 
Cache flushes on journal IO are necessary when individual transactions are wholly contained within a single iclog. However, CIL checkpoints are single transactions that typically span hundreds to thousands of individual journal writes, and so the requirements for device cache flushing have changed. That is, the ordering rules I state above apply to ordering of atomic transactions recorded in the journal, not to the journal IO itself. Hence we need to ensure metadata is stable before we start writing a new transaction to the journal (guarantee #1), and we need to ensure the entire transaction is stable in the journal before we start metadata writeback (guarantee #2). Hence we only need a REQ_PREFLUSH on the journal IO that starts a new journal transaction to provide #1, and it is not needed on any other journal IO done within the context of that journal transaction. The CIL checkpoint already issues a cache flush before it starts writing to the log, so we no longer need the iclog IO to issue a REQ_PREFLUSH for us. Hence if XLOG_START_TRANS is passed to xlog_write(), we no longer need to mark the first iclog in the log write with REQ_PREFLUSH for this case. As an added bonus, this ordering mechanism works for both internal and external logs, meaning we can remove the explicit data device cache flushes from the iclog write code when using external logs. Given the new ordering semantics of commit records for the CIL, we need iclogs containing commit records to issue a REQ_PREFLUSH. We also require unmount records to do this. Hence for both XLOG_COMMIT_TRANS and XLOG_UNMOUNT_TRANS xlog_write() calls we need to mark the first iclog being written with REQ_PREFLUSH. For both commit records and unmount records, we also want them immediately on stable storage, so we also want to mark the iclogs that contain these records REQ_FUA. 
That means if a record is split across multiple iclogs, they are all marked REQ_FUA and not just the last one so that when the transaction is completed all the parts of the record are on stable storage. And for external logs, unmount records need a pre-write data device cache flush similar to the CIL checkpoint cache pre-flush as the internal iclog write code does not do this implicitly anymore. As an optimisation, when the commit record lands in the same iclog as the journal transaction starts, we don't need to wait for anything and can simply use REQ_FUA to provide guarantee #2. This means that for fsync() heavy workloads, the cache flush behaviour is completely unchanged and there is no degradation in performance as a result of optimising the multi-IO transaction case. The most notable sign that there is less IO latency on my test machine (nvme SSDs) is that the "noiclogs" rate has dropped substantially. This metric indicates that the CIL push is blocking in xlog_get_iclog_space() waiting for iclog IO completion to occur. With 8 iclogs of 256kB, the rate is approximately 1 noiclog event to every 4 iclog writes. IOWs, every 4th call to xlog_get_iclog_space() is blocking waiting for log IO. With the changes in this patch, this drops to 1 noiclog event for every 100 iclog writes. Hence it is clear that log IO is completing much faster than it was previously, but it is also clear that for large iclog sizes, this isn't the performance limiting factor on this hardware. With smaller iclogs (32kB), however, there is a substantial difference. With the cache flush modifications, the journal is now running at over 4000 write IOPS, and the journal throughput is largely identical to the 256kB iclogs and the noiclog event rate stays low at about 1:50 iclog writes. The existing code tops out at about 2500 IOPS as the number of cache flushes dominates performance and latency. 
The noiclog event rate is about 1:4, and the performance variance is quite large as the journal throughput can fall to less than half the peak sustained rate when the cache flush rate prevents metadata writeback from keeping up and the log runs out of space and throttles reservations. As a result: logbsize fsmark create rate rm -rf before 32kb 152851+/-5.3e+04 5m28s patched 32kb 221533+/-1.1e+04 5m24s before 256kb 220239+/-6.2e+03 4m58s patched 256kb 228286+/-9.2e+03 5m06s The rm -rf times are included because I ran them, but the differences are largely noise. This workload is largely metadata read IO latency bound and the changes to the journal cache flushing doesn't really make any noticable difference to behaviour apart from a reduction in noiclog events from background CIL pushing. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 66 +++++++++++++++---------------------------- fs/xfs/xfs_log.h | 1 - fs/xfs/xfs_log_cil.c | 18 +++++++++--- fs/xfs/xfs_log_priv.h | 6 ++++ 4 files changed, 43 insertions(+), 48 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bd588a4cdddc..817c9bfd0cd0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -513,7 +513,7 @@ __xlog_state_release_iclog( * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. 
*/ -static int +int xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog) @@ -533,23 +533,6 @@ xlog_state_release_iclog( return 0; } -void -xfs_log_release_iclog( - struct xlog_in_core *iclog) -{ - struct xlog *log = iclog->ic_log; - bool sync = false; - - if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - if (iclog->ic_state != XLOG_STATE_IOERROR) - sync = __xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - } - - if (sync) - xlog_sync(log, iclog); -} - /* * Mount a log filesystem * @@ -837,6 +820,14 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); + + /* + * For external log devices, we need to flush the data device cache + * first to ensure all metadata writeback is on stable storage before we + * stamp the tail LSN into the unmount record. + */ + if (log->l_targ != log->l_mp->m_ddev_targp) + blkdev_issue_flush(log->l_targ->bt_bdev); return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); } @@ -874,6 +865,11 @@ out_err: else ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_state == XLOG_STATE_IOERROR); + /* + * Ensure the journal is fully flushed and on stable storage once the + * iclog containing the unmount record is written. + */ + iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); error = xlog_state_release_iclog(log, iclog); xlog_wait_on_iclog(iclog); @@ -1755,8 +1751,7 @@ xlog_write_iclog( struct xlog *log, struct xlog_in_core *iclog, uint64_t bno, - unsigned int count, - bool need_flush) + unsigned int count) { ASSERT(bno < log->l_logBBsize); @@ -1794,10 +1789,12 @@ xlog_write_iclog( * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. 
*/ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | - REQ_IDLE | REQ_FUA; - if (need_flush) + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + if (iclog->ic_flags & XLOG_ICL_NEED_FUA) + iclog->ic_bio.bi_opf |= REQ_FUA; + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1900,7 +1897,6 @@ xlog_sync( unsigned int roundoff; /* roundoff to BB or stripe */ uint64_t bno; unsigned int size; - bool need_flush = true, split = false; ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1925,10 +1921,8 @@ xlog_sync( bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); /* Do we need to split this write into 2 parts? */ - if (bno + BTOBB(count) > log->l_logBBsize) { + if (bno + BTOBB(count) > log->l_logBBsize) xlog_split_iclog(log, &iclog->ic_header, bno, count); - split = true; - } /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, @@ -1949,22 +1943,8 @@ xlog_sync( be64_to_cpu(iclog->ic_header.h_lsn)); } #endif - - /* - * Flush the data device before flushing the log to make sure all meta - * data written back from the AIL actually made it to disk before - * stamping the new log tail LSN into the log buffer. For an external - * log we need to issue the flush explicitly, and unfortunately - * synchronously here; for an internal log we can simply use the block - * layer state machine for preflushes. 
- */ - if (log->l_targ != log->l_mp->m_ddev_targp || split) { - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev); - need_flush = false; - } - xlog_verify_iclog(log, iclog, count); - xlog_write_iclog(log, iclog, bno, count, need_flush); + xlog_write_iclog(log, iclog, bno, count); } /* @@ -2418,7 +2398,7 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* start_lsn is the first lsn written to. That's all we need. */ + /* Start_lsn is the first lsn written to. */ if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 044e02cb8921..99f9d6ed9598 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -117,7 +117,6 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 172bb3551d6b..9d2fa8464289 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -890,15 +890,25 @@ restart: /* * If the checkpoint spans multiple iclogs, wait for all previous - * iclogs to complete before we submit the commit_iclog. + * iclogs to complete before we submit the commit_iclog. In this case, + * the commit_iclog write needs to issue a pre-flush so that the + * ordering is correctly preserved down to stable storage. */ + spin_lock(&log->l_icloglock); if (ctx->start_lsn != commit_lsn) { - spin_lock(&log->l_icloglock); xlog_wait_on_iclog(commit_iclog->ic_prev); + spin_lock(&log->l_icloglock); + commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; } - /* release the hounds! 
*/ - xfs_log_release_iclog(commit_iclog); + /* + * The commit iclog must be written to stable storage to guarantee + * journal IO vs metadata writeback IO is correctly ordered on stable + * storage. + */ + commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + xlog_state_release_iclog(log, commit_iclog); + spin_unlock(&log->l_icloglock); return; out_skip: diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 56e1942c47df..2203ccecafb6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -133,6 +133,9 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 +#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ + /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -201,6 +204,7 @@ typedef struct xlog_in_core { u32 ic_size; u32 ic_offset; enum xlog_iclog_state ic_state; + unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ @@ -486,6 +490,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); + /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. This means we From 19f4e7cc819771812a7f527d7897c2deffbf7a00 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:51 -0700 Subject: [PATCH 085/102] xfs: Fix CIL throttle hang when CIL space used going backwards A hang with tasks stuck on the CIL hard throttle was reported and largely diagnosed by Donald Buczek, who discovered that it was a result of the CIL context space usage decrementing in committed transactions once the hard throttle limit had been hit and processes were already blocked. 
This resulted in the CIL push not waking up those waiters because the CIL context was no longer over the hard throttle limit. The surprising aspect of this was the CIL space usage going backwards regularly enough to trigger this situation. Assumptions had been made in design that the relogging process would only increase the size of the objects in the CIL, and so that space would only increase. This change and commit message fixes the issue and documents the result of an audit of the triggers that can cause the CIL space to go backwards, how large the backwards steps tend to be, the frequency with which they occur, and what the impact on the CIL accounting code is. Even though the CIL ctx->space_used can go backwards, it will only do so if the log item is already logged to the CIL and contains a space reservation for its entire logged state. This is tracked by the shadow buffer state on the log item. If the item is not previously logged in the CIL it has no shadow buffer nor log vector, and hence the entire size of the logged item copied to the log vector is accounted to the CIL space usage. i.e. it will always go up in this case. If the item has a log vector (i.e. already in the CIL) and the size decreases, then the existing log vector will be overwritten and the space usage will go down. This is the only condition where the space usage reduces, and it can only occur when an item is already tracked in the CIL. Hence we are safe from CIL space usage underruns as a result of log items decreasing in size when they are relogged. Typically this reduction in CIL usage occurs from metadata blocks being freed, such as when a btree block merge occurs or a directory entry/xattr entry is removed and the da-tree is reduced in size. This generally results in a reduction in size of around a single block in the CIL, but also tends to increase the number of log vectors because the parent and sibling nodes in the tree need to be updated when a btree block is removed.
If a multi-level merge occurs, then we see a reduction in size of 2+ blocks, but again the log vector count goes up. The other vector is inode fork size changes, which only log the current size of the fork and ignore the previously logged size when the fork is relogged. Hence if we are removing items from the inode fork (dir/xattr removal in shortform, extent record removal in extent form, etc) the relogged size of the inode fork can decrease. No other log items can decrease in size, either because they are a fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging an intent actually creates a new intent log item and doesn't relog the old item at all.) Hence the only two vectors for CIL context size reduction are relogging inode forks and marking buffers active in the CIL as stale. Long story short: the majority of the code does the right thing and handles the reduction in log item size correctly, and only the CIL hard throttle implementation is problematic and needs fixing. This patch makes that fix, as well as adds comments in the log item code that result in items shrinking in size when they are relogged as a clear reminder that this can and does happen frequently. The throttle fix is based upon the change Donald proposed, though it goes further to ensure that once the throttle is activated, it captures all tasks until the CIL push issues a wakeup, regardless of whether the CIL space used has gone back under the throttle threshold. This ensures that we prevent tasks reducing the CIL slightly under the throttle threshold and then making more changes that push it well over the throttle limit. This is achieved by checking if the throttle wait queue is already active as a condition of throttling. Hence once we start throttling, we continue to apply the throttle until the CIL context push wakes everything on the wait queue. We can use waitqueue_active() for the waitqueue manipulations and checks as they are all done under the ctx->xc_push_lock.
Hence the waitqueue has external serialisation and we can safely peek inside the wait queue without holding the internal waitqueue locks. Many thanks to Donald for his diagnostic and analysis work to isolate the cause of this hang. Reported-and-tested-by: Donald Buczek Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 37 ++++++++++++++++++------------------- fs/xfs/xfs_inode_item.c | 14 ++++++++++++++ fs/xfs/xfs_log_cil.c | 22 +++++++++++++++++----- 3 files changed, 49 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index fb69879e4b2b..14d1fefcbf4c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -74,14 +74,12 @@ xfs_buf_item_straddle( } /* - * This returns the number of log iovecs needed to log the - * given buf log item. + * Return the number of log iovecs and space needed to log the given buf log + * item segment. * - * It calculates this as 1 iovec for the buf log format structure - * and 1 for each stretch of non-contiguous chunks to be logged. - * Contiguous chunks are logged in a single iovec. - * - * If the XFS_BLI_STALE flag has been set, then log nothing. + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. */ STATIC void xfs_buf_item_size_segment( @@ -168,11 +166,8 @@ slow_scan: } /* - * This returns the number of log iovecs needed to log the given buf log item. - * - * It calculates this as 1 iovec for the buf log format structure and 1 for each - * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged - * in a single iovec. + * Return the number of log iovecs and space needed to log the given buf log + * item. * * Discontiguous buffers need a format structure per region that is being * logged. 
This makes the changes in the buffer appear to log recovery as though @@ -182,7 +177,11 @@ slow_scan: * what ends up on disk. * * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log - * format structures. + * format structures. If the item has previously been logged and has dirty + * regions, we do not relog them in stale buffers. This has the effect of + * reducing the size of the relogged item by the amount of dirty data tracked + * by the log item. This can result in the committing transaction reducing the + * amount of space being consumed by the CIL. */ STATIC void xfs_buf_item_size( @@ -199,9 +198,9 @@ xfs_buf_item_size( ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. + * The buffer is stale, so all we need to log is the buf log + * format structure with the cancel flag in it as we are never + * going to replay the changes tracked in the log item. */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); @@ -216,9 +215,9 @@ xfs_buf_item_size( if (bip->bli_flags & XFS_BLI_ORDERED) { /* - * The buffer has been logged just to order it. - * It is not being included in the transaction - * commit, so no vectors are used at all. + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so no vectors are used at + * all. */ trace_xfs_buf_item_size_ordered(bip); *nvecs = XFS_LOG_VEC_ORDERED; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6764d12342da..5a2dd33020e2 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } +/* + * The logged size of an inode fork is always the current size of the inode + * fork. 
This means that when an inode fork is relogged, the size of the logged + * region is determined by the current state, not the combination of the + * previously logged state + the current state. This is different relogging + * behaviour to most other log items which will retain the size of the + * previously logged changes when smaller regions are relogged. + * + * Hence operations that remove data from the inode fork (e.g. shortform + * dir/attr remove, extent form extent removal, etc), the size of the relogged + * inode gets -smaller- rather than stays the same size as the previously logged + * size and this can result in the committing transaction reducing the amount of + * space being consumed by the CIL. + */ STATIC void xfs_inode_item_data_fork_size( struct xfs_inode_log_item *iip, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9d2fa8464289..903617e6d054 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -670,9 +670,14 @@ xlog_cil_push_work( ASSERT(push_seq <= ctx->sequence); /* - * Wake up any background push waiters now this context is being pushed. + * As we are about to switch to a new, empty CIL context, we no longer + * need to throttle tasks on CIL space overruns. Wake any waiters that + * the hard push throttle may have caught so they can start committing + * to the new context. The ctx->xc_push_lock provides the serialisation + * necessary for safely using the lockless waitqueue_active() check in + * this context. */ - if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); /* @@ -944,7 +949,7 @@ xlog_cil_push_background( ASSERT(!list_empty(&cil->xc_cil)); /* - * don't do a background push if we haven't used up all the + * Don't do a background push if we haven't used up all the * space available yet. 
*/ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { @@ -968,9 +973,16 @@ xlog_cil_push_background( /* * If we are well over the space limit, throttle the work that is being - * done until the push work on this context has begun. + * done until the push work on this context has begun. Enforce the hard + * throttle on all transaction commits once it has been activated, even + * if the committing transactions have resulted in the space usage + * dipping back down under the hard limit. + * + * The ctx->xc_push_lock provides the serialisation necessary for safely + * using the lockless waitqueue_active() check in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || + waitqueue_active(&cil->xc_push_wait)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); ASSERT(cil->xc_ctx->space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); From 5f9b4b0de8dc2fb8eb655463b438001c111570fe Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 08:21:52 -0700 Subject: [PATCH 086/102] xfs: xfs_log_force_lsn isn't passed a LSN In doing an investigation into AIL push stalls, I was looking at the log force code to see if an async CIL push could be done instead. This lead me to xfs_log_force_lsn() and looking at how it works. xfs_log_force_lsn() is only called from inode synchronisation contexts such as fsync(), and it takes the ip->i_itemp->ili_last_lsn value as the LSN to sync the log to. This gets passed to xlog_cil_force_lsn() via xfs_log_force_lsn() to flush the CIL to the journal, and then used by xfs_log_force_lsn() to flush the iclogs to the journal. The problem is that ip->i_itemp->ili_last_lsn does not store a log sequence number. What it stores is passed to it from the ->iop_committing method, which is called by xfs_log_commit_cil(). 
The value this passes to the iop_committing method is the CIL context sequence number that the item was committed to. As it turns out, xlog_cil_force_lsn() converts the sequence to an actual commit LSN for the related context and returns that to xfs_log_force_lsn(). xfs_log_force_lsn() overwrites it's "lsn" variable that contained a sequence with an actual LSN and then uses that to sync the iclogs. This caused me some confusion for a while, even though I originally wrote all this code a decade ago. ->iop_committing is only used by a couple of log item types, and only inode items use the sequence number it is passed. Let's clean up the API, CIL structures and inode log item to call it a sequence number, and make it clear that the high level code is using CIL sequence numbers and not on-disk LSNs for integrity synchronisation purposes. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_types.h | 1 + fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 2 +- fs/xfs/xfs_file.c | 14 +++++++------- fs/xfs/xfs_inode.c | 10 +++++----- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item.h | 2 +- fs/xfs/xfs_log.c | 27 ++++++++++++++------------- fs/xfs/xfs_log.h | 4 +--- fs/xfs/xfs_log_cil.c | 30 +++++++++++------------------- fs/xfs/xfs_log_priv.h | 15 +++++++-------- fs/xfs/xfs_trans.c | 6 +++--- fs/xfs/xfs_trans.h | 4 ++-- 13 files changed, 56 insertions(+), 65 deletions(-) diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 064bd6e8c922..0870ef6f933d 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int64_t xfs_csn_t; /* CIL sequence number */ typedef uint32_t xfs_dablk_t; /* dir/attr block number 
(in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 14d1fefcbf4c..1cb087b320b1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -713,7 +713,7 @@ xfs_buf_item_release( STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_buf_item_release(lip); } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8c1fdf37ee8f..8ed47b739b6c 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release( STATIC void xfs_qm_dquot_logitem_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_qm_dquot_logitem_release(lip); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 62262d69e39d..3d64d99e64f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -119,8 +119,8 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_lsn_t -xfs_fsync_lsn( +static xfs_csn_t +xfs_fsync_seq( struct xfs_inode *ip, bool datasync) { @@ -128,7 +128,7 @@ xfs_fsync_lsn( return 0; if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) return 0; - return ip->i_itemp->ili_last_lsn; + return ip->i_itemp->ili_commit_seq; } /* @@ -151,12 +151,12 @@ xfs_fsync_flush_log( int *log_flushed) { int error = 0; - xfs_lsn_t lsn; + xfs_csn_t seq; xfs_ilock(ip, XFS_ILOCK_SHARED); - lsn = xfs_fsync_lsn(ip, datasync); - if (lsn) { - error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, + seq = xfs_fsync_seq(ip, datasync); + if (seq) { + error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, log_flushed); spin_lock(&ip->i_itemp->ili_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3bee1cd20072..9ae5d2968274 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2633,7 +2633,7 @@ xfs_iunpin( trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - 
xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); + xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); } @@ -3647,16 +3647,16 @@ int xfs_log_force_inode( struct xfs_inode *ip) { - xfs_lsn_t lsn = 0; + xfs_csn_t seq = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; + seq = ip->i_itemp->ili_commit_seq; xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!lsn) + if (!seq) return 0; - return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); } /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 5a2dd33020e2..35de30849fcc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -643,9 +643,9 @@ xfs_inode_item_committed( STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { - INODE_ITEM(lip)->ili_last_lsn = commit_lsn; + INODE_ITEM(lip)->ili_commit_seq = seq; return xfs_inode_item_release(lip); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4b926e32831c..403b45ab9aa2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -33,7 +33,7 @@ struct xfs_inode_log_item { unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + xfs_csn_t ili_commit_seq; /* last transaction commit */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 817c9bfd0cd0..6f9cafd581d9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3252,14 +3252,13 @@ out_error: } static int -__xfs_log_force_lsn( - struct xfs_mount *mp, +xlog_force_lsn( + struct xlog *log, xfs_lsn_t lsn, uint flags, int *log_flushed, bool already_slept) { - struct xlog *log = mp->m_log; struct xlog_in_core *iclog; spin_lock(&log->l_icloglock); 
@@ -3292,8 +3291,6 @@ __xfs_log_force_lsn( if (!already_slept && (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); return -EAGAIN; @@ -3331,25 +3328,29 @@ out_error: * to disk, that thread will wake up all threads waiting on the queue. */ int -xfs_log_force_lsn( +xfs_log_force_seq( struct xfs_mount *mp, - xfs_lsn_t lsn, + xfs_csn_t seq, uint flags, int *log_flushed) { + struct xlog *log = mp->m_log; + xfs_lsn_t lsn; int ret; - ASSERT(lsn != 0); + ASSERT(seq != 0); XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); + trace_xfs_log_force(mp, seq, _RET_IP_); - lsn = xlog_cil_force_lsn(mp->m_log, lsn); + lsn = xlog_cil_force_seq(log, seq); if (lsn == NULLCOMMITLSN) return 0; - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); - if (ret == -EAGAIN) - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) { + XFS_STATS_INC(mp, xs_log_force_sleep); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); + } return ret; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 99f9d6ed9598..813b972e9788 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -106,7 +106,7 @@ struct xfs_item_ops; struct xfs_trans; int xfs_log_force(struct xfs_mount *mp, uint flags); -int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, +int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, @@ -131,8 +131,6 @@ bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, bool regrant); void 
xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 903617e6d054..3c2b1205944d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -788,7 +788,7 @@ xlog_cil_push_work( * that higher sequences will wait for us to write out a commit record * before they do. * - * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * xfs_log_force_seq requires us to mirror the new sequence into the cil * structure atomically with the addition of this sequence to the * committing list. This also ensures that we can do unlocked checks * against the current sequence in log forces without risking @@ -1057,16 +1057,14 @@ xlog_cil_empty( * allowed again. */ void -xfs_log_commit_cil( - struct xfs_mount *mp, +xlog_cil_commit( + struct xlog *log, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, + xfs_csn_t *commit_seq, bool regrant) { - struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; - xfs_lsn_t xc_commit_lsn; /* * Do all necessary memory allocation before we lock the CIL. @@ -1080,10 +1078,6 @@ xfs_log_commit_cil( xlog_cil_insert_items(log, tp); - xc_commit_lsn = cil->xc_ctx->sequence; - if (commit_lsn) - *commit_lsn = xc_commit_lsn; - if (regrant && !XLOG_FORCED_SHUTDOWN(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else @@ -1106,8 +1100,10 @@ xfs_log_commit_cil( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); if (lip->li_ops->iop_committing) - lip->li_ops->iop_committing(lip, xc_commit_lsn); + lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence); } + if (commit_seq) + *commit_seq = cil->xc_ctx->sequence; /* xlog_cil_push_background() releases cil->xc_ctx_lock */ xlog_cil_push_background(log); @@ -1124,9 +1120,9 @@ xfs_log_commit_cil( * iclog flush is necessary following this call. 
*/ xfs_lsn_t -xlog_cil_force_lsn( +xlog_cil_force_seq( struct xlog *log, - xfs_lsn_t sequence) + xfs_csn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; @@ -1222,21 +1218,17 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; if (list_empty(&lip->li_cil)) return false; - ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) - return false; - return true; + return lip->li_seq == ctx->sequence; } /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2203ccecafb6..2d7e7cbee8b7 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -234,7 +234,7 @@ struct xfs_cil; struct xfs_cil_ctx { struct xfs_cil *cil; - xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_ticket *ticket; /* chkpt ticket */ @@ -272,10 +272,10 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; - xfs_lsn_t xc_push_seq; + xfs_csn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; - xfs_lsn_t xc_current_sequence; + xfs_csn_t xc_current_sequence; struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; @@ -554,19 +554,18 @@ int xlog_cil_init(struct xlog *log); void xlog_cil_init_post_recovery(struct xlog *log); void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); +void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, + xfs_csn_t *commit_seq, bool regrant); /* * CIL force routines */ 
-xfs_lsn_t -xlog_cil_force_lsn( - struct xlog *log, - xfs_lsn_t sequence); +xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); static inline void xlog_cil_force(struct xlog *log) { - xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); + xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence); } /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 586f2992b789..87bffd12c20c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -839,7 +839,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; - xfs_lsn_t commit_lsn = -1; + xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -881,7 +881,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); + xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -890,7 +890,7 @@ __xfs_trans_commit( * log out now and wait for it. */ if (sync) { - error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL); XFS_STATS_INC(mp, xs_trans_sync); } else { XFS_STATS_INC(mp, xs_trans_async); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ee42d98d9011..50da47f23a07 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -43,7 +43,7 @@ struct xfs_log_item { struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ - xfs_lsn_t li_seq; /* CIL commit seq */ + xfs_csn_t li_seq; /* CIL commit seq */ }; /* @@ -69,7 +69,7 @@ struct xfs_item_ops { void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); - void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); + void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); void (*iop_release)(struct xfs_log_item *); 
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); int (*iop_recover)(struct xfs_log_item *lip, From 956f6daa84bf50dd5bd13a64b57cae446bca3899 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 18 Jun 2021 11:57:05 -0700 Subject: [PATCH 087/102] xfs: add iclog state trace events For the DEBUGS! Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 18 +++++++++++++ fs/xfs/xfs_log_priv.h | 10 ++++++++ fs/xfs/xfs_trace.h | 60 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6f9cafd581d9..596cd1640bab 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -502,6 +502,7 @@ __xlog_state_release_iclog( iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ + trace_xlog_iclog_syncing(iclog, _RET_IP_); return true; } @@ -520,6 +521,7 @@ xlog_state_release_iclog( { lockdep_assert_held(&log->l_icloglock); + trace_xlog_iclog_release(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO; @@ -781,6 +783,7 @@ xlog_wait_on_iclog( { struct xlog *log = iclog->ic_log; + trace_xlog_iclog_wait_on(iclog, _RET_IP_); if (!XLOG_FORCED_SHUTDOWN(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { @@ -1754,6 +1757,7 @@ xlog_write_iclog( unsigned int count) { ASSERT(bno < log->l_logBBsize); + trace_xlog_iclog_write(iclog, _RET_IP_); /* * We lock the iclogbufs here so that we can serialise against I/O @@ -1899,6 +1903,7 @@ xlog_sync( unsigned int size; ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync(iclog, _RET_IP_); count = xlog_calc_iclog_size(log, iclog, &roundoff); @@ -2535,6 +2540,7 @@ xlog_state_activate_iclog( int *iclogs_changed) { ASSERT(list_empty_careful(&iclog->ic_callbacks)); + trace_xlog_iclog_activate(iclog, _RET_IP_); /* * If 
the number of ops in this iclog indicate it just contains the @@ -2624,6 +2630,8 @@ xlog_state_clean_iclog( { int iclogs_changed = 0; + trace_xlog_iclog_clean(dirty_iclog, _RET_IP_); + dirty_iclog->ic_state = XLOG_STATE_DIRTY; xlog_state_activate_iclogs(log, &iclogs_changed); @@ -2683,6 +2691,7 @@ xlog_state_set_callback( struct xlog_in_core *iclog, xfs_lsn_t header_lsn) { + trace_xlog_iclog_callback(iclog, _RET_IP_); iclog->ic_state = XLOG_STATE_CALLBACK; ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), @@ -2764,6 +2773,7 @@ xlog_state_do_iclog_callbacks( __releases(&log->l_icloglock) __acquires(&log->l_icloglock) { + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); spin_unlock(&log->l_icloglock); spin_lock(&iclog->ic_callback_lock); while (!list_empty(&iclog->ic_callbacks)) { @@ -2783,6 +2793,7 @@ xlog_state_do_iclog_callbacks( */ spin_lock(&log->l_icloglock); spin_unlock(&iclog->ic_callback_lock); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); } STATIC void @@ -2874,6 +2885,7 @@ xlog_state_done_syncing( spin_lock(&log->l_icloglock); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync_done(iclog, _RET_IP_); /* * If we got an error, either on the first buffer, or in the case of @@ -2947,6 +2959,8 @@ restart: atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; + trace_xlog_iclog_get_space(iclog, _RET_IP_); + /* On the 1st write to an iclog, figure out lsn. This works * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are * committing to. 
If the offset is set, that's how many blocks @@ -3112,6 +3126,7 @@ xlog_state_switch_iclogs( { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); assert_spin_locked(&log->l_icloglock); + trace_xlog_iclog_switch(iclog, _RET_IP_); if (!eventual_size) eventual_size = iclog->ic_offset; @@ -3194,6 +3209,8 @@ xfs_log_force( if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; + trace_xlog_iclog_force(iclog, _RET_IP_); + if (iclog->ic_state == XLOG_STATE_DIRTY || (iclog->ic_state == XLOG_STATE_ACTIVE && atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { @@ -3267,6 +3284,7 @@ xlog_force_lsn( goto out_error; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) goto out_unlock; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2d7e7cbee8b7..293d82b1fc0d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -50,6 +50,16 @@ enum xlog_iclog_state { XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ }; +#define XLOG_STATE_STRINGS \ + { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \ + { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \ + { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ + { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ + { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ + { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ + { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } + + /* * Log ticket flags */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 71dca776c110..28d570742000 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -24,6 +24,7 @@ struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xlog_rec_header; +struct xlog_in_core; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -3927,6 +3928,65 @@ DEFINE_EVENT(xfs_icwalk_class, name, \ DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); DEFINE_ICWALK_EVENT(xfs_blockgc_free_space); 
+TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE); +TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC); +TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); +TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); +TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); +TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); +TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR); + +DECLARE_EVENT_CLASS(xlog_iclog_class, + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), + TP_ARGS(iclog, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, state) + __field(int32_t, refcount) + __field(uint32_t, offset) + __field(unsigned long long, lsn) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = iclog->ic_log->l_mp->m_super->s_dev; + __entry->state = iclog->ic_state; + __entry->refcount = atomic_read(&iclog->ic_refcnt); + __entry->offset = iclog->ic_offset; + __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->state, XLOG_STATE_STRINGS), + __entry->refcount, + __entry->offset, + __entry->lsn, + (char *)__entry->caller_ip) + +); + +#define DEFINE_ICLOG_EVENT(name) \ +DEFINE_EVENT(xlog_iclog_class, name, \ + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \ + TP_ARGS(iclog, caller_ip)) + +DEFINE_ICLOG_EVENT(xlog_iclog_activate); +DEFINE_ICLOG_EVENT(xlog_iclog_clean); +DEFINE_ICLOG_EVENT(xlog_iclog_callback); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done); +DEFINE_ICLOG_EVENT(xlog_iclog_force); +DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn); +DEFINE_ICLOG_EVENT(xlog_iclog_get_space); +DEFINE_ICLOG_EVENT(xlog_iclog_release); +DEFINE_ICLOG_EVENT(xlog_iclog_switch); +DEFINE_ICLOG_EVENT(xlog_iclog_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_syncing); +DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); +DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); 
+DEFINE_ICLOG_EVENT(xlog_iclog_write); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH From ff7bebeb91f8cc2e26e7dabbf301da5ec0e9328c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Jun 2021 11:57:05 -0700 Subject: [PATCH 088/102] xfs: refactor the inode recycling code Hoist the code in xfs_iget_cache_hit that restores the VFS inode state to an xfs_inode that was previously vfs-destroyed. The next patch will add a new set of state flags, so we need the helper to avoid duplication. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 143 +++++++++++++++++++++++++------------------- fs/xfs/xfs_trace.h | 4 +- 2 files changed, 83 insertions(+), 64 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4e4682879bbd..37229517c8f7 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -355,14 +355,14 @@ xfs_reinit_inode( struct xfs_mount *mp, struct inode *inode) { - int error; - uint32_t nlink = inode->i_nlink; - uint32_t generation = inode->i_generation; - uint64_t version = inode_peek_iversion(inode); - umode_t mode = inode->i_mode; - dev_t dev = inode->i_rdev; - kuid_t uid = inode->i_uid; - kgid_t gid = inode->i_gid; + int error; + uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; + uint64_t version = inode_peek_iversion(inode); + umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -376,6 +376,74 @@ xfs_reinit_inode( return error; } +/* + * Carefully nudge an inode whose VFS state has been torn down back into a + * usable state. Drops the i_flags_lock and the rcu read lock. 
+ */ +static int +xfs_iget_recycle( + struct xfs_perag *pag, + struct xfs_inode *ip) __releases(&ip->i_flags_lock) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int error; + + trace_xfs_iget_recycle(ip); + + /* + * We need to make it look like the inode is being reclaimed to prevent + * the actual reclaim workers from stomping over us while we recycle + * the inode. We can't clear the radix tree tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + error = xfs_reinit_inode(mp, inode); + if (error) { + bool wake; + + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + wake = !!__xfs_iflags_test(ip, XFS_INEW); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + trace_xfs_iget_recycle_fail(ip); + return error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now effectively + * a new inode and need to return to the initial state before reuse + * occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + + return 0; +} + /* * If we are allocating a new inode, then check what was returned is * actually a free, empty inode. 
If we are not allocating an inode, @@ -450,7 +518,7 @@ xfs_iget_cache_hit( /* * If we are racing with another cache hit that is currently * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete + * reclaimable state, wait for the initialisation to complete * before continuing. * * XXX(hch): eventually we should do something equivalent to @@ -472,64 +540,16 @@ xfs_iget_cache_hit( if (error) goto out_error; - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - if (flags & XFS_IGET_INCORE) { error = -EAGAIN; goto out_error; } - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - error = xfs_reinit_inode(mp, inode); - if (error) { - bool wake; - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - wake = !!__xfs_iflags_test(ip, XFS_INEW); - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - if (wake) - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. 
- */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - xfs_perag_clear_inode_tag(pag, - XFS_INO_TO_AGINO(pag->pag_mount, ino), - XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); + /* Drops i_flags_lock and RCU read lock. */ + error = xfs_iget_recycle(pag, ip); + if (error) + return error; } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -559,7 +579,6 @@ out_error: return error; } - static int xfs_iget_cache_miss( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 28d570742000..a442bc4dfdc4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -631,8 +631,8 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) DEFINE_INODE_EVENT(xfs_iget_skip); -DEFINE_INODE_EVENT(xfs_iget_reclaim); -DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_recycle); +DEFINE_INODE_EVENT(xfs_iget_recycle_fail); DEFINE_INODE_EVENT(xfs_iget_hit); DEFINE_INODE_EVENT(xfs_iget_miss); From 77b4d2861e8381d00e4b9bd1be2a355dda99ff60 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Jun 2021 11:57:06 -0700 Subject: [PATCH 089/102] xfs: separate primary inode selection criteria in xfs_iget_cache_hit During review of the v6 deferred inode inactivation patchset[1], Dave commented that _cache_hit should have a clear separation between inode selection criteria and actions performed on a selected inode. Move a hunk to make this true, and compact the shrink cases in the function. [1] https://lore.kernel.org/linux-xfs/162310469340.3465262.504398465311182657.stgit@locust/T/#mca6d958521cb88bbc1bfe1a30767203328d410b5 Signed-off-by: Darrick J. 
Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_icache.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 37229517c8f7..6b44fc734cb5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -507,13 +507,8 @@ xfs_iget_cache_hit( * will not match, so check for that, too. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } - + if (ip->i_ino != ino) + goto out_skip; /* * If we are racing with another cache hit that is currently @@ -525,12 +520,8 @@ xfs_iget_cache_hit( * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + goto out_skip; /* * Check the inode free state is valid. This also detects lookup @@ -540,23 +531,21 @@ xfs_iget_cache_hit( if (error) goto out_error; - if (ip->i_flags & XFS_IRECLAIMABLE) { - if (flags & XFS_IGET_INCORE) { - error = -EAGAIN; - goto out_error; - } + /* Skip inodes that have no vfs state. */ + if ((flags & XFS_IGET_INCORE) && + (ip->i_flags & XFS_IRECLAIMABLE)) + goto out_skip; + /* The inode fits the selection criteria; process it. */ + if (ip->i_flags & XFS_IRECLAIMABLE) { /* Drops i_flags_lock and RCU read lock. */ error = xfs_iget_recycle(pag, ip); if (error) return error; } else { /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = -EAGAIN; - goto out_error; - } + if (!igrab(inode)) + goto out_skip; /* We've got a live one. 
*/ spin_unlock(&ip->i_flags_lock); @@ -573,6 +562,10 @@ xfs_iget_cache_hit( return 0; +out_skip: + trace_xfs_iget_skip(ip); + XFS_STATS_INC(mp, xs_ig_frecycle); + error = -EAGAIN; out_error: spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); From 10be350b8c6c426b82d4df937f25b37eabdc3d67 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Jun 2021 11:57:06 -0700 Subject: [PATCH 090/102] xfs: fix type mismatches in the inode reclaim functions It's currently unlikely that we will ever end up with more than 4 billion inodes waiting for reclamation, but the fs object code uses long int for object counts and we're certainly capable of generating that many. Instead of truncating the internal counters, widen them and report the object counts correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 8 ++++---- fs/xfs/xfs_icache.h | 6 +++--- fs/xfs/xfs_trace.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6b44fc734cb5..6007683482c6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1084,11 +1084,11 @@ xfs_reclaim_inodes( long xfs_reclaim_inodes_nr( struct xfs_mount *mp, - int nr_to_scan) + unsigned long nr_to_scan) { struct xfs_icwalk icw = { .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, - .icw_scan_limit = nr_to_scan, + .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), }; if (xfs_want_reclaim_sick(mp)) @@ -1106,13 +1106,13 @@ xfs_reclaim_inodes_nr( * Return the number of reclaimable inodes in the filesystem for * the shrinker to determine how much to reclaim. 
*/ -int +long xfs_reclaim_inodes_count( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t ag = 0; - int reclaimable = 0; + long reclaimable = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { ag = pag->pag_agno + 1; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 00dc98a92835..c751cc32dc46 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -15,7 +15,7 @@ struct xfs_icwalk { kgid_t icw_gid; prid_t icw_prid; __u64 icw_min_file_size; - int icw_scan_limit; + long icw_scan_limit; }; /* Flags that reflect xfs_fs_eofblocks functionality. */ @@ -49,8 +49,8 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work); void xfs_reclaim_inodes(struct xfs_mount *mp); -int xfs_reclaim_inodes_count(struct xfs_mount *mp); -long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan); void xfs_inode_mark_reclaimable(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a442bc4dfdc4..f9d8d605f9b1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3895,7 +3895,7 @@ DECLARE_EVENT_CLASS(xfs_icwalk_class, __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) - __field(int, scan_limit) + __field(long, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -3910,7 +3910,7 @@ DECLARE_EVENT_CLASS(xfs_icwalk_class, __entry->scan_limit = icw ? icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid, From 3a1c3abe89710c60c98a8f59a5f16e5dfe249e49 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 18 Jun 2021 11:57:07 -0700 Subject: [PATCH 091/102] xfs: print name of function causing fs shutdown instead of hex pointer In xfs_do_force_shutdown, print the symbolic name of the function that called us to shut down the filesystem instead of a raw hex pointer. This makes debugging a lot easier: XFS (sda): xfs_do_force_shutdown(0x2) called from line 2440 of file fs/xfs/xfs_log.c. Return address = ffffffffa038bc38 becomes: XFS (sda): xfs_do_force_shutdown(0x2) called from line 2440 of file fs/xfs/xfs_log.c. Return address = xfs_trans_mod_sb+0x25 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 07c745cd483e..b7f979eca1e2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -543,7 +543,7 @@ xfs_do_force_shutdown( } xfs_notice(mp, -"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, +"%s(0x%x) called from line %d of file %s. Return address = %pS", __func__, flags, lnnum, fname, __return_address); if (flags & SHUTDOWN_CORRUPT_INCORE) { From c06ad17cfa0bac3b51c9b3448a843860d29bc85a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 21 Jun 2021 10:01:14 -0700 Subject: [PATCH 092/102] xfs: shorten the shutdown messages to a single line Consolidate the shutdown messages to a single line containing the reason, the passed-in flags, the source of the shutdown, and the end result. This means we now only have one line to look for when debugging, which is useful when the fs goes down while something else is flooding dmesg. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b7f979eca1e2..6ed29b158312 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -538,25 +538,25 @@ xfs_do_force_shutdown( if (flags & SHUTDOWN_FORCE_UMOUNT) { xfs_alert(mp, -"User initiated shutdown received. Shutting down filesystem"); +"User initiated shutdown (0x%x) received. Shutting down filesystem", + flags); return; } - xfs_notice(mp, -"%s(0x%x) called from line %d of file %s. Return address = %pS", - __func__, flags, lnnum, fname, __return_address); - if (flags & SHUTDOWN_CORRUPT_INCORE) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, -"Corruption of in-memory data detected. Shutting down filesystem"); +"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem", + flags, __return_address, fname, lnnum); if (XFS_ERRLEVEL_HIGH <= xfs_error_level) xfs_stack_trace(); } else if (logerror) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); +"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", + flags, __return_address, fname, lnnum); } else { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); +"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", + flags, __return_address, fname, lnnum); } xfs_alert(mp, From 81ed94751b1513fcc5978dcc06eb1f5b4e55a785 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Jun 2021 11:57:07 -0700 Subject: [PATCH 093/102] xfs: fix log intent recovery ENOSPC shutdowns when inactivating inodes During regular operation, the xfs_inactive operations create transactions with zero block reservation because in general we're freeing space, not asking for more. 
The per-AG space reservations created at mount time enable us to handle expansions of the refcount btree without needing to reserve blocks to the transaction. Unfortunately, log recovery doesn't create the per-AG space reservations when intent items are being recovered. This isn't an issue for intent item recovery itself because they explicitly request blocks, but any inode inactivation that can happen during log recovery uses the same xfs_inactive paths as regular runtime. If a refcount btree expansion happens, the transaction will fail due to blk_res_used > blk_res, and we shut down the filesystem unnecessarily. Fix this problem by making per-AG reservations temporarily so that we can handle the inactivations, and releasing them at the end. This brings the recovery environment closer to the runtime environment. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_mount.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c3a96fb3ad80..d0755494597f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -859,9 +859,17 @@ xfs_mountfs( /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently - * read in. + * read in. Temporarily create per-AG space reservations for metadata + * btree shape changes because space freeing transactions (for inode + * inactivation) require the per-AG reservation in lieu of reserving + * blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error == -ENOSPC) + xfs_warn(mp, + "ENOSPC reserving per-AG metadata pool, log recovery may fail."); error = xfs_log_mount_finish(mp); + xfs_fs_unreserve_ag_blocks(mp); if (error) { xfs_warn(mp, "log mount finish failed"); goto out_rtunmount; From 4e6b8270c820c8c57a73f869799a0af2b56eff3e Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 18 Jun 2021 11:57:07 -0700 Subject: [PATCH 094/102] xfs: force the log offline when log intent item recovery fails If any part of log intent item recovery fails, we should shut down the log immediately to stop the log from writing a clean unmount record to disk, because the metadata is not consistent. The inability to cancel a dirty transaction catches most of these cases, but there are a few things that have slipped through the cracks, such as ENOSPC from a transaction allocation, or runtime errors that result in cancellation of a non-dirty transaction. This solves some weird behaviors reported by customers where a system goes down, the first mount fails, the second succeeds, but then the fs goes down later because of inconsistent metadata. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 3 +++ fs/xfs/xfs_log_recover.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 596cd1640bab..e93cac6b5378 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -755,6 +755,9 @@ xfs_log_mount_finish( if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; + /* Make sure the log is dead if we're returning failure. */ + ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + return error; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1227503d2246..1721fce2ec94 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2458,8 +2458,10 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); - if (error) + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; + } /* * Transfer to this new transaction all the dfops we captured @@ -3449,6 +3451,7 @@ xlog_recover_finish( * this) before we get around to xfs_log_mount_cancel. 
*/ xlog_recover_cancel_intents(log); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); xfs_alert(log->l_mp, "Failed to recover intents"); return error; } From 84d8949e770745b16a7e8a68dcb1d0f3687bdee9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 21 Jun 2021 09:43:14 -0700 Subject: [PATCH 095/102] xfs: hold buffer across unpin and potential shutdown processing The special processing used to simulate a buffer I/O failure on fs shutdown has a difficult to reproduce race that can result in a use after free of the associated buffer. Consider a buffer that has been committed to the on-disk log and thus is AIL resident. The buffer lands on the writeback delwri queue, but is subsequently locked, committed and pinned by another transaction before it is submitted for I/O. At this point, the buffer is stuck on the delwri queue as it cannot be submitted for I/O until it is unpinned. A log checkpoint I/O failure occurs sometime later, which aborts the bli. The unpin handler is called with the aborted log item, drops the bli reference count, the pin count, and falls into the I/O failure simulation path. The potential problem here is that once the pin count falls to zero in ->iop_unpin(), xfsaild is free to retry delwri submission of the buffer at any time, before the unpin handler even completes. If delwri queue submission wins the race to the buffer lock, it observes the shutdown state and simulates the I/O failure itself. This releases both the bli and delwri queue holds and frees the buffer while xfs_buf_item_unpin() sits on xfs_buf_lock() waiting to run through the same failure sequence. This problem is rare and requires many iterations of fstest generic/019 (which simulates disk I/O failures) to reproduce. To avoid this problem, grab a hold on the buffer before the log item is unpinned if the associated item has been aborted and will require a simulated I/O failure.
The hold is already required for the simulated I/O failure, so the ordering simply guarantees the unpin handler access to the buffer before it is unpinned and thus processed by the AIL. This particular ordering is required so long as the AIL does not acquire a reference on the bli, which is the long term solution to this problem. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1cb087b320b1..464587c5a2cb 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -474,17 +474,8 @@ xfs_buf_item_pin( } /* - * This is called to unpin the buffer associated with the buf log - * item which was previously pinned with a call to xfs_buf_item_pin(). - * - * Also drop the reference to the buf item for the current transaction. - * If the XFS_BLI_STALE flag is set and we are the last reference, - * then free up the buf log item and unlock the buffer. - * - * If the remove flag is set we are called from uncommit in the - * forced-shutdown path. If that is true and the reference count on - * the log item is going to drop to zero we need to free the item's - * descriptor in the transaction. + * This is called to unpin the buffer associated with the buf log item which + * was previously pinned with a call to xfs_buf_item_pin(). */ STATIC void xfs_buf_item_unpin( @@ -501,12 +492,26 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin(bip); + /* + * Drop the bli ref associated with the pin and grab the hold required + * for the I/O simulation failure in the abort case. We have to do this + * before the pin count drops because the AIL doesn't acquire a bli + * reference. Therefore if the refcount drops to zero, the bli could + * still be AIL resident and the buffer submitted for I/O (and freed on + * completion) at any point before we return. 
This can be removed once + * the AIL properly holds a reference on the bli. + */ freed = atomic_dec_and_test(&bip->bli_refcount); - + if (freed && !stale && remove) + xfs_buf_hold(bp); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - if (freed && stale) { + /* nothing to do but drop the pin count if the bli is active */ + if (!freed) + return; + + if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_flags & XBF_STALE); @@ -549,13 +554,13 @@ xfs_buf_item_unpin( ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); - } else if (freed && remove) { + } else if (remove) { /* * The buffer must be locked and held by the caller to simulate - * an async I/O failure. + * an async I/O failure. We acquired the hold for this case + * before the buffer was unpinned. */ xfs_buf_lock(bp); - xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); } From e53d3aa0b605c49d780e1b2fd0b49dba4154f32b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 21 Jun 2021 09:43:14 -0700 Subject: [PATCH 096/102] xfs: remove dead stale buf unpin handling code This code goes back to a time when transaction commits wrote directly to iclogs. The associated log items were pinned, written to the log, and then "uncommitted" if some part of the log write had failed. This uncommit sequence called an ->iop_unpin_remove() handler that was eventually folded into ->iop_unpin() via the remove parameter. The log subsystem has since changed significantly in that transactions commit to the CIL instead of direct to iclogs, though log items must still be aborted in the event of an eventual log I/O error. However, the context for a log item abort is now asynchronous from transaction commit, which means the committing transaction has been freed by this point in time and the transaction uncommit sequence of events is no longer relevant. 
Further, since stale buffers remain locked at transaction commit through unpin, we can be certain that the buffer is not associated with any transaction when the unpin callback executes. Remove this unused hunk of code and replace it with an assertion that the buffer is disassociated from transaction context. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 464587c5a2cb..2828ce45b701 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -516,28 +516,11 @@ xfs_buf_item_unpin( ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_flags & XBF_STALE); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!bp->b_transp); trace_xfs_buf_item_unpin_stale(bip); - if (remove) { - /* - * If we are in a transaction context, we have to - * remove the log item from the transaction as we are - * about to release our reference to the buffer. If we - * don't, the unlock that occurs later in - * xfs_trans_uncommit() will try to reference the - * buffer which we no longer have a hold on. - */ - if (!list_empty(&lip->li_trans)) - xfs_trans_del_item(lip); - - /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. - */ - bp->b_transp = NULL; - } - /* * If we get called here because of an IO error, we may or may * not have the item on the AIL. xfs_trans_ail_delete() will From a8f3522c9a1f4a31e93b17f2b5310a2b615f5581 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 21 Jun 2021 17:39:09 -0700 Subject: [PATCH 097/102] xfs: fix endianness issue in xfs_ag_shrink_space The AGI buffer is in big-endian format, so we must convert the endianness to CPU format to do any comparisons. Fixes: 46141dc891f7 ("xfs: introduce xfs_ag_shrink_space()") Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Gao Xiang --- fs/xfs/libxfs/xfs_ag.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 0765a0ba30e1..778ec52cce70 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -779,6 +779,7 @@ xfs_ag_shrink_space( struct xfs_buf *agibp, *agfbp; struct xfs_agi *agi; struct xfs_agf *agf; + xfs_agblock_t aglen; int error, err2; ASSERT(agno == mp->m_sb.sb_agcount - 1); @@ -793,14 +794,14 @@ xfs_ag_shrink_space( return error; agf = agfbp->b_addr; + aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) return -EFSCORRUPTED; - if (delta >= agi->agi_length) + if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agi->agi_length) - delta); + args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); /* * Disable perag reservations so it doesn't cause the allocation request From d3a3340b6af28ab79a66687973fb0287d976d490 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 25 Jun 2021 11:19:58 -0700 Subject: [PATCH 098/102] xfs: Initialize error in xfs_attr_remove_iter A recent bug report generated a warning that a code path in xfs_attr_remove_iter could potentially return error uninitialized in the case of XFS_DAS_RM_SHRINK state. Fix this by initializing error. Signed-off-by: Allison Henderson Reported-by: Dan Carpenter Reviewed-by: Brian Foster Reviewed-by: Bill O'Donnell Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 611dc67234a6..d9d7d5137b73 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1375,7 +1375,7 @@ xfs_attr_remove_iter( { struct xfs_da_args *args = dac->da_args; struct xfs_da_state *state = dac->da_state; - int retval, error; + int retval, error = 0; struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); From 6be001021f0b307c8c1544e8b3ac87de20d711de Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 25 Jun 2021 11:21:00 -0700 Subject: [PATCH 099/102] xfs: don't nest icloglock inside ic_callback_lock It's completely unnecessary because callbacks are added to iclogs without holding the icloglock, hence no amount of ordering between the icloglock and ic_callback_lock will order the removal of callbacks from the iclog. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e93cac6b5378..bb4390942275 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2773,11 +2773,8 @@ static void xlog_state_do_iclog_callbacks( struct xlog *log, struct xlog_in_core *iclog) - __releases(&log->l_icloglock) - __acquires(&log->l_icloglock) { trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); - spin_unlock(&log->l_icloglock); spin_lock(&iclog->ic_callback_lock); while (!list_empty(&iclog->ic_callbacks)) { LIST_HEAD(tmp); @@ -2789,12 +2786,6 @@ xlog_state_do_iclog_callbacks( spin_lock(&iclog->ic_callback_lock); } - /* - * Pick up the icloglock while still holding the callback lock so we - * serialise against anyone trying to add more callbacks to this iclog - * now we've finished processing. 
- */ - spin_lock(&log->l_icloglock); spin_unlock(&iclog->ic_callback_lock); trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); } @@ -2836,13 +2827,12 @@ xlog_state_do_callback( iclog = iclog->ic_next; continue; } + spin_unlock(&log->l_icloglock); - /* - * Running callbacks will drop the icloglock which means - * we'll have to run at least one more complete loop. - */ - cycled_icloglock = true; xlog_state_do_iclog_callbacks(log, iclog); + cycled_icloglock = true; + + spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) wake_up_all(&iclog->ic_force_wait); else From b6903358c230c517b29ecdb6123276d96cc0beab Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 25 Jun 2021 11:21:01 -0700 Subject: [PATCH 100/102] xfs: remove callback dequeue loop from xlog_state_do_iclog_callbacks If we are processing callbacks on an iclog, nothing can be concurrently adding callbacks to the loop. We only add callbacks to the iclog when they are in ACTIVE or WANT_SYNC state, and we explicitly do not add callbacks if the iclog is already in IOERROR state. The only way to have a dequeue racing with an enqueue is to be processing a shutdown without a direct reference to an iclog in ACTIVE or WANT_SYNC state. As the enqueue avoids this race condition, we only ever need a single dequeue operation in xlog_state_do_iclog_callbacks(). Hence we can remove the loop. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_log.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bb4390942275..05b00fa4d661 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2774,19 +2774,15 @@ xlog_state_do_iclog_callbacks( struct xlog *log, struct xlog_in_core *iclog) { + LIST_HEAD(tmp); + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); - - list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp); - spin_lock(&iclog->ic_callback_lock); - } - + list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); + + xlog_cil_process_committed(&tmp); trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); } From a1bb8505e92101df94080f81298e3640f5fbe037 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 25 Jun 2021 11:21:01 -0700 Subject: [PATCH 101/102] xfs: Fix a CIL UAF by getting get rid of the iclog callback lock The iclog callback chain has it's own lock. That was added way back in 2008 by myself to alleviate severe lock contention on the icloglock in commit 114d23aae512 ("[XFS] Per iclog callback chain lock"). This was long before delayed logging took the icloglock out of the hot transaction commit path and removed all contention on it. Hence the separate ic_callback_lock doesn't serve any scalability purpose anymore, and hasn't for close on a decade. Further, we only attach callbacks to iclogs in one place where we are already taking the icloglock soon after attaching the callbacks. We also have to drop the icloglock to run callbacks and grab it immediately afterwards again. So given that the icloglock is no longer hot, making it cover callbacks again doesn't really change the locking patterns very much at all. 
We also need to extend the icloglock to cover callback addition to fix a zero-day UAF in the CIL push code. This occurs when shutdown races with xlog_cil_push_work() and the shutdown runs the callbacks before the push releases the iclog. This results in the CIL context structure attached to the iclog being freed by the callback before the CIL push has finished referencing it, leading to UAF bugs. Hence, to avoid this UAF, we need the callback attachment to be atomic with post processing of the commit iclog and references to the structures being attached to the iclog. This requires holding the icloglock as that's the only way to serialise iclog state against a shutdown in progress. The result is we need to be using the icloglock to protect the callback list addition and removal and serialise them with shutdown. That makes the ic_callback_lock redundant and so it can be removed. Fixes: 71e330b59390 ("xfs: Introduce delayed logging core code") Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 34 ++++++---------------------------- fs/xfs/xfs_log_cil.c | 16 ++++++++++++---- fs/xfs/xfs_log_priv.h | 3 --- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 05b00fa4d661..c896c9041b8e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1484,7 +1484,6 @@ xlog_alloc_log( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); - spin_lock_init(&iclog->ic_callback_lock); INIT_LIST_HEAD(&iclog->ic_callbacks); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; @@ -2760,32 +2759,6 @@ xlog_state_iodone_process_iclog( } } -/* - * Keep processing entries in the iclog callback list until we come around and - * it is empty. We need to atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more callbacks being added. 
- * - * This function is called with the icloglock held and returns with it held. We - * drop it while running callbacks, however, as holding it over thousands of - * callbacks is unnecessary and causes excessive contention if we do. - */ -static void -xlog_state_do_iclog_callbacks( - struct xlog *log, - struct xlog_in_core *iclog) -{ - LIST_HEAD(tmp); - - trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); - - spin_lock(&iclog->ic_callback_lock); - list_splice_init(&iclog->ic_callbacks, &tmp); - spin_unlock(&iclog->ic_callback_lock); - - xlog_cil_process_committed(&tmp); - trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); -} - STATIC void xlog_state_do_callback( struct xlog *log) @@ -2814,6 +2787,8 @@ xlog_state_do_callback( repeats++; do { + LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog, &ioerror)) break; @@ -2823,9 +2798,12 @@ xlog_state_do_callback( iclog = iclog->ic_next; continue; } + list_splice_init(&iclog->ic_callbacks, &cb_list); spin_unlock(&log->l_icloglock); - xlog_state_do_iclog_callbacks(log, iclog); + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); cycled_icloglock = true; spin_lock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 3c2b1205944d..db03f6f7b5a4 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -873,15 +873,21 @@ restart: xfs_log_ticket_ungrant(log, tic); - spin_lock(&commit_iclog->ic_callback_lock); + /* + * Once we attach the ctx to the iclog, a shutdown can process the + * iclog, run the callbacks and free the ctx. The only thing preventing + * this potential UAF situation here is that we are holding the + * icloglock. Hence we cannot access the ctx after we have attached the + * callbacks and dropped the icloglock. 
+ */ + spin_lock(&log->l_icloglock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { - spin_unlock(&commit_iclog->ic_callback_lock); + spin_unlock(&log->l_icloglock); goto out_abort; } ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); - spin_unlock(&commit_iclog->ic_callback_lock); /* * now the checkpoint commit is complete and we've attached the @@ -898,8 +904,10 @@ restart: * iclogs to complete before we submit the commit_iclog. In this case, * the commit_iclog write needs to issue a pre-flush so that the * ordering is correctly preserved down to stable storage. + * + * NOTE: It is not safe to reference the ctx after this check as we drop + * the icloglock if we have to wait for completion of other iclogs. */ - spin_lock(&log->l_icloglock); if (ctx->start_lsn != commit_lsn) { xlog_wait_on_iclog(commit_iclog->ic_prev); spin_lock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 293d82b1fc0d..4c41bbfa33b0 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -216,9 +216,6 @@ typedef struct xlog_in_core { enum xlog_iclog_state ic_state; unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */ - - /* Callback structures need their own cacheline */ - spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; struct list_head ic_callbacks; /* reference counts need their own cacheline */ From 1effb72a8179a02c2dd8a268454ccf50bf68aa50 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 25 Jun 2021 11:21:02 -0700 Subject: [PATCH 102/102] xfs: don't wait on future iclogs when pushing the CIL The iclogbuf ring attached to the struct xlog is circular, hence the first and last iclogs in the ring can only be determined by comparing them against the log->l_iclog pointer. 
In xlog_cil_push_work(), we want to wait on previous iclogs that were issued so that we can flush them to stable storage with the commit record write, and it simply waits on the previous iclog in the ring.
Similarly, a future iclog that hasn't completed IO will have an LSN greater than the current iclog and so we don't wait on them. A past iclog that is still undergoing IO completion will have a LSN less than the current iclog and those are the only iclogs that we need to wait on. Hence we can use the iclog LSN to determine what iclogs we need to wait on here. Fixes: 5fd9256ce156 ("xfs: separate CIL commit record IO") Reported-by: Brian Foster Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index db03f6f7b5a4..b128aaa9b870 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -877,7 +877,7 @@ restart: * Once we attach the ctx to the iclog, a shutdown can process the * iclog, run the callbacks and free the ctx. The only thing preventing * this potential UAF situation here is that we are holding the - * icloglock. Hence we cannot access the ctx after we have attached the + * icloglock. Hence we cannot access the ctx once we have attached the * callbacks and dropped the icloglock. */ spin_lock(&log->l_icloglock); @@ -900,17 +900,38 @@ restart: spin_unlock(&cil->xc_push_lock); /* - * If the checkpoint spans multiple iclogs, wait for all previous - * iclogs to complete before we submit the commit_iclog. In this case, - * the commit_iclog write needs to issue a pre-flush so that the - * ordering is correctly preserved down to stable storage. + * If the checkpoint spans multiple iclogs, wait for all previous iclogs + * to complete before we submit the commit_iclog. We can't use state + * checks for this - ACTIVE can be either a past completed iclog or a + * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a + * past or future iclog awaiting IO or ordered IO completion to be run. 
+ * In the latter case, if it's a future iclog and we wait on it, then we