cf11da9c5d
The allocation stack switch at xfs_bmapi_allocate() has served its purpose, but is no longer a sufficient solution to the stack usage problem we have in the XFS allocation path. Whilst the kernel stack size is now 16k, that is not a valid reason for undoing all our "keep stack usage down" modifications. What it does allow us to do is have the freedom to refine and perfect the modifications knowing that if we get it wrong it won't blow up in our faces - we have a safety net now. This is important because we still have the issue of older kernels having smaller stacks and that they are still supported and are demonstrating a wide range of different stack overflows. Red Hat has several open bugs for allocation based stack overflows from directory modifications and direct IO block allocation and these problems still need to be solved. If we can solve them upstream, then distros won't need to bake their own unique solutions. To that end, I've observed that every allocation based stack overflow report has had a specific characteristic - it has happened during or directly after a bmap btree block split. That event requires a new block to be allocated to the tree, and so we effectively stack one allocation stack on top of another, and that's when we get into trouble. A further observation is that bmap btree block splits are much rarer than writeback allocation - over a range of different workloads I've observed the ratio of bmap btree inserts to splits ranges from 100:1 (xfstests run) to 10000:1 (local VM image server with sparse files that range in the hundreds of thousands to millions of extents). Either way, bmap btree split events are much, much rarer than allocation events. Finally, we have to move the kswapd state to the allocation workqueue work when allocation is done on behalf of kswapd. 
This is proving to cause significant perturbation in performance under memory pressure and appears to be generating allocation deadlock warnings under some workloads, so avoiding the use of a workqueue for the majority of kswapd writeback allocation will minimise the impact of such behaviour. Hence it makes sense to move the stack switch to xfs_btree_split() and only do it for bmap btree splits. Stack switches during allocation will be much rarer, so there won't be significant performance overhead caused by switching stacks. The worst case stack from all allocation paths will be split, not just writeback. And the majority of memory allocations will be done in the correct context (e.g. kswapd) without causing additional latency, and so we simplify the memory reclaim interactions between processes, workqueues and kswapd. The worst stack I've been able to generate with this patch in place is 5600 bytes deep. It's very revealing because we exit XFS at: 37) 1768 64 kmem_cache_alloc+0x13b/0x170 about 1800 bytes of stack consumed, and the remaining 3800 bytes (and 36 functions) is memory reclaim, swap and the IO stack. And this occurs in the inode allocation from an open(O_CREAT) syscall, not writeback. The amount of stack being used is much less than I've previously been able to generate - fs_mark testing has been able to generate stack usage of around 7k without too much trouble; with this patch it's only just getting to 5.5k. This is primarily because the metadata allocation paths (e.g. directory blocks) are no longer causing double splits on the same stack, and hence now stack tracing is showing swapping being the worst stack consumer rather than XFS. Performance of fs_mark inode create workloads is unchanged. Performance of fs_mark async fsync workloads is consistently good with context switches reduced by around 150,000/s (30%). Performance of dbench, streaming IO and postmark is unchanged. 
Allocation deadlock warnings have not been seen on the workloads that generated them since adding this patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
187 lines
6.6 KiB
C
187 lines
6.6 KiB
C
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
|
|
#ifndef __XFS_BMAP_H__
|
|
#define __XFS_BMAP_H__
|
|
|
|
/* Forward declarations; this header never needs the full definitions. */
struct getbmap;
struct xfs_bmbt_irec;
struct xfs_ifork;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
|
|
|
|
/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably the zone that xfs_bmap_free_item entries are
 * allocated from - confirm against the definition.
 */
extern kmem_zone_t *xfs_bmap_free_item_zone;
|
|
|
|
/*
|
|
* List of extents to be free "later".
|
|
* The list is kept sorted on xbf_startblock.
|
|
*/
|
|
typedef struct xfs_bmap_free_item
|
|
{
|
|
xfs_fsblock_t xbfi_startblock;/* starting fs block number */
|
|
xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
|
|
struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
|
|
} xfs_bmap_free_item_t;
|
|
|
|
/*
|
|
* Header for free extent list.
|
|
*
|
|
* xbf_low is used by the allocator to activate the lowspace algorithm -
|
|
* when free space is running low the extent allocator may choose to
|
|
* allocate an extent from an AG without leaving sufficient space for
|
|
* a btree split when inserting the new extent. In this case the allocator
|
|
* will enable the lowspace algorithm which is supposed to allow further
|
|
* allocations (such as btree splits and newroots) to allocate from
|
|
* sequential AGs. In order to avoid locking AGs out of order the lowspace
|
|
* algorithm will start searching for free space from AG 0. If the correct
|
|
* transaction reservations have been made then this algorithm will eventually
|
|
* find all the space it needs.
|
|
*/
|
|
typedef struct xfs_bmap_free
|
|
{
|
|
xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
|
|
int xbf_count; /* count of items on list */
|
|
int xbf_low; /* alloc in low mode */
|
|
} xfs_bmap_free_t;
|
|
|
|
/*
 * NOTE(review): presumably the largest number of mappings (nmap) a caller
 * may request in one call - confirm against the users of this constant.
 */
#define XFS_BMAP_MAX_NMAP 4
|
|
|
|
/*
 * Flags for xfs_bmapi_*
 */
#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
					/* combine contig. space */
#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
/*
 * unwritten extent conversion - this needs write cache flushing and no additional
 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
 * from written to unwritten, otherwise convert from unwritten to written.
 */
#define XFS_BMAPI_CONVERT	0x040

/*
 * Initializer list pairing every XFS_BMAPI_* flag above with a printable
 * name, one { flag, "NAME" } entry per flag.
 */
#define XFS_BMAPI_FLAGS \
	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
	{ XFS_BMAPI_METADATA,	"METADATA" }, \
	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
	{ XFS_BMAPI_CONVERT,	"CONVERT" }
|
|
|
|
|
|
static inline int xfs_bmapi_aflag(int w)
|
|
{
|
|
return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
|
|
}
|
|
|
|
/*
 * Special values for xfs_bmbt_irec_t br_startblock field.
 */
#define	DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
#define	HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
|
|
|
|
/*
 * Reset a free extent list header to the empty state (no first item,
 * count 0, low mode off) and set *fbp to NULLFSBLOCK.
 */
static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
{
	flp->xbf_first = NULL;
	flp->xbf_count = 0;
	flp->xbf_low = 0;
	*fbp = NULLFSBLOCK;
}
|
|
|
|
/*
 * Flags for xfs_bmap_add_extent*.
 */
#define BMAP_LEFT_CONTIG	(1 << 0)
#define BMAP_RIGHT_CONTIG	(1 << 1)
#define BMAP_LEFT_FILLING	(1 << 2)
#define BMAP_RIGHT_FILLING	(1 << 3)
#define BMAP_LEFT_DELAY		(1 << 4)
#define BMAP_RIGHT_DELAY	(1 << 5)
#define BMAP_LEFT_VALID		(1 << 6)
#define BMAP_RIGHT_VALID	(1 << 7)
#define BMAP_ATTRFORK		(1 << 8)

/*
 * Initializer list of { flag, "NAME" } entries. Only the contig, filling
 * and attr-fork flags are listed; the delay and valid flags have no entry.
 */
#define XFS_BMAP_EXT_FLAGS \
	{ BMAP_LEFT_CONTIG,	"LC" }, \
	{ BMAP_RIGHT_CONTIG,	"RC" }, \
	{ BMAP_LEFT_FILLING,	"LF" }, \
	{ BMAP_RIGHT_FILLING,	"RF" }, \
	{ BMAP_ATTRFORK,	"ATTR" }
|
|
|
|
|
|
/*
 * This macro is used to determine how many extents will be shifted
 * in one write transaction. We could require two splits,
 * an extent move on the first and an extent merge on the second,
 * so it is proper that only one extent is shifted inside a write
 * transaction at a time.
 */
#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
|
|
|
|
/*
 * Extent list tracing hook. With DEBUG defined the macro calls
 * xfs_bmap_trace_exlist(), passing _THIS_IP_ as the caller address.
 * Without DEBUG it expands to nothing, so its arguments are not evaluated.
 */
#ifdef DEBUG
void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
		int whichfork, unsigned long caller_ip);
#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
#else
#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
|
|
|
|
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
|
|
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
|
|
void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
|
|
struct xfs_bmap_free *flist, struct xfs_mount *mp);
|
|
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
|
|
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
|
|
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
|
|
int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
xfs_fileoff_t *last_block, int whichfork);
|
|
int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
|
|
int whichfork);
|
|
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
|
|
int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
int whichfork);
|
|
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
|
|
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
|
|
int *nmap, int flags);
|
|
int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
|
|
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
|
|
int *nmap, int flags);
|
|
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
|
|
xfs_fsblock_t *firstblock, xfs_extlen_t total,
|
|
struct xfs_bmbt_irec *mval, int *nmap,
|
|
struct xfs_bmap_free *flist);
|
|
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
|
|
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
|
|
struct xfs_bmap_free *flist, int *done);
|
|
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
|
|
xfs_extnum_t num);
|
|
uint xfs_default_attroffset(struct xfs_inode *ip);
|
|
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
int *done, xfs_fileoff_t start_fsb,
|
|
xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
|
|
xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
|
|
int num_exts);
|
|
|
|
#endif /* __XFS_BMAP_H__ */
|