6bdcf26ade
Replace the current linear list and the indirection array for the in-core extent list with a b+tree to avoid the need for larger memory allocations for the indirection array when lots of extents are present. The current extent list implementation leads to heavy pressure on the memory allocator when modifying files with a high extent count, and can lead to high latencies because of that. The replacement is a b+tree with a few quirks. The leaf nodes directly store the extent record in two u64 values. The encoding is a little bit different from the existing in-core extent records so that the start offset and length which are required for lookups can be retrieved with simple mask operations. The inner nodes store a 64-bit key containing the start offset in the first half of the node, and the pointers to the next lower level in the second half. In either case we walk the node from the beginning to the end and do a linear search, as that is more efficient for the low number of cache lines touched during a search (2 for the inner nodes, 4 for the leaf nodes) than a binary search. We store termination markers (zero length for the leaf nodes, an otherwise impossible high bit for the inner nodes) to terminate the key list / records instead of storing a count to use the available cache lines as efficiently as possible. One quirk of the algorithm is that while we normally split a node half and half like usual btree implementations, we just spill over entries added at the very end of the list to a new node on its own. This means we get a 100% fill grade for the common cases of bulk insertion when reading an inode into memory, and when only sequentially appending to a file. The downside is a slightly higher chance of splits on the first random insertions.
Both insert and removal manually recurse into the lower levels, but the bulk deletion of the whole tree is still implemented as a recursive function call, although one limited by the overall depth and with very little stack usage in every iteration. For the first few extents we dynamically grow the list from a single extent to the next powers of two until we have a first full leaf block, and only then build the actual tree. The code started out based on the generic lib/btree.c code from Joern Engel based on earlier work from Peter Zijlstra, but has since been rewritten beyond recognition. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
136 lines
4.3 KiB
C
136 lines
4.3 KiB
C
/*
|
|
* Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
|
|
* All Rights Reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it would be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
#ifndef __XFS_BMAP_BTREE_H__
|
|
#define __XFS_BMAP_BTREE_H__
|
|
|
|
struct xfs_btree_cur;
|
|
struct xfs_btree_block;
|
|
struct xfs_mount;
|
|
struct xfs_inode;
|
|
struct xfs_trans;
|
|
|
|
/*
 * Btree block header size depends on a superblock flag:
 * XFS_BTREE_LBLOCK_CRC_LEN when xfs_sb_version_hascrc() reports true for
 * the mount's superblock, XFS_BTREE_LBLOCK_LEN otherwise.
 */
#define XFS_BMBT_BLOCK_LEN(mp) \
	(xfs_sb_version_hascrc(&(mp)->m_sb) ? \
		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
|
|
|
|
/*
 * Address of record @index in bmap btree block @block.  Indices count
 * from 1; records are packed directly after the block header.
 */
#define XFS_BMBT_REC_ADDR(mp, block, index) \
	((xfs_bmbt_rec_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 sizeof(xfs_bmbt_rec_t) * ((index) - 1)))
|
|
|
|
/*
 * Address of key @index in bmap btree block @block.  Indices count
 * from 1; keys are packed directly after the block header.
 */
#define XFS_BMBT_KEY_ADDR(mp, block, index) \
	((xfs_bmbt_key_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 sizeof(xfs_bmbt_key_t) * ((index) - 1)))
|
|
|
|
/*
 * Address of pointer @index in bmap btree block @block.  Indices count
 * from 1; the pointer array begins after the block header and a key
 * area sized for @maxrecs keys.
 */
#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
	((xfs_bmbt_ptr_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 sizeof(xfs_bmbt_key_t) * (maxrecs) + \
		 sizeof(xfs_bmbt_ptr_t) * ((index) - 1)))
|
|
|
|
/*
 * Address of record @index in bmdr block @block.  Indices count from 1;
 * records are packed directly after struct xfs_bmdr_block.
 */
#define XFS_BMDR_REC_ADDR(block, index) \
	((xfs_bmdr_rec_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 sizeof(xfs_bmdr_rec_t) * ((index) - 1)))
|
|
|
|
/*
 * Address of key @index in bmdr block @block.  Indices count from 1;
 * keys are packed directly after struct xfs_bmdr_block.
 */
#define XFS_BMDR_KEY_ADDR(block, index) \
	((xfs_bmdr_key_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 sizeof(xfs_bmdr_key_t) * ((index) - 1)))
|
|
|
|
/*
 * Address of pointer @index in bmdr block @block.  Indices count from 1;
 * the pointer array begins after struct xfs_bmdr_block and a key area
 * sized for @maxrecs keys.
 */
#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
	((xfs_bmdr_ptr_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 sizeof(xfs_bmdr_key_t) * (maxrecs) + \
		 sizeof(xfs_bmdr_ptr_t) * ((index) - 1)))
|
|
|
|
/*
 * These are to be used when we know the size of the block and
 * we don't have a cursor.
 *
 * XFS_BMAP_BROOT_PTR_ADDR yields the address of pointer @i in block @bb,
 * sizing the key area with xfs_bmbt_maxrecs(mp, sz, 0).
 */
#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
|
|
|
|
/*
 * Number of bytes taken by a bmap btree block that holds @nrecs
 * key/pointer pairs: the block header plus one key and one pointer per
 * record.  The result is an int.
 *
 * The whole expansion is wrapped in parentheses so the leading (int) cast
 * cannot be mis-parsed when the macro is placed next to an operator such
 * as sizeof.
 */
#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
	((int)(XFS_BMBT_BLOCK_LEN(mp) + \
	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))

/*
 * Same computation, with the record count read from the big-endian
 * bb_numrecs field of block @bb.
 */
#define XFS_BMAP_BROOT_SPACE(mp, bb) \
	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
|
|
/*
 * Number of bytes taken by a bmdr block that holds @nrecs key/pointer
 * pairs: the xfs_bmdr_block_t header plus one key and one pointer per
 * record.  The result is an int.
 *
 * The key and pointer sizes use the bmdr types, matching
 * XFS_BMDR_KEY_ADDR and XFS_BMDR_PTR_ADDR.
 * NOTE(review): the previous definition used xfs_bmbt_key_t and
 * xfs_bmbt_ptr_t here; this assumes the bmdr and bmbt key/pointer types
 * have identical sizes -- confirm against their declarations.
 *
 * The whole expansion is wrapped in parentheses so the leading (int) cast
 * cannot be mis-parsed when the macro is placed next to an operator such
 * as sizeof.
 */
#define XFS_BMDR_SPACE_CALC(nrecs) \
	((int)(sizeof(xfs_bmdr_block_t) + \
	       ((nrecs) * (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)))))

/*
 * Same computation, with the record count read from the big-endian
 * bb_numrecs field of block @bb.
 */
#define XFS_BMAP_BMDR_SPACE(bb) \
	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
|
|
|
|
/*
 * Maximum number of bmap btree levels, looked up in the mount's
 * m_bm_maxlevels[] table at index @w.
 */
#define XFS_BM_MAXLEVELS(mp, w)		((mp)->m_bm_maxlevels[(w)])
|
|
|
|
/*
|
|
* Prototypes for xfs_bmap.c to call.
|
|
*/
|
|
extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
|
|
struct xfs_btree_block *, int);
|
|
|
|
void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
|
|
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
|
|
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
|
|
extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
|
|
|
|
extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
|
|
xfs_bmdr_block_t *, int);
|
|
|
|
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
|
|
extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
|
|
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
|
|
|
|
extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
|
|
int whichfork, xfs_ino_t new_owner,
|
|
struct list_head *buffer_list);
|
|
|
|
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
|
|
struct xfs_trans *, struct xfs_inode *, int);
|
|
|
|
/*
|
|
* Check that the extent does not contain an invalid unwritten extent flag.
|
|
*/
|
|
static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
|
|
struct xfs_bmbt_rec *ep)
|
|
{
|
|
if (get_unaligned_be64(&ep->l0) >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
|
|
return true;
|
|
if (whichfork == XFS_DATA_FORK &&
|
|
xfs_sb_version_hasextflgbit(&mp->m_sb))
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
#endif /* __XFS_BMAP_BTREE_H__ */
|