2018-06-05 19:42:14 -07:00
// SPDX-License-Identifier: GPL-2.0
2005-04-16 15:20:36 -07:00
/*
2005-11-02 14:58:39 +11:00
* Copyright ( c ) 2000 - 2005 Silicon Graphics , Inc .
2013-04-24 18:58:55 +10:00
* Copyright ( c ) 2013 Red Hat , Inc .
2005-11-02 14:58:39 +11:00
* All Rights Reserved .
2005-04-16 15:20:36 -07:00
*/
# include "xfs.h"
2005-11-02 14:38:42 +11:00
# include "xfs_fs.h"
2013-10-29 22:11:58 +11:00
# include "xfs_shared.h"
2013-10-23 10:51:50 +11:00
# include "xfs_format.h"
2013-10-23 10:50:10 +11:00
# include "xfs_log_format.h"
# include "xfs_trans_resv.h"
2005-04-16 15:20:36 -07:00
# include "xfs_sb.h"
# include "xfs_mount.h"
2013-10-15 09:17:51 +11:00
# include "xfs_da_format.h"
2005-11-02 14:38:42 +11:00
# include "xfs_da_btree.h"
2005-04-16 15:20:36 -07:00
# include "xfs_inode.h"
2013-10-23 10:50:10 +11:00
# include "xfs_trans.h"
2013-10-23 10:51:50 +11:00
# include "xfs_bmap_btree.h"
2005-04-16 15:20:36 -07:00
# include "xfs_bmap.h"
2013-10-23 10:51:50 +11:00
# include "xfs_attr_sf.h"
2005-04-16 15:20:36 -07:00
# include "xfs_attr.h"
2021-04-26 15:00:33 -07:00
# include "xfs_attr_remote.h"
2005-04-16 15:20:36 -07:00
# include "xfs_attr_leaf.h"
# include "xfs_error.h"
2009-12-14 23:14:59 +00:00
# include "xfs_trace.h"
2013-04-24 18:58:55 +10:00
# include "xfs_buf_item.h"
2013-10-29 22:11:51 +11:00
# include "xfs_dir2.h"
2015-10-12 15:59:25 +11:00
# include "xfs_log.h"
2021-06-02 10:48:24 +10:00
# include "xfs_ag.h"
2022-05-11 17:01:23 +10:00
# include "xfs_errortag.h"
2024-02-22 12:32:18 -08:00
# include "xfs_health.h"
2013-04-24 18:58:55 +10:00
2005-04-16 15:20:36 -07:00
/*
* xfs_attr_leaf . c
*
* Routines to implement leaf blocks of attributes as Btrees of hashed names .
*/
/*========================================================================
* Function prototypes for the kernel .
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
/*
* Routines used for growing the Btree .
*/
2013-04-24 18:58:55 +10:00
/* Forward declarations: leaf creation, insertion, compaction and balancing. */
STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
				 xfs_dablk_t which_block,
				 struct xfs_buf **bpp);
STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
				   struct xfs_attr3_icleaf_hdr *ichdr,
				   struct xfs_da_args *args,
				   int freemap_index);
STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
				   struct xfs_attr3_icleaf_hdr *ichdr,
				   struct xfs_buf *leaf_buffer);
STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
				     xfs_da_state_blk_t *blk1,
				     xfs_da_state_blk_t *blk2);
STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
					 xfs_da_state_blk_t *leaf_blk_1,
					 struct xfs_attr3_icleaf_hdr *ichdr1,
					 xfs_da_state_blk_t *leaf_blk_2,
					 struct xfs_attr3_icleaf_hdr *ichdr2,
					 int *number_entries_in_blk1,
					 int *number_usedbytes_in_blk1);
2005-04-16 15:20:36 -07:00
/*
* Utility routines .
*/
2014-06-06 15:21:45 +10:00
STATIC void xfs_attr3_leaf_moveents ( struct xfs_da_args * args ,
struct xfs_attr_leafblock * src_leaf ,
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr * src_ichdr , int src_start ,
struct xfs_attr_leafblock * dst_leaf ,
struct xfs_attr3_icleaf_hdr * dst_ichdr , int dst_start ,
2014-06-06 15:21:45 +10:00
int move_count ) ;
2005-06-21 15:36:52 +10:00
STATIC int xfs_attr_leaf_entsize ( xfs_attr_leafblock_t * leaf , int index ) ;
2006-09-28 11:01:37 +10:00
2015-04-13 11:27:10 +10:00
/*
* attr3 block ' firstused ' conversion helpers .
*
* firstused refers to the offset of the first used byte of the nameval region
* of an attr leaf block . The region starts at the tail of the block and expands
* backwards towards the middle . As such , firstused is initialized to the block
* size for an empty leaf block and is reduced from there .
*
* The attr3 block size is pegged to the fsb size and the maximum fsb is 64 k .
* The in - core firstused field is 32 - bit and thus supports the maximum fsb size .
* The on - disk field is only 16 - bit , however , and overflows at 64 k . Since this
* only occurs at exactly 64 k , we use zero as a magic on - disk value to represent
* the attr block size . The following helpers manage the conversion between the
* in - core and on - disk formats .
*/
static void
xfs_attr3_leaf_firstused_from_disk (
struct xfs_da_geometry * geo ,
struct xfs_attr3_icleaf_hdr * to ,
struct xfs_attr_leafblock * from )
{
struct xfs_attr3_leaf_hdr * hdr3 ;
if ( from - > hdr . info . magic = = cpu_to_be16 ( XFS_ATTR3_LEAF_MAGIC ) ) {
hdr3 = ( struct xfs_attr3_leaf_hdr * ) from ;
to - > firstused = be16_to_cpu ( hdr3 - > firstused ) ;
} else {
to - > firstused = be16_to_cpu ( from - > hdr . firstused ) ;
}
/*
* Convert from the magic fsb size value to actual blocksize . This
* should only occur for empty blocks when the block size overflows
* 16 - bits .
*/
if ( to - > firstused = = XFS_ATTR3_LEAF_NULLOFF ) {
ASSERT ( ! to - > count & & ! to - > usedbytes ) ;
ASSERT ( geo - > blksize > USHRT_MAX ) ;
to - > firstused = geo - > blksize ;
}
}
static void
xfs_attr3_leaf_firstused_to_disk (
struct xfs_da_geometry * geo ,
struct xfs_attr_leafblock * to ,
struct xfs_attr3_icleaf_hdr * from )
{
struct xfs_attr3_leaf_hdr * hdr3 ;
uint32_t firstused ;
/* magic value should only be seen on disk */
ASSERT ( from - > firstused ! = XFS_ATTR3_LEAF_NULLOFF ) ;
/*
* Scale down the 32 - bit in - core firstused value to the 16 - bit on - disk
* value . This only overflows at the max supported value of 64 k . Use the
* magic on - disk value to represent block size in this case .
*/
firstused = from - > firstused ;
if ( firstused > USHRT_MAX ) {
ASSERT ( from - > firstused = = geo - > blksize ) ;
firstused = XFS_ATTR3_LEAF_NULLOFF ;
}
if ( from - > magic = = XFS_ATTR3_LEAF_MAGIC ) {
hdr3 = ( struct xfs_attr3_leaf_hdr * ) to ;
hdr3 - > firstused = cpu_to_be16 ( firstused ) ;
} else {
to - > hdr . firstused = cpu_to_be16 ( firstused ) ;
}
}
2013-04-24 18:58:55 +10:00
void
xfs_attr3_leaf_hdr_from_disk (
2015-04-13 11:26:02 +10:00
struct xfs_da_geometry * geo ,
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr * to ,
struct xfs_attr_leafblock * from )
{
int i ;
ASSERT ( from - > hdr . info . magic = = cpu_to_be16 ( XFS_ATTR_LEAF_MAGIC ) | |
from - > hdr . info . magic = = cpu_to_be16 ( XFS_ATTR3_LEAF_MAGIC ) ) ;
if ( from - > hdr . info . magic = = cpu_to_be16 ( XFS_ATTR3_LEAF_MAGIC ) ) {
struct xfs_attr3_leaf_hdr * hdr3 = ( struct xfs_attr3_leaf_hdr * ) from ;
to - > forw = be32_to_cpu ( hdr3 - > info . hdr . forw ) ;
to - > back = be32_to_cpu ( hdr3 - > info . hdr . back ) ;
to - > magic = be16_to_cpu ( hdr3 - > info . hdr . magic ) ;
to - > count = be16_to_cpu ( hdr3 - > count ) ;
to - > usedbytes = be16_to_cpu ( hdr3 - > usedbytes ) ;
2015-04-13 11:27:10 +10:00
xfs_attr3_leaf_firstused_from_disk ( geo , to , from ) ;
2013-04-24 18:58:55 +10:00
to - > holes = hdr3 - > holes ;
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
to - > freemap [ i ] . base = be16_to_cpu ( hdr3 - > freemap [ i ] . base ) ;
to - > freemap [ i ] . size = be16_to_cpu ( hdr3 - > freemap [ i ] . size ) ;
}
return ;
}
to - > forw = be32_to_cpu ( from - > hdr . info . forw ) ;
to - > back = be32_to_cpu ( from - > hdr . info . back ) ;
to - > magic = be16_to_cpu ( from - > hdr . info . magic ) ;
to - > count = be16_to_cpu ( from - > hdr . count ) ;
to - > usedbytes = be16_to_cpu ( from - > hdr . usedbytes ) ;
2015-04-13 11:27:10 +10:00
xfs_attr3_leaf_firstused_from_disk ( geo , to , from ) ;
2013-04-24 18:58:55 +10:00
to - > holes = from - > hdr . holes ;
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
to - > freemap [ i ] . base = be16_to_cpu ( from - > hdr . freemap [ i ] . base ) ;
to - > freemap [ i ] . size = be16_to_cpu ( from - > hdr . freemap [ i ] . size ) ;
}
}
void
xfs_attr3_leaf_hdr_to_disk (
2015-04-13 11:26:02 +10:00
struct xfs_da_geometry * geo ,
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * to ,
struct xfs_attr3_icleaf_hdr * from )
{
2015-04-13 11:27:10 +10:00
int i ;
2013-04-24 18:58:55 +10:00
ASSERT ( from - > magic = = XFS_ATTR_LEAF_MAGIC | |
from - > magic = = XFS_ATTR3_LEAF_MAGIC ) ;
if ( from - > magic = = XFS_ATTR3_LEAF_MAGIC ) {
struct xfs_attr3_leaf_hdr * hdr3 = ( struct xfs_attr3_leaf_hdr * ) to ;
hdr3 - > info . hdr . forw = cpu_to_be32 ( from - > forw ) ;
hdr3 - > info . hdr . back = cpu_to_be32 ( from - > back ) ;
hdr3 - > info . hdr . magic = cpu_to_be16 ( from - > magic ) ;
hdr3 - > count = cpu_to_be16 ( from - > count ) ;
hdr3 - > usedbytes = cpu_to_be16 ( from - > usedbytes ) ;
2015-04-13 11:27:10 +10:00
xfs_attr3_leaf_firstused_to_disk ( geo , to , from ) ;
2013-04-24 18:58:55 +10:00
hdr3 - > holes = from - > holes ;
hdr3 - > pad1 = 0 ;
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
hdr3 - > freemap [ i ] . base = cpu_to_be16 ( from - > freemap [ i ] . base ) ;
hdr3 - > freemap [ i ] . size = cpu_to_be16 ( from - > freemap [ i ] . size ) ;
}
return ;
}
to - > hdr . info . forw = cpu_to_be32 ( from - > forw ) ;
to - > hdr . info . back = cpu_to_be32 ( from - > back ) ;
to - > hdr . info . magic = cpu_to_be16 ( from - > magic ) ;
to - > hdr . count = cpu_to_be16 ( from - > count ) ;
to - > hdr . usedbytes = cpu_to_be16 ( from - > usedbytes ) ;
2015-04-13 11:27:10 +10:00
xfs_attr3_leaf_firstused_to_disk ( geo , to , from ) ;
2013-04-24 18:58:55 +10:00
to - > hdr . holes = from - > holes ;
to - > hdr . pad1 = 0 ;
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
to - > hdr . freemap [ i ] . base = cpu_to_be16 ( from - > freemap [ i ] . base ) ;
to - > hdr . freemap [ i ] . size = cpu_to_be16 ( from - > freemap [ i ] . size ) ;
}
}
2019-10-28 16:12:33 -07:00
/*
 * Check one leaf entry.  Returns the address of the check that failed, or
 * NULL when the entry passes.  *last_hashval holds the hash of the previous
 * entry on entry and is updated to this entry's hash once the ordering check
 * has passed.
 */
static xfs_failaddr_t
xfs_attr3_leaf_verify_entry(
	struct xfs_mount			*mp,
	char					*buf_end,
	struct xfs_attr_leafblock		*leaf,
	struct xfs_attr3_icleaf_hdr		*leafhdr,
	struct xfs_attr_leaf_entry		*ent,
	int					idx,
	__u32					*last_hashval)
{
	char					*name_end;
	unsigned int				name_off;
	unsigned int				entsize;
	__u32					hash;

	/* hash values must never decrease from one entry to the next */
	hash = be32_to_cpu(ent->hashval);
	if (hash < *last_hashval)
		return __this_address;
	*last_hashval = hash;

	/* the name must start at or after firstused and inside the block */
	name_off = be16_to_cpu(ent->nameidx);
	if (name_off < leafhdr->firstused)
		return __this_address;
	if (name_off >= mp->m_attr_geo->blksize)
		return __this_address;

	/*
	 * Check the name information.  The namelen fields are u8 so we can't
	 * possibly exceed the maximum name length of 255 bytes.
	 */
	if (ent->flags & XFS_ATTR_LOCAL) {
		struct xfs_attr_leaf_name_local	*lentry;

		lentry = xfs_attr3_leaf_name_local(leaf, idx);
		entsize = xfs_attr_leaf_entsize_local(lentry->namelen,
				be16_to_cpu(lentry->valuelen));
		name_end = (char *)lentry + entsize;
		if (lentry->namelen == 0)
			return __this_address;
	} else {
		struct xfs_attr_leaf_name_remote *rentry;

		rentry = xfs_attr3_leaf_name_remote(leaf, idx);
		entsize = xfs_attr_leaf_entsize_remote(rentry->namelen);
		name_end = (char *)rentry + entsize;
		if (rentry->namelen == 0)
			return __this_address;
		/* a remote entry not marked INCOMPLETE needs a value block */
		if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
		    rentry->valueblk == 0)
			return __this_address;
	}

	/* the name/value structure must not run off the end of the buffer */
	if (name_end > buf_end)
		return __this_address;
	return NULL;
}
xfs: empty xattr leaf header blocks are not corruption
TLDR: Revert commit 51e6104fdb95 ("xfs: detect empty attr leaf blocks in
xfs_attr3_leaf_verify") because it was wrong.
Every now and then we get a corruption report from the kernel or
xfs_repair about empty leaf blocks in the extended attribute structure.
We've long thought that these shouldn't be possible, but prior to 5.18
one would shake loose in the recoveryloop fstests about once a month.
A new addition to the xattr leaf block verifier in 5.19-rc1 makes this
happen every 7 minutes on my testing cloud. I added a ton of logging to
detect any time we set the header count on an xattr leaf block to zero.
This produced the following dmesg output on generic/388:
XFS (sda4): ino 0x21fcbaf leaf 0x129bf78 hdcount==0!
Call Trace:
<TASK>
dump_stack_lvl+0x34/0x44
xfs_attr3_leaf_create+0x187/0x230
xfs_attr_shortform_to_leaf+0xd1/0x2f0
xfs_attr_set_iter+0x73e/0xa90
xfs_xattri_finish_update+0x45/0x80
xfs_attr_finish_item+0x1b/0xd0
xfs_defer_finish_noroll+0x19c/0x770
__xfs_trans_commit+0x153/0x3e0
xfs_attr_set+0x36b/0x740
xfs_xattr_set+0x89/0xd0
__vfs_setxattr+0x67/0x80
__vfs_setxattr_noperm+0x6e/0x120
vfs_setxattr+0x97/0x180
setxattr+0x88/0xa0
path_setxattr+0xc3/0xe0
__x64_sys_setxattr+0x27/0x30
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
So now we know that someone is creating empty xattr leaf blocks as part
of converting a sf xattr structure into a leaf xattr structure. The
conversion routine logs any existing sf attributes in the same
transaction that creates the leaf block, so we know this is a setxattr
to a file that has no attributes at all.
Next, g/388 calls the shutdown ioctl and cycles the mount to trigger log
recovery. I also augmented buffer item recovery to call ->verify_struct
on any attr leaf blocks and complain if it finds a failure:
XFS (sda4): Unmounting Filesystem
XFS (sda4): Mounting V5 Filesystem
XFS (sda4): Starting recovery (logdev: internal)
XFS (sda4): xattr leaf daddr 0x129bf78 hdrcount == 0!
Call Trace:
<TASK>
dump_stack_lvl+0x34/0x44
xfs_attr3_leaf_verify+0x3b8/0x420
xlog_recover_buf_commit_pass2+0x60a/0x6c0
xlog_recover_items_pass2+0x4e/0xc0
xlog_recover_commit_trans+0x33c/0x350
xlog_recovery_process_trans+0xa5/0xe0
xlog_recover_process_data+0x8d/0x140
xlog_do_recovery_pass+0x19b/0x720
xlog_do_log_recovery+0x62/0xc0
xlog_do_recover+0x33/0x1d0
xlog_recover+0xda/0x190
xfs_log_mount+0x14c/0x360
xfs_mountfs+0x517/0xa60
xfs_fs_fill_super+0x6bc/0x950
get_tree_bdev+0x175/0x280
vfs_get_tree+0x1a/0x80
path_mount+0x6f5/0xaa0
__x64_sys_mount+0x103/0x140
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7fc61e241eae
And a moment later, the _delwri_submit of the recovered buffers trips
the same verifier and recovery fails:
XFS (sda4): Metadata corruption detected at xfs_attr3_leaf_verify+0x393/0x420 [xfs], xfs_attr3_leaf block 0x129bf78
XFS (sda4): Unmount and run xfs_repair
XFS (sda4): First 128 bytes of corrupted metadata buffer:
00000000: 00 00 00 00 00 00 00 00 3b ee 00 00 00 00 00 00 ........;.......
00000010: 00 00 00 00 01 29 bf 78 00 00 00 00 00 00 00 00 .....).x........
00000020: a5 1b d0 02 b2 9a 49 df 8e 9c fb 8d f8 31 3e 9d ......I......1>.
00000030: 00 00 00 00 02 1f cb af 00 00 00 00 10 00 00 00 ................
00000040: 00 50 0f b0 00 00 00 00 00 00 00 00 00 00 00 00 .P..............
00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
XFS (sda4): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply+0x37f/0x3b0 [xfs] (fs/xfs/xfs_buf.c:1518). Shutting down filesystem.
XFS (sda4): Please unmount the filesystem and rectify the problem(s)
XFS (sda4): log mount/recovery failed: error -117
XFS (sda4): log mount failed
I think I see what's going on here -- setxattr is racing with something
that shuts down the filesystem:
Thread 1 Thread 2
-------- --------
xfs_attr_sf_addname
xfs_attr_shortform_to_leaf
<create empty leaf>
xfs_trans_bhold(leaf)
xattri_dela_state = XFS_DAS_LEAF_ADD
<roll transaction>
<flush log>
<shut down filesystem>
xfs_trans_bhold_release(leaf)
<discover fs is dead, bail>
Thread 3
--------
<cycle mount, start recovery>
xlog_recover_buf_commit_pass2
xlog_recover_do_reg_buffer
<replay empty leaf buffer from recovered buf item>
xfs_buf_delwri_queue(leaf)
xfs_buf_delwri_submit
_xfs_buf_ioapply(leaf)
xfs_attr3_leaf_write_verify
<trip over empty leaf buffer>
<fail recovery>
As you can see, the bhold keeps the leaf buffer locked and thus prevents
the *AIL* from tripping over the ichdr.count==0 check in the write
verifier. Unfortunately, it doesn't prevent the log from getting
flushed to disk, which sets up log recovery to fail.
So. It's clear that the kernel has always had the ability to persist
attr leaf blocks with ichdr.count==0, which means that it's part of the
ondisk format now.
Unfortunately, this check has been added and removed multiple times
throughout history. It first appeared in[1] kernel 3.10 as part of the
early V5 format patches. The check was later discovered to break log
recovery and hence disabled[2] during log recovery in kernel 4.10.
Simultaneously, the check was added[3] to xfs_repair 4.9.0 to try to
weed out the empty leaf blocks. This was still not correct because log
recovery would recover an empty attr leaf block successfully only for
regular xattr operations to trip over the empty block during regular
operation. Therefore, the check was removed
entirely[4] in kernel 5.7 but removal of the xfs_repair check was
forgotten. The continued complaints from xfs_repair led to us
mistakenly re-adding[5] the verifier check for kernel 5.19. Remove it
once again.
[1] 517c22207b04 ("xfs: add CRCs to attr leaf blocks")
[2] 2e1d23370e75 ("xfs: ignore leaf attr ichdr.count in verifier
during log replay")
[3] f7140161 ("xfs_repair: junk leaf attribute if count == 0")
[4] f28cef9e4dac ("xfs: don't fail verifier on empty attr3 leaf
block")
[5] 51e6104fdb95 ("xfs: detect empty attr leaf blocks in
xfs_attr3_leaf_verify")
Looking at the rest of the xattr code, it seems that files with empty
leaf blocks behave as expected -- listxattr reports no attributes;
getxattr on any xattr returns nothing as expected; removexattr does
nothing; and setxattr can add attributes just fine.
Original-bug: 517c22207b04 ("xfs: add CRCs to attr leaf blocks")
Still-not-fixed-by: 2e1d23370e75 ("xfs: ignore leaf attr ichdr.count in verifier during log replay")
Removed-in: f28cef9e4dac ("xfs: don't fail verifier on empty attr3 leaf block")
Fixes: 51e6104fdb95 ("xfs: detect empty attr leaf blocks in xfs_attr3_leaf_verify")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-06-24 15:01:28 -07:00
/*
* Validate an attribute leaf block .
*
* Empty leaf blocks can occur under the following circumstances :
*
* 1. setxattr adds a new extended attribute to a file ;
* 2. The file has zero existing attributes ;
* 3. The attribute is too large to fit in the attribute fork ;
* 4. The attribute is small enough to fit in a leaf block ;
* 5. A log flush occurs after committing the transaction that creates
* the ( empty ) leaf block ; and
* 6. The filesystem goes down after the log flush but before the new
* attribute can be committed to the leaf block .
*
* Hence we need to ensure that we don ' t fail the validation purely
* because the leaf is empty .
*/
2018-01-08 10:51:03 -08:00
static xfs_failaddr_t
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_verify (
2018-01-16 18:54:12 -08:00
struct xfs_buf * bp )
2012-11-12 22:54:16 +11:00
{
2018-01-16 18:54:12 -08:00
struct xfs_attr3_icleaf_hdr ichdr ;
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = bp - > b_mount ;
2018-01-16 18:54:12 -08:00
struct xfs_attr_leafblock * leaf = bp - > b_addr ;
struct xfs_attr_leaf_entry * entries ;
2019-10-28 16:12:33 -07:00
struct xfs_attr_leaf_entry * ent ;
char * buf_end ;
2018-11-06 07:50:50 -08:00
uint32_t end ; /* must be 32bit - see below */
2019-10-28 16:12:33 -07:00
__u32 last_hashval = 0 ;
2018-07-19 12:24:55 -07:00
int i ;
2019-02-07 10:45:48 -08:00
xfs_failaddr_t fa ;
2012-11-12 22:54:16 +11:00
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( mp - > m_attr_geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
2019-02-07 10:45:48 -08:00
fa = xfs_da3_blkinfo_verify ( bp , bp - > b_addr ) ;
if ( fa )
return fa ;
2013-04-24 18:58:55 +10:00
2018-01-16 18:54:12 -08:00
/*
* firstused is the block offset of the first name info structure .
* Make sure it doesn ' t go off the block or crash into the header .
*/
if ( ichdr . firstused > mp - > m_attr_geo - > blksize )
return __this_address ;
if ( ichdr . firstused < xfs_attr3_leaf_hdr_size ( leaf ) )
return __this_address ;
/* Make sure the entries array doesn't crash into the name info. */
entries = xfs_attr3_leaf_entryp ( bp - > b_addr ) ;
if ( ( char * ) & entries [ ichdr . count ] >
( char * ) bp - > b_addr + ichdr . firstused )
return __this_address ;
2020-05-14 13:50:25 -07:00
/*
* NOTE : This verifier historically failed empty leaf buffers because
* we expect the fork to be in another format . Empty attr fork format
* conversions are possible during xattr set , however , and format
* conversion is not atomic with the xattr set that triggers it . We
* cannot assume leaf blocks are non - empty until that is addressed .
*/
2019-10-28 16:12:33 -07:00
buf_end = ( char * ) bp - > b_addr + mp - > m_attr_geo - > blksize ;
for ( i = 0 , ent = entries ; i < ichdr . count ; ent + + , i + + ) {
fa = xfs_attr3_leaf_verify_entry ( mp , buf_end , leaf , & ichdr ,
ent , i , & last_hashval ) ;
if ( fa )
return fa ;
}
2013-04-24 18:58:55 +10:00
2018-07-19 12:24:55 -07:00
/*
* Quickly check the freemap information . Attribute data has to be
* aligned to 4 - byte boundaries , and likewise for the free space .
2018-11-06 07:50:50 -08:00
*
* Note that for 64 k block size filesystems , the freemap entries cannot
* overflow as they are only be16 fields . However , when checking end
* pointer of the freemap , we have to be careful to detect overflows and
* so use uint32_t for those checks .
2018-07-19 12:24:55 -07:00
*/
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
if ( ichdr . freemap [ i ] . base > mp - > m_attr_geo - > blksize )
return __this_address ;
if ( ichdr . freemap [ i ] . base & 0x3 )
return __this_address ;
if ( ichdr . freemap [ i ] . size > mp - > m_attr_geo - > blksize )
return __this_address ;
if ( ichdr . freemap [ i ] . size & 0x3 )
return __this_address ;
2018-11-06 07:50:50 -08:00
/* be care of 16 bit overflows here */
end = ( uint32_t ) ichdr . freemap [ i ] . base + ichdr . freemap [ i ] . size ;
2018-07-19 12:24:55 -07:00
if ( end < ichdr . freemap [ i ] . base )
return __this_address ;
if ( end > mp - > m_attr_geo - > blksize )
return __this_address ;
}
2018-01-08 10:51:03 -08:00
return NULL ;
2012-11-14 17:52:32 +11:00
}
static void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_write_verify (
2012-11-14 17:52:32 +11:00
struct xfs_buf * bp )
{
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = bp - > b_mount ;
2018-01-24 13:38:48 -08:00
struct xfs_buf_log_item * bip = bp - > b_log_item ;
2013-04-24 18:58:55 +10:00
struct xfs_attr3_leaf_hdr * hdr3 = bp - > b_addr ;
2018-01-08 10:51:03 -08:00
xfs_failaddr_t fa ;
2013-04-24 18:58:55 +10:00
2018-01-08 10:51:03 -08:00
fa = xfs_attr3_leaf_verify ( bp ) ;
if ( fa ) {
xfs_verifier_error ( bp , - EFSCORRUPTED , fa ) ;
2013-04-24 18:58:55 +10:00
return ;
}
2021-08-18 18:46:37 -07:00
if ( ! xfs_has_crc ( mp ) )
2013-04-24 18:58:55 +10:00
return ;
if ( bip )
hdr3 - > info . lsn = cpu_to_be64 ( bip - > bli_item . li_lsn ) ;
2014-02-27 15:18:23 +11:00
xfs_buf_update_cksum ( bp , XFS_ATTR3_LEAF_CRC_OFF ) ;
2012-11-14 17:52:32 +11:00
}
2012-11-12 22:54:16 +11:00
2013-04-24 18:58:55 +10:00
/*
* leaf / node format detection on trees is sketchy , so a node read can be done on
* leaf level blocks when detection identifies the tree as a node format tree
* incorrectly . In this case , we need to swap the verifier to match the correct
* format of the block being read .
*/
2012-11-14 17:54:40 +11:00
static void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_read_verify (
struct xfs_buf * bp )
2012-11-14 17:52:32 +11:00
{
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = bp - > b_mount ;
2018-01-08 10:51:03 -08:00
xfs_failaddr_t fa ;
2013-04-24 18:58:55 +10:00
2021-08-18 18:46:37 -07:00
if ( xfs_has_crc ( mp ) & &
2014-02-27 15:23:10 +11:00
! xfs_buf_verify_cksum ( bp , XFS_ATTR3_LEAF_CRC_OFF ) )
2018-01-08 10:51:03 -08:00
xfs_verifier_error ( bp , - EFSBADCRC , __this_address ) ;
else {
fa = xfs_attr3_leaf_verify ( bp ) ;
if ( fa )
xfs_verifier_error ( bp , - EFSCORRUPTED , fa ) ;
}
2012-11-12 22:54:16 +11:00
}
2013-04-24 18:58:55 +10:00
/*
 * Buffer operations for attribute leaf blocks.  magic16 lists both
 * XFS_ATTR_LEAF_MAGIC and XFS_ATTR3_LEAF_MAGIC; the read, write and
 * structure verifiers are the xfs_attr3_leaf_*verify functions named in the
 * initializer.
 */
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
2016-01-04 16:10:19 +11:00
. name = " xfs_attr3_leaf " ,
2019-02-16 11:47:28 -08:00
. magic16 = { cpu_to_be16 ( XFS_ATTR_LEAF_MAGIC ) ,
cpu_to_be16 ( XFS_ATTR3_LEAF_MAGIC ) } ,
2013-04-24 18:58:55 +10:00
. verify_read = xfs_attr3_leaf_read_verify ,
. verify_write = xfs_attr3_leaf_write_verify ,
2018-01-08 10:51:08 -08:00
. verify_struct = xfs_attr3_leaf_verify ,
2012-11-14 17:54:40 +11:00
} ;
2012-11-14 17:52:32 +11:00
2012-11-12 22:54:16 +11:00
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_read (
2012-11-12 22:54:16 +11:00
struct xfs_trans * tp ,
struct xfs_inode * dp ,
xfs_dablk_t bno ,
struct xfs_buf * * bpp )
{
2013-04-03 16:11:29 +11:00
int err ;
2019-11-20 09:46:04 -08:00
err = xfs_da_read_buf ( tp , dp , bno , 0 , bpp , XFS_ATTR_FORK ,
2019-11-20 09:46:02 -08:00
& xfs_attr3_leaf_buf_ops ) ;
2017-07-07 18:55:17 -07:00
if ( ! err & & tp & & * bpp )
2013-04-03 16:11:30 +11:00
xfs_trans_buf_set_type ( tp , * bpp , XFS_BLFT_ATTR_LEAF_BUF ) ;
2013-04-03 16:11:29 +11:00
return err ;
2012-11-12 22:54:16 +11:00
}
2006-09-28 11:01:37 +10:00
/*========================================================================
* Namespace helper routines
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
/*
* If we are in log recovery , then we want the lookup to ignore the INCOMPLETE
* flag on disk - if there ' s an incomplete attr then recovery needs to tear it
* down . If there ' s no incomplete attr , then recovery needs to tear that attr
* down to replace it with the attr that has been logged . In this case , the
* INCOMPLETE flag will not be set in attr - > attr_filter , but rather
* XFS_DA_OP_RECOVERY will be set in args - > op_flags .
*/
2020-02-26 17:30:36 -08:00
static bool
xfs_attr_match (
struct xfs_da_args * args ,
uint8_t namelen ,
unsigned char * name ,
int flags )
2006-09-28 11:01:37 +10:00
{
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
2020-02-26 17:30:36 -08:00
if ( args - > namelen ! = namelen )
return false ;
if ( memcmp ( args - > name , name , namelen ) ! = 0 )
return false ;
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
/* Recovery ignores the INCOMPLETE flag. */
if ( ( args - > op_flags & XFS_DA_OP_RECOVERY ) & &
args - > attr_filter = = ( flags & XFS_ATTR_NSP_ONDISK_MASK ) )
return true ;
/* All remaining matches need to be filtered by INCOMPLETE state. */
2020-02-26 17:30:43 -08:00
if ( args - > attr_filter ! =
( flags & ( XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE ) ) )
2020-02-26 17:30:36 -08:00
return false ;
return true ;
2006-09-28 11:01:37 +10:00
}
2019-08-29 09:04:10 -07:00
/*
 * Copy an attribute value of valuelen bytes into args.
 *
 * args->valuelen is set to valuelen on every path that gets past the checks
 * below as well as on the two early returns. If args->valuelen is zero on
 * entry only the length is reported and 0 is returned. If it is non-zero but
 * smaller than valuelen, -ERANGE is returned. When args->value is NULL a
 * buffer of valuelen bytes is allocated with kvmalloc(), and -ENOMEM is
 * returned if that fails. If args->rmtblkno is set the return value is
 * whatever xfs_attr_rmtval_get() returns; otherwise the bytes at value are
 * copied with memcpy() and 0 is returned, or -EINVAL if value is NULL.
 *
 * NOTE(review): a buffer allocated here is not freed in this function;
 * presumably the caller owns args->value afterwards - confirm.
 */
static int
xfs_attr_copy_value (
struct xfs_da_args * args ,
unsigned char * value ,
int valuelen )
{
/*
* No copy if all we have to do is get the length
*/
2020-02-26 17:30:35 -08:00
if ( ! args - > valuelen ) {
2019-08-29 09:04:10 -07:00
args - > valuelen = valuelen ;
return 0 ;
}
/*
* No copy if the length of the existing buffer is too small
*/
if ( args - > valuelen < valuelen ) {
args - > valuelen = valuelen ;
return - ERANGE ;
}
xfs: allocate xattr buffer on demand
When doing file lookups and checking for permissions, we end up in
xfs_get_acl() to see if there are any ACLs on the inode. This
requires and xattr lookup, and to do that we have to supply a buffer
large enough to hold an maximum sized xattr.
On workloads were we are accessing a wide range of cache cold files
under memory pressure (e.g. NFS fileservers) we end up spending a
lot of time allocating the buffer. The buffer is 64k in length, so
is a contiguous multi-page allocation, and if that then fails we
fall back to vmalloc(). Hence the allocation here is /expensive/
when we are looking up hundreds of thousands of files a second.
Initial numbers from a bpf trace show average time in xfs_get_acl()
is ~32us, with ~19us of that in the memory allocation. Note these
are average times, so there are going to be affected by the worst
case allocations more than the common fast case...
To avoid this, we could just do a "null" lookup to see if the ACL
xattr exists and then only do the allocation if it exists. This,
however, optimises the path for the "no ACL present" case at the
expense of the "acl present" case. i.e. we can halve the time in
xfs_get_acl() for the no acl case (i.e down to ~10-15us), but that
then increases the ACL case by 30% (i.e. up to 40-45us).
To solve this and speed up both cases, drive the xattr buffer
allocation into the attribute code once we know what the actual
xattr length is. For the no-xattr case, we avoid the allocation
completely, speeding up that case. For the common ACL case, we'll
end up with a fast heap allocation (because it'll be smaller than a
page), and only for the rarer "we have a remote xattr" will we have
a multi-page allocation occur. Hence the common ACL case will be
much faster, too.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-08-29 09:04:10 -07:00
2020-02-26 17:30:35 -08:00
/* args->value is not set: allocate it now that the length is known */
if ( ! args - > value ) {
2021-08-09 10:10:01 -07:00
args - > value = kvmalloc ( valuelen , GFP_KERNEL | __GFP_NOLOCKDEP ) ;
xfs: allocate xattr buffer on demand
When doing file lookups and checking for permissions, we end up in
xfs_get_acl() to see if there are any ACLs on the inode. This
requires and xattr lookup, and to do that we have to supply a buffer
large enough to hold an maximum sized xattr.
On workloads were we are accessing a wide range of cache cold files
under memory pressure (e.g. NFS fileservers) we end up spending a
lot of time allocating the buffer. The buffer is 64k in length, so
is a contiguous multi-page allocation, and if that then fails we
fall back to vmalloc(). Hence the allocation here is /expensive/
when we are looking up hundreds of thousands of files a second.
Initial numbers from a bpf trace show average time in xfs_get_acl()
is ~32us, with ~19us of that in the memory allocation. Note these
are average times, so there are going to be affected by the worst
case allocations more than the common fast case...
To avoid this, we could just do a "null" lookup to see if the ACL
xattr exists and then only do the allocation if it exists. This,
however, optimises the path for the "no ACL present" case at the
expense of the "acl present" case. i.e. we can halve the time in
xfs_get_acl() for the no acl case (i.e down to ~10-15us), but that
then increases the ACL case by 30% (i.e. up to 40-45us).
To solve this and speed up both cases, drive the xattr buffer
allocation into the attribute code once we know what the actual
xattr length is. For the no-xattr case, we avoid the allocation
completely, speeding up that case. For the common ACL case, we'll
end up with a fast heap allocation (because it'll be smaller than a
page), and only for the rarer "we have a remote xattr" will we have
a multi-page allocation occur. Hence the common ACL case will be
much faster, too.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-08-29 09:04:10 -07:00
if ( ! args - > value )
return - ENOMEM ;
}
2019-08-29 09:04:10 -07:00
args - > valuelen = valuelen ;
/* remote block xattr requires IO for copy-in */
if ( args - > rmtblkno )
return xfs_attr_rmtval_get ( args ) ;
/*
* This is to prevent a GCC warning because the remote xattr case
* doesn ' t have a value to pass in . In that case , we never reach here ,
* but GCC can ' t work that out and so throws a " passing NULL to
* memcpy " warning.
*/
if ( ! value )
return - EINVAL ;
memcpy ( args - > value , value , valuelen ) ;
return 0 ;
}
2005-04-16 15:20:36 -07:00
/*========================================================================
2005-11-02 10:34:53 +11:00
* External routines when attribute fork size < XFS_LITINO ( mp ) .
2005-04-16 15:20:36 -07:00
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
/*
xfs: fix forkoff miscalculation related to XFS_LITINO(mp)
Currently, commit e9e2eae89ddb dropped a (int) decoration from
XFS_LITINO(mp), and since sizeof() expression is also involved,
the result of XFS_LITINO(mp) simply has the size_t type
(commonly unsigned long).
Considering the expression in xfs_attr_shortform_bytesfit():
offset = (XFS_LITINO(mp) - bytes) >> 3;
let "bytes" be (int)340, and
"XFS_LITINO(mp)" be (unsigned long)336.
on 64-bit platform, the expression is
offset = ((unsigned long)336 - (int)340) >> 3 =
(int)(0xfffffffffffffffcUL >> 3) = -1
but on 32-bit platform, the expression is
offset = ((unsigned long)336 - (int)340) >> 3 =
(int)(0xfffffffcUL >> 3) = 0x1fffffff
instead.
so offset becomes a large positive number on 32-bit platforms, and
causes xfs_attr_shortform_bytesfit() to return maxforkoff rather than 0.
Therefore, one result is
"ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));"
assertion failure in xfs_idata_realloc(), which was also the root
cause of the original bugreport from Dennis, see:
https://bugzilla.redhat.com/show_bug.cgi?id=1894177
And it can also be manually triggered with the following commands:
$ touch a;
$ setfattr -n user.0 -v "`seq 0 80`" a;
$ setfattr -n user.1 -v "`seq 0 80`" a
on 32-bit platform.
Fix the case in xfs_attr_shortform_bytesfit() by bailing out early
when "XFS_LITINO(mp) < bytes", as suggested by Eric, and fix a misleading
comment together with this bugfix, as suggested by Darrick. It seems the
other users of XFS_LITINO(mp) are not impacted.
Fixes: e9e2eae89ddb ("xfs: only check the superblock version for dinode size calculation")
Cc: <stable@vger.kernel.org> # 5.7+
Reported-and-tested-by: Dennis Gilmore <dgilmore@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-11-14 11:06:01 -08:00
* Query whether the total requested number of attr fork bytes of extended
2005-11-02 10:34:53 +11:00
* attribute space will be able to fit inline .
2011-11-19 17:44:30 +00:00
*
2021-03-29 11:11:44 -07:00
* Returns zero if not , else the i_forkoff fork offset to be used in the
2005-11-02 10:34:53 +11:00
* literal area for attribute data once the new bytes have been added .
*
2021-03-29 11:11:44 -07:00
* i_forkoff must be 8 byte aligned , hence is stored as a > > 3 value ;
2005-11-02 10:34:53 +11:00
* special case for dev / uuid inodes , they have fixed size data forks .
2005-04-16 15:20:36 -07:00
*/
/*
 * dp: inode whose literal area is being sized.
 * bytes: total number of attr fork bytes requested.
 * Returns 0 if the bytes cannot fit, otherwise a fork offset that has been
 * shifted right by 3 (i.e. expressed in 8-byte units).
 */
int
xfs: fix inode fork extent count overflow
[commit message is verbose for discussion purposes - will trim it
down later. Some questions about implementation details at the end.]
Zorro Lang recently ran a new test to stress single inode extent
counts now that they are no longer limited by memory allocation.
The test was simply:
# xfs_io -f -c "falloc 0 40t" /mnt/scratch/big-file
# ~/src/xfstests-dev/punch-alternating /mnt/scratch/big-file
This test uncovered a problem where the hole punching operation
appeared to finish with no error, but apparently only created 268M
extents instead of the 10 billion it was supposed to.
Further, trying to punch out extents that should have been present
resulted in success, but no change in the extent count. It looked
like a silent failure.
While running the test and observing the behaviour in real time,
I observed the extent coutn growing at ~2M extents/minute, and saw
this after about an hour:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next ; \
> sleep 60 ; \
> xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 127657993
fsxattr.nextents = 129683339
#
And a few minutes later this:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4177861124
#
Ah, what? Where did that 4 billion extra extents suddenly come from?
Stop the workload, unmount, mount:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 166044375
#
And it's back at the expected number. i.e. the extent count is
correct on disk, but it's screwed up in memory. I loaded up the
extent list, and immediately:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4192576215
#
It's bad again. So, where does that number come from?
xfs_fill_fsxattr():
if (ip->i_df.if_flags & XFS_IFEXTENTS)
fa->fsx_nextents = xfs_iext_count(&ip->i_df);
else
fa->fsx_nextents = ip->i_d.di_nextents;
And that's the behaviour I just saw in a nutshell. The on disk count
is correct, but once the tree is loaded into memory, it goes whacky.
Clearly there's something wrong with xfs_iext_count():
inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
{
return ifp->if_bytes / sizeof(struct xfs_iext_rec);
}
Simple enough, but 134M extents is 2**27, and that's right about
where things went wrong. A struct xfs_iext_rec is 16 bytes in size,
which means 2**27 * 2**4 = 2**31 and we're right on target for an
integer overflow. And, sure enough:
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
....
Once we get 2**27 extents in a file, we overflow if_bytes and the
in-core extent count goes wrong. And when we reach 2**28 extents,
if_bytes wraps back to zero and things really start to go wrong
there. This is where the silent failure comes from - only the first
2**28 extents can be looked up directly due to the overflow, all the
extents above this index wrap back to somewhere in the first 2**28
extents. Hence with a regular pattern, trying to punch a hole in the
range that didn't have holes mapped to a hole in the first 2**28
extents and so "succeeded" without changing anything. Hence "silent
failure"...
Fix this by converting if_bytes to a int64_t and converting all the
index variables and size calculations to use int64_t types to avoid
overflows in future. Signed integers are still used to enable easy
detection of extent count underflows. This enables scalability of
extent counts to the limits of the on-disk format - MAXEXTNUM
(2**31) extents.
Current testing is at over 500M extents and still going:
fsxattr.nextents = 517310478
Reported-by: Zorro Lang <zlang@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-10-17 13:40:33 -07:00
xfs_attr_shortform_bytesfit (
struct xfs_inode * dp ,
int bytes )
2005-11-02 10:34:53 +11:00
{
xfs: fix inode fork extent count overflow
[commit message is verbose for discussion purposes - will trim it
down later. Some questions about implementation details at the end.]
Zorro Lang recently ran a new test to stress single inode extent
counts now that they are no longer limited by memory allocation.
The test was simply:
# xfs_io -f -c "falloc 0 40t" /mnt/scratch/big-file
# ~/src/xfstests-dev/punch-alternating /mnt/scratch/big-file
This test uncovered a problem where the hole punching operation
appeared to finish with no error, but apparently only created 268M
extents instead of the 10 billion it was supposed to.
Further, trying to punch out extents that should have been present
resulted in success, but no change in the extent count. It looked
like a silent failure.
While running the test and observing the behaviour in real time,
I observed the extent coutn growing at ~2M extents/minute, and saw
this after about an hour:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next ; \
> sleep 60 ; \
> xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 127657993
fsxattr.nextents = 129683339
#
And a few minutes later this:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4177861124
#
Ah, what? Where did that 4 billion extra extents suddenly come from?
Stop the workload, unmount, mount:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 166044375
#
And it's back at the expected number. i.e. the extent count is
correct on disk, but it's screwed up in memory. I loaded up the
extent list, and immediately:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4192576215
#
It's bad again. So, where does that number come from?
xfs_fill_fsxattr():
if (ip->i_df.if_flags & XFS_IFEXTENTS)
fa->fsx_nextents = xfs_iext_count(&ip->i_df);
else
fa->fsx_nextents = ip->i_d.di_nextents;
And that's the behaviour I just saw in a nutshell. The on disk count
is correct, but once the tree is loaded into memory, it goes whacky.
Clearly there's something wrong with xfs_iext_count():
inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
{
return ifp->if_bytes / sizeof(struct xfs_iext_rec);
}
Simple enough, but 134M extents is 2**27, and that's right about
where things went wrong. A struct xfs_iext_rec is 16 bytes in size,
which means 2**27 * 2**4 = 2**31 and we're right on target for an
integer overflow. And, sure enough:
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
....
Once we get 2**27 extents in a file, we overflow if_bytes and the
in-core extent count goes wrong. And when we reach 2**28 extents,
if_bytes wraps back to zero and things really start to go wrong
there. This is where the silent failure comes from - only the first
2**28 extents can be looked up directly due to the overflow, all the
extents above this index wrap back to somewhere in the first 2**28
extents. Hence with a regular pattern, trying to punch a hole in the
range that didn't have holes mapped to a hole in the first 2**28
extents and so "succeeded" without changing anything. Hence "silent
failure"...
Fix this by converting if_bytes to a int64_t and converting all the
index variables and size calculations to use int64_t types to avoid
overflows in future. Signed integers are still used to enable easy
detection of extent count underflows. This enables scalability of
extent counts to the limits of the on-disk format - MAXEXTNUM
(2**31) extents.
Current testing is at over 500M extents and still going:
fsxattr.nextents = 517310478
Reported-by: Zorro Lang <zlang@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-10-17 13:40:33 -07:00
struct xfs_mount * mp = dp - > i_mount ;
int64_t dsize ;
int minforkoff ;
int maxforkoff ;
int offset ;
2005-11-02 10:34:53 +11:00
xfs: fix forkoff miscalculation related to XFS_LITINO(mp)
Currently, commit e9e2eae89ddb dropped a (int) decoration from
XFS_LITINO(mp), and since sizeof() expression is also involved,
the result of XFS_LITINO(mp) is simply as the size_t type
(commonly unsigned long).
Considering the expression in xfs_attr_shortform_bytesfit():
offset = (XFS_LITINO(mp) - bytes) >> 3;
let "bytes" be (int)340, and
"XFS_LITINO(mp)" be (unsigned long)336.
on 64-bit platform, the expression is
offset = ((unsigned long)336 - (int)340) >> 3 =
(int)(0xfffffffffffffffcUL >> 3) = -1
but on 32-bit platform, the expression is
offset = ((unsigned long)336 - (int)340) >> 3 =
(int)(0xfffffffcUL >> 3) = 0x1fffffff
instead.
so offset becomes a large positive number on 32-bit platform, and
cause xfs_attr_shortform_bytesfit() returns maxforkoff rather than 0.
Therefore, one result is
"ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));"
assertion failure in xfs_idata_realloc(), which was also the root
cause of the original bugreport from Dennis, see:
https://bugzilla.redhat.com/show_bug.cgi?id=1894177
And it can also be manually triggered with the following commands:
$ touch a;
$ setfattr -n user.0 -v "`seq 0 80`" a;
$ setfattr -n user.1 -v "`seq 0 80`" a
on 32-bit platform.
Fix the case in xfs_attr_shortform_bytesfit() by bailing out
"XFS_LITINO(mp) < bytes" in advance suggested by Eric and a misleading
comment together with this bugfix suggested by Darrick. It seems the
other users of XFS_LITINO(mp) are not impacted.
Fixes: e9e2eae89ddb ("xfs: only check the superblock version for dinode size calculation")
Cc: <stable@vger.kernel.org> # 5.7+
Reported-and-tested-by: Dennis Gilmore <dgilmore@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-11-14 11:06:01 -08:00
/*
* Check if the new size could fit at all first :
*/
if ( bytes > XFS_LITINO ( mp ) )
return 0 ;
2013-03-12 23:30:36 +11:00
/* rounded down */
2020-03-18 08:15:10 -07:00
offset = ( XFS_LITINO ( mp ) - bytes ) > > 3 ;
2005-11-02 10:34:53 +11:00
2020-05-18 10:28:05 -07:00
if ( dp - > i_df . if_format = = XFS_DINODE_FMT_DEV ) {
2005-11-02 10:34:53 +11:00
/*
 * The only offset offered for a dev inode is the 8-byte-rounded size of
 * xfs_dev_t, and only when the requested bytes leave that much room.
 */
minforkoff = roundup ( sizeof ( xfs_dev_t ) , 8 ) > > 3 ;
return ( offset > = minforkoff ) ? minforkoff : 0 ;
}
2011-11-19 17:44:30 +00:00
/*
* If the requested numbers of bytes is smaller or equal to the
* current attribute fork size we can always proceed .
*
* Note that if_bytes in the data fork might actually be larger than
* the current data fork size is due to delalloc extents . In that
* case either the extent count will go down when they are converted
* to real extents , or the delalloc conversion will take care of the
* literal area rebalancing .
*/
2022-07-09 10:56:07 -07:00
if ( bytes < = xfs_inode_attr_fork_size ( dp ) )
2021-03-29 11:11:44 -07:00
return dp - > i_forkoff ;
2011-11-19 17:44:30 +00:00
/*
* For attr2 we can try to move the forkoff if there is space in the
* literal area , but for the old format we are done if there is no
* space in the fixed attribute fork .
*/
2021-08-18 18:46:52 -07:00
if ( ! xfs_has_attr2 ( mp ) )
2005-11-02 15:00:20 +11:00
return 0 ;
2007-02-10 18:35:58 +11:00
/*
 * Start from the in-core data fork byte count; the format-specific cases
 * below may replace it.
 */
dsize = dp - > i_df . if_bytes ;
2011-11-19 17:44:30 +00:00
2020-05-18 10:28:05 -07:00
switch ( dp - > i_df . if_format ) {
2007-02-10 18:35:58 +11:00
case XFS_DINODE_FMT_EXTENTS :
2011-11-19 17:44:30 +00:00
/*
2021-08-06 11:05:44 -07:00
* If there is no attr fork and the data fork is extents ,
2011-11-19 17:44:30 +00:00
* determine if creating the default attr fork will result
* in the extents form migrating to btree . If so , the
* minimum offset only needs to be the space required for
2007-02-10 18:35:58 +11:00
* the btree root .
2011-11-19 17:44:30 +00:00
*/
2021-03-29 11:11:44 -07:00
if ( ! dp - > i_forkoff & & dp - > i_df . if_bytes >
2009-03-29 19:26:46 +02:00
xfs_default_attroffset ( dp ) )
2007-02-10 18:35:58 +11:00
dsize = XFS_BMDR_SPACE_CALC ( MINDBTPTRS ) ;
break ;
case XFS_DINODE_FMT_BTREE :
/*
2011-11-19 17:44:30 +00:00
* If we have a data btree then keep forkoff if we have one ,
* otherwise we are adding a new attr , so then we set
* minforkoff to where the btree root can finish so we have
2007-02-10 18:35:58 +11:00
* plenty of room for attrs
*/
2021-03-29 11:11:44 -07:00
if ( dp - > i_forkoff ) {
if ( offset < dp - > i_forkoff )
2007-02-10 18:35:58 +11:00
return 0 ;
2021-03-29 11:11:44 -07:00
return dp - > i_forkoff ;
2011-11-19 17:44:30 +00:00
}
2013-04-21 14:53:46 -05:00
dsize = XFS_BMAP_BROOT_SPACE ( mp , dp - > i_df . if_broot ) ;
2007-02-10 18:35:58 +11:00
break ;
}
2011-11-19 17:44:30 +00:00
/*
* A data fork btree root must have space for at least
2007-02-10 18:35:58 +11:00
* MINDBTPTRS key / ptr pairs if the data fork is small or empty .
*/
xfs: fix inode fork extent count overflow
[commit message is verbose for discussion purposes - will trim it
down later. Some questions about implementation details at the end.]
Zorro Lang recently ran a new test to stress single inode extent
counts now that they are no longer limited by memory allocation.
The test was simply:
# xfs_io -f -c "falloc 0 40t" /mnt/scratch/big-file
# ~/src/xfstests-dev/punch-alternating /mnt/scratch/big-file
This test uncovered a problem where the hole punching operation
appeared to finish with no error, but apparently only created 268M
extents instead of the 10 billion it was supposed to.
Further, trying to punch out extents that should have been present
resulted in success, but no change in the extent count. It looked
like a silent failure.
While running the test and observing the behaviour in real time,
I observed the extent coutn growing at ~2M extents/minute, and saw
this after about an hour:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next ; \
> sleep 60 ; \
> xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 127657993
fsxattr.nextents = 129683339
#
And a few minutes later this:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4177861124
#
Ah, what? Where did that 4 billion extra extents suddenly come from?
Stop the workload, unmount, mount:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 166044375
#
And it's back at the expected number. i.e. the extent count is
correct on disk, but it's screwed up in memory. I loaded up the
extent list, and immediately:
# xfs_io -f -c "stat" /mnt/scratch/big-file |grep next
fsxattr.nextents = 4192576215
#
It's bad again. So, where does that number come from?
xfs_fill_fsxattr():
if (ip->i_df.if_flags & XFS_IFEXTENTS)
fa->fsx_nextents = xfs_iext_count(&ip->i_df);
else
fa->fsx_nextents = ip->i_d.di_nextents;
And that's the behaviour I just saw in a nutshell. The on disk count
is correct, but once the tree is loaded into memory, it goes whacky.
Clearly there's something wrong with xfs_iext_count():
inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
{
return ifp->if_bytes / sizeof(struct xfs_iext_rec);
}
Simple enough, but 134M extents is 2**27, and that's right about
where things went wrong. A struct xfs_iext_rec is 16 bytes in size,
which means 2**27 * 2**4 = 2**31 and we're right on target for an
integer overflow. And, sure enough:
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
....
Once we get 2**27 extents in a file, we overflow if_bytes and the
in-core extent count goes wrong. And when we reach 2**28 extents,
if_bytes wraps back to zero and things really start to go wrong
there. This is where the silent failure comes from - only the first
2**28 extents can be looked up directly due to the overflow, all the
extents above this index wrap back to somewhere in the first 2**28
extents. Hence with a regular pattern, trying to punch a hole in the
range that didn't have holes mapped to a hole in the first 2**28
extents and so "succeeded" without changing anything. Hence "silent
failure"...
Fix this by converting if_bytes to a int64_t and converting all the
index variables and size calculations to use int64_t types to avoid
overflows in future. Signed integers are still used to enable easy
detection of extent count underflows. This enables scalability of
extent counts to the limits of the on-disk format - MAXEXTNUM
(2**31) extents.
Current testing is at over 500M extents and still going:
fsxattr.nextents = 517310478
Reported-by: Zorro Lang <zlang@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-10-17 13:40:33 -07:00
minforkoff = max_t ( int64_t , dsize , XFS_BMDR_SPACE_CALC ( MINDBTPTRS ) ) ;
2005-11-02 10:34:53 +11:00
minforkoff = roundup ( minforkoff , 8 ) > > 3 ;
/* attr fork btree root can have at least this many key/ptr pairs */
2020-03-18 08:15:10 -07:00
maxforkoff = XFS_LITINO ( mp ) - XFS_BMDR_SPACE_CALC ( MINABTPTRS ) ;
2005-11-02 10:34:53 +11:00
maxforkoff = maxforkoff > > 3 ; /* rounded down */
/*
 * Cap the result at maxforkoff. Below that, the computed offset is only
 * usable if it is at least minforkoff; otherwise report that the bytes
 * do not fit.
 */
if ( offset > = maxforkoff )
return maxforkoff ;
2011-11-19 17:44:30 +00:00
if ( offset > = minforkoff )
return offset ;
2005-11-02 10:34:53 +11:00
return 0 ;
}
/*
2021-08-18 18:46:52 -07:00
* Switch on the ATTR2 superblock bit ( implies also FEATURES2 ) unless :
* - noattr2 mount option is set ,
* - on - disk version bit says it is already set , or
* - the attr2 mount option is not set to enable automatic upgrade from attr1 .
2005-11-02 10:34:53 +11:00
*/
/*
 * mp: mount whose superblock feature bits are checked and possibly updated.
 * tp: transaction handed to xfs_log_sb() when an update is made.
 * Returns nothing; the three early returns below leave everything untouched.
 */
STATIC void
2021-08-18 18:46:52 -07:00
xfs_sbversion_add_attr2 (
struct xfs_mount * mp ,
struct xfs_trans * tp )
2005-11-02 10:34:53 +11:00
{
2021-08-18 18:46:52 -07:00
/* noattr2 is set: never upgrade */
if ( xfs_has_noattr2 ( mp ) )
return ;
/* the ATTR2 bit is already present in sb_features2 */
if ( mp - > m_sb . sb_features2 & XFS_SB_VERSION2_ATTR2BIT )
return ;
/* attr2 is not enabled for this mount, so no automatic upgrade */
if ( ! xfs_has_attr2 ( mp ) )
return ;
/*
 * xfs_add_attr2() runs with m_sb_lock held (presumably it sets the ATTR2
 * feature bits - confirm against its definition); the superblock is then
 * logged through xfs_log_sb() in tp.
 */
spin_lock ( & mp - > m_sb_lock ) ;
xfs_add_attr2 ( mp ) ;
spin_unlock ( & mp - > m_sb_lock ) ;
xfs_log_sb ( tp ) ;
2005-11-02 10:34:53 +11:00
}
/*
* Create the initial contents of a shortform attribute list .
*
* @args: da args; args->dp is the inode whose attr fork is initialised and
*        args->trans is the transaction the inode is logged in.
*
* The attr fork must hold no bytes on entry (asserted). A fork in extents
* format is switched to local format; any other format is left untouched.
* xfs_idata_realloc() is then asked for sizeof(struct xfs_attr_sf_hdr) bytes,
* the returned header is zeroed, and its totsize is set to the header size,
* so the new list contains a header and no entries.
*/
void
2020-05-18 10:28:05 -07:00
xfs_attr_shortform_create (
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2020-05-18 10:28:05 -07:00
struct xfs_inode * dp = args - > dp ;
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-09 10:56:06 -07:00
/* The attr fork is embedded in the inode, so its address is taken directly. */
struct xfs_ifork * ifp = & dp - > i_af ;
2020-05-18 10:28:05 -07:00
struct xfs_attr_sf_hdr * hdr ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_sf_create ( args ) ;
2005-04-16 15:20:36 -07:00
/* Creation is only valid on an attr fork that currently holds no bytes. */
ASSERT ( ifp - > if_bytes = = 0 ) ;
2021-04-13 11:15:12 -07:00
/* Only an extents-format fork is converted; other formats are kept as is. */
if ( ifp - > if_format = = XFS_DINODE_FMT_EXTENTS )
2020-05-18 10:28:05 -07:00
ifp - > if_format = XFS_DINODE_FMT_LOCAL ;
2023-12-20 07:34:56 +01:00
/* The pointer returned by xfs_idata_realloc() is used as the header. */
hdr = xfs_idata_realloc ( dp , sizeof ( * hdr ) , XFS_ATTR_FORK ) ;
2020-08-26 14:12:18 -07:00
/* Zero every header field; only totsize is given a non-zero value below. */
memset ( hdr , 0 , sizeof ( * hdr ) ) ;
2006-03-17 17:29:25 +11:00
/* totsize is stored big-endian and starts out as just the header size. */
hdr - > totsize = cpu_to_be16 ( sizeof ( * hdr ) ) ;
2005-04-16 15:20:36 -07:00
/* Log with both the inode core and the attr fork data flags set. */
xfs_trans_log_inode ( args - > trans , dp , XFS_ILOG_CORE | XFS_ILOG_ADATA ) ;
}
2020-07-20 21:47:22 -07:00
/*
2023-12-20 07:34:58 +01:00
* Return the entry if the attr in args is found , or NULL if not .
*
* @args: da args; args->dp->i_af.if_data is read as the shortform header and
*        args is passed to xfs_attr_match() for every candidate entry.
*
* The lookup is a linear walk from xfs_attr_sf_firstentry() up to (but not
* including) xfs_attr_sf_endptr(), stepping with xfs_attr_sf_nextentry().
* The first entry whose namelen, nameval and flags satisfy xfs_attr_match()
* is returned; nothing in the list is modified by this function.
2020-07-20 21:47:22 -07:00
*/
2023-12-20 07:34:58 +01:00
struct xfs_attr_sf_entry *
2020-07-20 21:47:22 -07:00
xfs_attr_sf_findname (
2023-12-20 07:34:58 +01:00
struct xfs_da_args * args )
2020-07-20 21:47:22 -07:00
{
2023-12-20 07:35:01 +01:00
/* The in-core attr fork data pointer is used as the shortform header. */
struct xfs_attr_sf_hdr * sf = args - > dp - > i_af . if_data ;
2023-12-20 07:34:58 +01:00
struct xfs_attr_sf_entry * sfe ;
2020-07-20 21:47:22 -07:00
2023-12-20 07:35:01 +01:00
/* The end pointer is re-evaluated on every iteration of the walk. */
for ( sfe = xfs_attr_sf_firstentry ( sf ) ;
2023-12-20 07:34:58 +01:00
sfe < xfs_attr_sf_endptr ( sf ) ;
sfe = xfs_attr_sf_nextentry ( sfe ) ) {
if ( xfs_attr_match ( args , sfe - > namelen , sfe - > nameval ,
sfe - > flags ) )
return sfe ;
}
2020-07-20 21:47:22 -07:00
2023-12-20 07:34:58 +01:00
/* Walked the whole list without a match. */
return NULL ;
2020-07-20 21:47:22 -07:00
}
2005-04-16 15:20:36 -07:00
/*
* Add a name / value pair to the shortform attribute list .
* Overflow from the inode has already been checked for .
*
* @args:    da args supplying the inode (dp), transaction (trans), name,
*           namelen, value, valuelen and attr_filter of the new entry.
* @forkoff: value stored into dp->i_forkoff before the entry is added.
*
* The attr fork must already be in local format and must not contain an
* entry matching args (both asserted). The new entry is written at the
* position xfs_attr_sf_endptr() reports after the fork has been resized,
* with the value bytes placed immediately after the name bytes in nameval.
* The header count and totsize are then bumped, the inode is logged, and
* xfs_sbversion_add_attr2() is called for the mount.
*/
2005-11-02 10:34:53 +11:00
void
2020-07-20 21:47:22 -07:00
xfs_attr_shortform_add (
struct xfs_da_args * args ,
int forkoff )
2005-04-16 15:20:36 -07:00
{
2023-12-20 07:34:55 +01:00
struct xfs_inode * dp = args - > dp ;
struct xfs_mount * mp = dp - > i_mount ;
struct xfs_ifork * ifp = & dp - > i_af ;
2023-12-20 07:35:01 +01:00
/*
 * NOTE(review): this initial value is never read; sf is reassigned from
 * the xfs_idata_realloc() return value below before its first use.
 */
struct xfs_attr_sf_hdr * sf = ifp - > if_data ;
2020-07-20 21:47:22 -07:00
struct xfs_attr_sf_entry * sfe ;
2023-12-20 07:34:58 +01:00
int size ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_sf_add ( args ) ;
2021-03-29 11:11:44 -07:00
/* The caller-chosen fork offset is applied before the list is touched. */
dp - > i_forkoff = forkoff ;
2005-11-02 10:34:53 +11:00
2021-04-13 11:15:11 -07:00
ASSERT ( ifp - > if_format = = XFS_DINODE_FMT_LOCAL ) ;
2023-12-20 07:34:58 +01:00
/* Duplicate names are a caller bug, not something handled here. */
ASSERT ( ! xfs_attr_sf_findname ( args ) ) ;
2005-04-16 15:20:36 -07:00
2020-09-07 08:08:50 -07:00
/* Bytes needed for one entry holding this name and value. */
size = xfs_attr_sf_entsize_byname ( args - > namelen , args - > valuelen ) ;
2023-12-20 07:34:56 +01:00
/* NOTE(review): sf is re-read, presumably because the buffer may move. */
sf = xfs_idata_realloc ( dp , size , XFS_ATTR_FORK ) ;
2005-04-16 15:20:36 -07:00
2023-12-20 07:34:58 +01:00
/*
 * totsize has not been updated yet at this point.
 * NOTE(review): presumably endptr is derived from totsize and therefore
 * points at the old end of the list - confirm in xfs_attr_sf_endptr().
 */
sfe = xfs_attr_sf_endptr ( sf ) ;
2005-04-16 15:20:36 -07:00
sfe - > namelen = args - > namelen ;
2006-03-17 17:29:25 +11:00
sfe - > valuelen = args - > valuelen ;
2020-02-26 17:30:42 -08:00
sfe - > flags = args - > attr_filter ;
2005-04-16 15:20:36 -07:00
/* nameval holds the name first, then the value directly behind it. */
memcpy ( sfe - > nameval , args - > name , args - > namelen ) ;
memcpy ( & sfe - > nameval [ args - > namelen ] , args - > value , args - > valuelen ) ;
2023-12-20 07:35:01 +01:00
/* Account for the new entry; totsize is a big-endian 16-bit field. */
sf - > count + + ;
be16_add_cpu ( & sf - > totsize , size ) ;
2005-04-16 15:20:36 -07:00
xfs_trans_log_inode ( args - > trans , dp , XFS_ILOG_CORE | XFS_ILOG_ADATA ) ;
2005-11-02 10:34:53 +11:00
xfs_sbversion_add_attr2 ( mp , args - > trans ) ;
2005-04-16 15:20:36 -07:00
}
2009-02-04 09:36:00 +01:00
/*
* After the last attribute is removed revert to original inode format ,
* making all literal area available to the data fork once more .
*
* @ip: inode whose attr fork is being removed.
* @tp: transaction in which the inode core is logged.
*
* The attr fork must map no extents on entry (asserted). The function calls
* xfs_ifork_zap_attr() on the inode, sets i_forkoff to zero and logs the
* inode with XFS_ILOG_CORE only.
*/
2015-05-29 07:40:08 +10:00
void
xfs_attr_fork_remove (
2009-02-04 09:36:00 +01:00
struct xfs_inode * ip ,
struct xfs_trans * tp )
{
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-09 10:56:06 -07:00
/* The attr fork must not map any extents by the time it is removed. */
ASSERT ( ip - > i_af . if_nextents = = 0 ) ;
2020-05-18 10:27:22 -07:00
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-09 10:56:06 -07:00
/*
 * NOTE(review): presumably resets the in-core attr fork now that it is
 * embedded in the inode rather than separately allocated - confirm against
 * the definition of xfs_ifork_zap_attr().
 */
xfs_ifork_zap_attr ( ip ) ;
2021-03-29 11:11:44 -07:00
/* Zeroed fork offset; see the header comment about the literal area. */
ip - > i_forkoff = 0 ;
2009-02-04 09:36:00 +01:00
/* Only XFS_ILOG_CORE is passed here; no attr-data flag is set. */
xfs_trans_log_inode ( tp , ip , XFS_ILOG_CORE ) ;
}
2005-04-16 15:20:36 -07:00
/*
2005-11-02 10:34:53 +11:00
* Remove an attribute from the shortform attribute list structure .
2005-04-16 15:20:36 -07:00
*/
int
2021-05-28 15:15:05 -07:00
xfs_attr_sf_removename (
2020-07-20 21:47:22 -07:00
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2023-12-20 07:34:55 +01:00
struct xfs_inode * dp = args - > dp ;
struct xfs_mount * mp = dp - > i_mount ;
2023-12-20 07:35:01 +01:00
struct xfs_attr_sf_hdr * sf = dp - > i_af . if_data ;
2020-07-20 21:47:22 -07:00
struct xfs_attr_sf_entry * sfe ;
2023-12-20 07:35:01 +01:00
uint16_t totsize = be16_to_cpu ( sf - > totsize ) ;
2023-12-20 07:34:58 +01:00
void * next , * end ;
int size = 0 ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_sf_remove ( args ) ;
2023-12-20 07:34:58 +01:00
sfe = xfs_attr_sf_findname ( args ) ;
if ( ! sfe ) {
/*
* If we are recovering an operation , finding nothing to remove
* is not an error , it just means there was nothing to clean up .
*/
if ( args - > op_flags & XFS_DA_OP_RECOVERY )
return 0 ;
return - ENOATTR ;
}
2005-04-16 15:20:36 -07:00
2005-11-02 10:34:53 +11:00
/*
* Fix up the attribute fork data , covering the hole
*/
2023-12-20 07:34:58 +01:00
size = xfs_attr_sf_entsize ( sfe ) ;
next = xfs_attr_sf_nextentry ( sfe ) ;
end = xfs_attr_sf_endptr ( sf ) ;
if ( next < end )
memmove ( sfe , next , end - next ) ;
2023-12-20 07:35:01 +01:00
sf - > count - - ;
2023-12-20 07:34:58 +01:00
totsize - = size ;
2023-12-20 07:35:01 +01:00
sf - > totsize = cpu_to_be16 ( totsize ) ;
2005-11-02 10:34:53 +11:00
/*
* Fix up the start offset of the attribute fork
*/
2023-12-20 07:35:02 +01:00
if ( totsize = = sizeof ( struct xfs_attr_sf_hdr ) & & xfs_has_attr2 ( mp ) & &
2020-05-18 10:28:05 -07:00
( dp - > i_df . if_format ! = XFS_DINODE_FMT_BTREE ) & &
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
TO fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, the we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag at in the last transaction taht logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
! ( args - > op_flags & ( XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE ) ) ) {
2015-05-29 07:40:08 +10:00
xfs_attr_fork_remove ( dp , args - > trans ) ;
2005-11-02 10:34:53 +11:00
} else {
xfs_idata_realloc ( dp , - size , XFS_ATTR_FORK ) ;
2021-03-29 11:11:44 -07:00
dp - > i_forkoff = xfs_attr_shortform_bytesfit ( dp , totsize ) ;
ASSERT ( dp - > i_forkoff ) ;
2023-12-20 07:35:02 +01:00
ASSERT ( totsize > sizeof ( struct xfs_attr_sf_hdr ) | |
2008-05-21 16:42:05 +10:00
( args - > op_flags & XFS_DA_OP_ADDNAME ) | |
2021-08-18 18:46:52 -07:00
! xfs_has_attr2 ( mp ) | |
2020-05-18 10:28:05 -07:00
dp - > i_df . if_format = = XFS_DINODE_FMT_BTREE ) ;
2005-11-02 10:34:53 +11:00
xfs_trans_log_inode ( args - > trans , dp ,
XFS_ILOG_CORE | XFS_ILOG_ADATA ) ;
}
xfs_sbversion_add_attr2 ( mp , args - > trans ) ;
2005-04-16 15:20:36 -07:00
2014-06-22 15:03:54 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
2019-11-07 13:24:52 -08:00
* Retrieve the attribute value and length .
2019-08-29 09:04:08 -07:00
*
2020-02-26 17:30:35 -08:00
* If args - > valuelen is zero , only the length needs to be returned . Unlike a
* lookup , we only return an error if the attribute does not exist or we can ' t
* retrieve the value .
2005-04-16 15:20:36 -07:00
*/
int
2019-08-29 09:04:10 -07:00
xfs_attr_shortform_getvalue (
2023-12-20 07:34:55 +01:00
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2023-12-20 07:34:55 +01:00
struct xfs_attr_sf_entry * sfe ;
2005-04-16 15:20:36 -07:00
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-09 10:56:06 -07:00
ASSERT ( args - > dp - > i_af . if_format = = XFS_DINODE_FMT_LOCAL ) ;
2023-12-20 07:34:57 +01:00
trace_xfs_attr_sf_lookup ( args ) ;
2023-12-20 07:35:00 +01:00
sfe = xfs_attr_sf_findname ( args ) ;
if ( ! sfe )
return - ENOATTR ;
return xfs_attr_copy_value ( args , & sfe - > nameval [ args - > namelen ] ,
sfe - > valuelen ) ;
2005-04-16 15:20:36 -07:00
}
2022-06-25 10:01:20 -07:00
/* Convert from using the shortform to the leaf format. */
2005-04-16 15:20:36 -07:00
int
2017-12-07 19:07:02 -08:00
xfs_attr_shortform_to_leaf (
2022-06-25 10:01:20 -07:00
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2023-12-20 07:34:55 +01:00
struct xfs_inode * dp = args - > dp ;
struct xfs_ifork * ifp = & dp - > i_af ;
2023-12-20 07:35:01 +01:00
struct xfs_attr_sf_hdr * sf = ifp - > if_data ;
2018-07-11 22:26:11 -07:00
struct xfs_attr_sf_entry * sfe ;
2023-12-20 07:35:01 +01:00
int size = be16_to_cpu ( sf - > totsize ) ;
2018-07-11 22:26:11 -07:00
struct xfs_da_args nargs ;
char * tmpbuffer ;
2023-12-20 07:35:01 +01:00
int error , i ;
2018-07-11 22:26:11 -07:00
xfs_dablk_t blkno ;
struct xfs_buf * bp ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_sf_to_leaf ( args ) ;
2024-01-16 09:59:40 +11:00
tmpbuffer = kmalloc ( size , GFP_KERNEL | __GFP_NOFAIL ) ;
2023-12-20 07:34:55 +01:00
memcpy ( tmpbuffer , ifp - > if_data , size ) ;
2023-12-20 07:35:01 +01:00
sf = ( struct xfs_attr_sf_hdr * ) tmpbuffer ;
2005-04-16 15:20:36 -07:00
xfs_idata_realloc ( dp , - size , XFS_ATTR_FORK ) ;
2019-10-07 12:54:16 -07:00
xfs_bmap_local_to_extents_empty ( args - > trans , dp , XFS_ATTR_FORK ) ;
2013-07-10 07:04:00 +10:00
2005-04-16 15:20:36 -07:00
bp = NULL ;
error = xfs_da_grow_inode ( args , & blkno ) ;
2019-10-07 12:54:15 -07:00
if ( error )
2005-04-16 15:20:36 -07:00
goto out ;
ASSERT ( blkno = = 0 ) ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_create ( args , blkno , & bp ) ;
2019-10-07 12:54:15 -07:00
if ( error )
2005-04-16 15:20:36 -07:00
goto out ;
memset ( ( char * ) & nargs , 0 , sizeof ( nargs ) ) ;
nargs . dp = dp ;
2014-06-06 15:01:58 +10:00
nargs . geo = args - > geo ;
2005-04-16 15:20:36 -07:00
nargs . total = args - > total ;
nargs . whichfork = XFS_ATTR_FORK ;
nargs . trans = args - > trans ;
2008-05-21 16:42:05 +10:00
nargs . op_flags = XFS_DA_OP_OKNOENT ;
2005-04-16 15:20:36 -07:00
2023-12-20 07:35:01 +01:00
sfe = xfs_attr_sf_firstentry ( sf ) ;
for ( i = 0 ; i < sf - > count ; i + + ) {
2010-01-20 10:47:48 +11:00
nargs . name = sfe - > nameval ;
2005-04-16 15:20:36 -07:00
nargs . namelen = sfe - > namelen ;
2010-01-20 10:47:48 +11:00
nargs . value = & sfe - > nameval [ nargs . namelen ] ;
2006-03-17 17:29:25 +11:00
nargs . valuelen = sfe - > valuelen ;
2010-01-20 10:47:48 +11:00
nargs . hashval = xfs_da_hashname ( sfe - > nameval ,
2005-04-16 15:20:36 -07:00
sfe - > namelen ) ;
2020-02-26 17:30:42 -08:00
nargs . attr_filter = sfe - > flags & XFS_ATTR_NSP_ONDISK_MASK ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_lookup_int ( bp , & nargs ) ; /* set a->index */
2014-06-25 14:58:08 +10:00
ASSERT ( error = = - ENOATTR ) ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_add ( bp , & nargs ) ;
2014-06-25 14:58:08 +10:00
ASSERT ( error ! = - ENOSPC ) ;
2005-04-16 15:20:36 -07:00
if ( error )
goto out ;
2020-09-07 08:08:50 -07:00
sfe = xfs_attr_sf_nextentry ( sfe ) ;
2005-04-16 15:20:36 -07:00
}
error = 0 ;
out :
2024-01-16 09:59:43 +11:00
kfree ( tmpbuffer ) ;
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
}
/*
* Check a leaf attribute block to see if all the entries would fit into
* a shortform attribute list .
*/
int
2012-06-22 18:50:14 +10:00
xfs_attr_shortform_allfit (
2013-05-20 09:51:14 +10:00
struct xfs_buf * bp ,
struct xfs_inode * dp )
2005-04-16 15:20:36 -07:00
{
2013-05-20 09:51:14 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr_leaf_entry * entry ;
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_name_local_t * name_loc ;
2013-05-20 09:51:14 +10:00
struct xfs_attr3_icleaf_hdr leafhdr ;
int bytes ;
int i ;
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = bp - > b_mount ;
2005-04-16 15:20:36 -07:00
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( mp - > m_attr_geo , & leafhdr , leaf ) ;
2013-05-20 09:51:14 +10:00
entry = xfs_attr3_leaf_entryp ( leaf ) ;
2005-04-16 15:20:36 -07:00
bytes = sizeof ( struct xfs_attr_sf_hdr ) ;
2013-05-20 09:51:14 +10:00
for ( i = 0 ; i < leafhdr . count ; entry + + , i + + ) {
2005-04-16 15:20:36 -07:00
if ( entry - > flags & XFS_ATTR_INCOMPLETE )
continue ; /* don't copy partial entries */
if ( ! ( entry - > flags & XFS_ATTR_LOCAL ) )
2014-06-22 15:03:54 +10:00
return 0 ;
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , i ) ;
2005-04-16 15:20:36 -07:00
if ( name_loc - > namelen > = XFS_ATTR_SF_ENTSIZE_MAX )
2014-06-22 15:03:54 +10:00
return 0 ;
2006-03-17 17:29:09 +11:00
if ( be16_to_cpu ( name_loc - > valuelen ) > = XFS_ATTR_SF_ENTSIZE_MAX )
2014-06-22 15:03:54 +10:00
return 0 ;
2020-09-07 08:08:50 -07:00
bytes + = xfs_attr_sf_entsize_byname ( name_loc - > namelen ,
2020-09-04 09:51:39 -07:00
be16_to_cpu ( name_loc - > valuelen ) ) ;
2005-04-16 15:20:36 -07:00
}
2021-08-18 18:46:52 -07:00
if ( xfs_has_attr2 ( dp - > i_mount ) & &
2020-05-18 10:28:05 -07:00
( dp - > i_df . if_format ! = XFS_DINODE_FMT_BTREE ) & &
2005-11-25 16:42:22 +11:00
( bytes = = sizeof ( struct xfs_attr_sf_hdr ) ) )
2013-05-20 09:51:14 +10:00
return - 1 ;
return xfs_attr_shortform_bytesfit ( dp , bytes ) ;
2005-04-16 15:20:36 -07:00
}
2023-12-15 10:03:37 -08:00
/* Verify the consistency of a raw inline attribute fork. */
2018-01-08 10:51:05 -08:00
xfs_failaddr_t
xfs_attr_shortform_verify (
2023-12-20 07:35:01 +01:00
struct xfs_attr_sf_hdr * sfp ,
2023-12-15 10:03:37 -08:00
size_t size )
2018-01-08 10:51:05 -08:00
{
2023-12-20 07:35:01 +01:00
struct xfs_attr_sf_entry * sfep = xfs_attr_sf_firstentry ( sfp ) ;
2018-01-08 10:51:05 -08:00
struct xfs_attr_sf_entry * next_sfep ;
char * endp ;
int i ;
/*
* Give up if the attribute is way too short .
*/
if ( size < sizeof ( struct xfs_attr_sf_hdr ) )
return __this_address ;
endp = ( char * ) sfp + size ;
/* Check all reported entries */
2023-12-20 07:35:01 +01:00
for ( i = 0 ; i < sfp - > count ; i + + ) {
2018-01-08 10:51:05 -08:00
/*
* struct xfs_attr_sf_entry has a variable length .
* Check the fixed - offset parts of the structure are
* within the data buffer .
2020-08-26 14:11:58 -07:00
* xfs_attr_sf_entry is defined with a 1 - byte variable
* array at the end , so we must subtract that off .
2018-01-08 10:51:05 -08:00
*/
2020-09-04 09:51:39 -07:00
if ( ( ( char * ) sfep + sizeof ( * sfep ) ) > = endp )
2018-01-08 10:51:05 -08:00
return __this_address ;
/* Don't allow names with known bad length. */
if ( sfep - > namelen = = 0 )
return __this_address ;
/*
* Check that the variable - length part of the structure is
* within the data buffer . The next entry starts after the
* name component , so nextentry is an acceptable test .
*/
2020-09-07 08:08:50 -07:00
next_sfep = xfs_attr_sf_nextentry ( sfep ) ;
2018-01-08 10:51:05 -08:00
if ( ( char * ) next_sfep > endp )
return __this_address ;
/*
* Check for unknown flags . Short form doesn ' t support
* the incomplete or local bits , so we can use the namespace
* mask here .
*/
if ( sfep - > flags & ~ XFS_ATTR_NSP_ONDISK_MASK )
return __this_address ;
/*
* Check for invalid namespace combinations . We only allow
* one namespace flag per xattr , so we can just count the
* bits ( i . e . hweight ) here .
*/
if ( hweight8 ( sfep - > flags & XFS_ATTR_NSP_ONDISK_MASK ) > 1 )
return __this_address ;
sfep = next_sfep ;
}
if ( ( void * ) sfep ! = ( void * ) endp )
return __this_address ;
return NULL ;
}
2005-04-16 15:20:36 -07:00
/*
* Convert a leaf attribute list to shortform attribute list
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_to_shortform (
struct xfs_buf * bp ,
struct xfs_da_args * args ,
int forkoff )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_name_local * name_loc ;
struct xfs_da_args nargs ;
struct xfs_inode * dp = args - > dp ;
char * tmpbuffer ;
int error ;
int i ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_to_sf ( args ) ;
2024-01-16 09:59:40 +11:00
tmpbuffer = kmalloc ( args - > geo - > blksize , GFP_KERNEL | __GFP_NOFAIL ) ;
2013-04-24 18:58:55 +10:00
if ( ! tmpbuffer )
2014-06-25 14:58:08 +10:00
return - ENOMEM ;
2005-04-16 15:20:36 -07:00
2014-06-06 15:21:45 +10:00
memcpy ( tmpbuffer , bp - > b_addr , args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
2005-04-16 15:20:36 -07:00
leaf = ( xfs_attr_leafblock_t * ) tmpbuffer ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( leaf ) ;
/* XXX (dgc): buffer is about to be marked stale - why zero it? */
2014-06-06 15:21:45 +10:00
memset ( bp - > b_addr , 0 , args - > geo - > blksize ) ;
2005-04-16 15:20:36 -07:00
/*
* Clean out the prior contents of the attribute list .
*/
error = xfs_da_shrink_inode ( args , 0 , bp ) ;
if ( error )
goto out ;
2005-11-02 10:34:53 +11:00
if ( forkoff = = - 1 ) {
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
TO fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, the we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag at in the last transaction taht logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
/*
* Don ' t remove the attr fork if this operation is the first
* part of a attr replace operations . We ' re going to add a new
* attr immediately , so we need to keep the attr fork around in
* this case .
*/
if ( ! ( args - > op_flags & XFS_DA_OP_REPLACE ) ) {
ASSERT ( xfs_has_attr2 ( dp - > i_mount ) ) ;
ASSERT ( dp - > i_df . if_format ! = XFS_DINODE_FMT_BTREE ) ;
xfs_attr_fork_remove ( dp , args - > trans ) ;
}
2005-04-16 15:20:36 -07:00
goto out ;
2005-11-02 10:34:53 +11:00
}
xfs_attr_shortform_create ( args ) ;
2005-04-16 15:20:36 -07:00
/*
* Copy the attributes
*/
memset ( ( char * ) & nargs , 0 , sizeof ( nargs ) ) ;
2014-06-06 15:01:58 +10:00
nargs . geo = args - > geo ;
2005-04-16 15:20:36 -07:00
nargs . dp = dp ;
nargs . total = args - > total ;
nargs . whichfork = XFS_ATTR_FORK ;
nargs . trans = args - > trans ;
2008-05-21 16:42:05 +10:00
nargs . op_flags = XFS_DA_OP_OKNOENT ;
2013-04-24 18:58:55 +10:00
for ( i = 0 ; i < ichdr . count ; entry + + , i + + ) {
2005-04-16 15:20:36 -07:00
if ( entry - > flags & XFS_ATTR_INCOMPLETE )
continue ; /* don't copy partial entries */
if ( ! entry - > nameidx )
continue ;
ASSERT ( entry - > flags & XFS_ATTR_LOCAL ) ;
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , i ) ;
2010-01-20 10:47:48 +11:00
nargs . name = name_loc - > nameval ;
2005-04-16 15:20:36 -07:00
nargs . namelen = name_loc - > namelen ;
2010-01-20 10:47:48 +11:00
nargs . value = & name_loc - > nameval [ nargs . namelen ] ;
2006-03-17 17:29:09 +11:00
nargs . valuelen = be16_to_cpu ( name_loc - > valuelen ) ;
2006-03-17 17:29:02 +11:00
nargs . hashval = be32_to_cpu ( entry - > hashval ) ;
2020-02-26 17:30:42 -08:00
nargs . attr_filter = entry - > flags & XFS_ATTR_NSP_ONDISK_MASK ;
2005-11-02 10:34:53 +11:00
xfs_attr_shortform_add ( & nargs , forkoff ) ;
2005-04-16 15:20:36 -07:00
}
error = 0 ;
out :
2024-01-16 09:59:43 +11:00
kfree ( tmpbuffer ) ;
2013-04-24 18:58:55 +10:00
return error ;
2005-04-16 15:20:36 -07:00
}
/*
* Convert from using a single leaf to a root node and a leaf .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_to_node (
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr icleafhdr ;
struct xfs_attr_leaf_entry * entries ;
struct xfs_da3_icnode_hdr icnodehdr ;
struct xfs_da_intnode * node ;
struct xfs_inode * dp = args - > dp ;
struct xfs_mount * mp = dp - > i_mount ;
struct xfs_buf * bp1 = NULL ;
struct xfs_buf * bp2 = NULL ;
xfs_dablk_t blkno ;
int error ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_to_node ( args ) ;
2022-05-11 17:01:23 +10:00
if ( XFS_TEST_ERROR ( false , mp , XFS_ERRTAG_ATTR_LEAF_TO_NODE ) ) {
error = - EIO ;
goto out ;
}
2005-04-16 15:20:36 -07:00
error = xfs_da_grow_inode ( args , & blkno ) ;
if ( error )
goto out ;
2019-11-20 09:46:02 -08:00
error = xfs_attr3_leaf_read ( args - > trans , dp , 0 , & bp1 ) ;
2005-04-16 15:20:36 -07:00
if ( error )
goto out ;
2012-11-12 22:54:16 +11:00
2019-11-20 09:46:05 -08:00
error = xfs_da_get_buf ( args - > trans , dp , blkno , & bp2 , XFS_ATTR_FORK ) ;
2005-04-16 15:20:36 -07:00
if ( error )
goto out ;
2013-04-24 18:58:55 +10:00
2023-12-05 13:59:00 +08:00
/*
* Copy leaf to new buffer and log it .
*/
xfs_da_buf_copy ( bp2 , bp1 , args - > geo - > blksize ) ;
2014-06-06 15:21:45 +10:00
xfs_trans_log_buf ( args - > trans , bp2 , 0 , args - > geo - > blksize - 1 ) ;
2005-04-16 15:20:36 -07:00
/*
* Set up the new root node .
*/
2013-04-24 18:58:02 +10:00
error = xfs_da3_node_create ( args , 0 , 1 , & bp1 , XFS_ATTR_FORK ) ;
2005-04-16 15:20:36 -07:00
if ( error )
goto out ;
2012-06-22 18:50:14 +10:00
node = bp1 - > b_addr ;
2019-11-08 14:53:00 -08:00
xfs_da3_node_hdr_from_disk ( mp , & icnodehdr , node ) ;
2013-04-24 18:58:55 +10:00
2012-06-22 18:50:14 +10:00
leaf = bp2 - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & icleafhdr , leaf ) ;
2013-04-24 18:58:55 +10:00
entries = xfs_attr3_leaf_entryp ( leaf ) ;
2005-04-16 15:20:36 -07:00
/* both on-disk, don't endian-flip twice */
2019-11-08 14:57:48 -08:00
icnodehdr . btree [ 0 ] . hashval = entries [ icleafhdr . count - 1 ] . hashval ;
icnodehdr . btree [ 0 ] . before = cpu_to_be32 ( blkno ) ;
2013-04-24 18:58:55 +10:00
icnodehdr . count = 1 ;
2019-11-08 14:57:48 -08:00
xfs_da3_node_hdr_to_disk ( dp - > i_mount , node , & icnodehdr ) ;
2014-06-06 15:21:45 +10:00
xfs_trans_log_buf ( args - > trans , bp1 , 0 , args - > geo - > blksize - 1 ) ;
2005-04-16 15:20:36 -07:00
error = 0 ;
out :
2013-04-24 18:58:55 +10:00
return error ;
2005-04-16 15:20:36 -07:00
}
/*========================================================================
* Routines used for growing the Btree .
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
/*
* Create the initial contents of a leaf attribute list
* or a leaf in a node attribute list .
*/
2005-06-21 15:36:52 +10:00
STATIC int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_create (
struct xfs_da_args * args ,
xfs_dablk_t blkno ,
struct xfs_buf * * bpp )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_inode * dp = args - > dp ;
struct xfs_mount * mp = dp - > i_mount ;
struct xfs_buf * bp ;
int error ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_create ( args ) ;
2019-11-20 09:46:05 -08:00
error = xfs_da_get_buf ( args - > trans , args - > dp , blkno , & bp ,
2005-04-16 15:20:36 -07:00
XFS_ATTR_FORK ) ;
if ( error )
2013-04-24 18:58:55 +10:00
return error ;
bp - > b_ops = & xfs_attr3_leaf_buf_ops ;
2013-04-03 16:11:30 +11:00
xfs_trans_buf_set_type ( args - > trans , bp , XFS_BLFT_ATTR_LEAF_BUF ) ;
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2014-06-06 15:21:45 +10:00
memset ( leaf , 0 , args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
memset ( & ichdr , 0 , sizeof ( ichdr ) ) ;
2014-06-06 15:21:45 +10:00
ichdr . firstused = args - > geo - > blksize ;
2013-04-24 18:58:55 +10:00
2021-08-18 18:46:37 -07:00
if ( xfs_has_crc ( mp ) ) {
2013-04-24 18:58:55 +10:00
struct xfs_da3_blkinfo * hdr3 = bp - > b_addr ;
ichdr . magic = XFS_ATTR3_LEAF_MAGIC ;
2021-08-18 18:47:05 -07:00
hdr3 - > blkno = cpu_to_be64 ( xfs_buf_daddr ( bp ) ) ;
2013-04-24 18:58:55 +10:00
hdr3 - > owner = cpu_to_be64 ( dp - > i_ino ) ;
2015-07-29 11:53:31 +10:00
uuid_copy ( & hdr3 - > uuid , & mp - > m_sb . sb_meta_uuid ) ;
2005-04-16 15:20:36 -07:00
2013-04-24 18:58:55 +10:00
ichdr . freemap [ 0 ] . base = sizeof ( struct xfs_attr3_leaf_hdr ) ;
} else {
ichdr . magic = XFS_ATTR_LEAF_MAGIC ;
ichdr . freemap [ 0 ] . base = sizeof ( struct xfs_attr_leaf_hdr ) ;
}
ichdr . freemap [ 0 ] . size = ichdr . firstused - ichdr . freemap [ 0 ] . base ;
2005-04-16 15:20:36 -07:00
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( args - > geo , leaf , & ichdr ) ;
2014-06-06 15:21:45 +10:00
xfs_trans_log_buf ( args - > trans , bp , 0 , args - > geo - > blksize - 1 ) ;
2005-04-16 15:20:36 -07:00
* bpp = bp ;
2013-04-24 18:58:55 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Split the leaf node , rebalance , then add the new entry .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_split (
struct xfs_da_state * state ,
struct xfs_da_state_blk * oldblk ,
struct xfs_da_state_blk * newblk )
2005-04-16 15:20:36 -07:00
{
xfs_dablk_t blkno ;
int error ;
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_split ( state - > args ) ;
2005-04-16 15:20:36 -07:00
/*
* Allocate space for a new leaf node .
*/
ASSERT ( oldblk - > magic = = XFS_ATTR_LEAF_MAGIC ) ;
error = xfs_da_grow_inode ( state - > args , & blkno ) ;
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_create ( state - > args , blkno , & newblk - > bp ) ;
2005-04-16 15:20:36 -07:00
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
newblk - > blkno = blkno ;
newblk - > magic = XFS_ATTR_LEAF_MAGIC ;
/*
* Rebalance the entries across the two leaves .
* NOTE : rebalance ( ) currently depends on the 2 nd block being empty .
*/
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_rebalance ( state , oldblk , newblk ) ;
2013-04-24 18:58:02 +10:00
error = xfs_da3_blk_link ( state , oldblk , newblk ) ;
2005-04-16 15:20:36 -07:00
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
/*
* Save info on " old " attribute for " atomic rename " ops , leaf_add ( )
* modifies the index / blkno / rmtblk / rmtblkcnt fields to show the
* " new " attrs info . Will need the " old " info to remove it later .
*
* Insert the " new " entry in the correct block .
*/
2012-03-22 05:15:13 +00:00
if ( state - > inleaf ) {
trace_xfs_attr_leaf_add_old ( state - > args ) ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_add ( oldblk - > bp , state - > args ) ;
2012-03-22 05:15:13 +00:00
} else {
trace_xfs_attr_leaf_add_new ( state - > args ) ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_add ( newblk - > bp , state - > args ) ;
2012-03-22 05:15:13 +00:00
}
2005-04-16 15:20:36 -07:00
/*
* Update last hashval in each block since we added the name .
*/
oldblk - > hashval = xfs_attr_leaf_lasthash ( oldblk - > bp , NULL ) ;
newblk - > hashval = xfs_attr_leaf_lasthash ( newblk - > bp , NULL ) ;
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
}
/*
* Add a name to the leaf attribute list structure .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_add (
2012-06-22 18:50:14 +10:00
struct xfs_buf * bp ,
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
int tablesize ;
int entsize ;
int sum ;
int tmp ;
int i ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_add ( args ) ;
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index > = 0 & & args - > index < = ichdr . count ) ;
2014-06-06 15:21:27 +10:00
entsize = xfs_attr_leaf_newentsize ( args , NULL ) ;
2005-04-16 15:20:36 -07:00
/*
* Search through freemap for first - fit on new name length .
* ( may need to figure in size of entry struct too )
*/
2013-04-24 18:58:55 +10:00
tablesize = ( ichdr . count + 1 ) * sizeof ( xfs_attr_leaf_entry_t )
+ xfs_attr3_leaf_hdr_size ( leaf ) ;
for ( sum = 0 , i = XFS_ATTR_LEAF_MAPSIZE - 1 ; i > = 0 ; i - - ) {
if ( tablesize > ichdr . firstused ) {
sum + = ichdr . freemap [ i ] . size ;
2005-04-16 15:20:36 -07:00
continue ;
}
2013-04-24 18:58:55 +10:00
if ( ! ichdr . freemap [ i ] . size )
2005-04-16 15:20:36 -07:00
continue ; /* no space in this map */
tmp = entsize ;
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ i ] . base < ichdr . firstused )
2005-04-16 15:20:36 -07:00
tmp + = sizeof ( xfs_attr_leaf_entry_t ) ;
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ i ] . size > = tmp ) {
tmp = xfs_attr3_leaf_add_work ( bp , & ichdr , args , i ) ;
goto out_log_hdr ;
2005-04-16 15:20:36 -07:00
}
2013-04-24 18:58:55 +10:00
sum + = ichdr . freemap [ i ] . size ;
2005-04-16 15:20:36 -07:00
}
/*
* If there are no holes in the address space of the block ,
* and we don ' t have enough freespace , then compaction will do us
* no good and we should just give up .
*/
2013-04-24 18:58:55 +10:00
if ( ! ichdr . holes & & sum < entsize )
2014-06-25 14:58:08 +10:00
return - ENOSPC ;
2005-04-16 15:20:36 -07:00
/*
* Compact the entries to coalesce free space .
* This may change the hdr - > count via dropping INCOMPLETE entries .
*/
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_compact ( args , & ichdr , bp ) ;
2005-04-16 15:20:36 -07:00
/*
* After compaction , the block is guaranteed to have only one
* free region , in freemap [ 0 ] . If it is not big enough , give up .
*/
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ 0 ] . size < ( entsize + sizeof ( xfs_attr_leaf_entry_t ) ) ) {
2014-06-25 14:58:08 +10:00
tmp = - ENOSPC ;
2013-04-24 18:58:55 +10:00
goto out_log_hdr ;
}
tmp = xfs_attr3_leaf_add_work ( bp , & ichdr , args , 0 ) ;
2005-04-16 15:20:36 -07:00
2013-04-24 18:58:55 +10:00
out_log_hdr :
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( args - > geo , leaf , & ichdr ) ;
2013-04-24 18:58:55 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
XFS_DA_LOGRANGE ( leaf , & leaf - > hdr ,
xfs_attr3_leaf_hdr_size ( leaf ) ) ) ;
return tmp ;
2005-04-16 15:20:36 -07:00
}
/*
* Add a name to a leaf attribute list structure .
*/
STATIC int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_add_work (
struct xfs_buf * bp ,
struct xfs_attr3_icleaf_hdr * ichdr ,
struct xfs_da_args * args ,
int mapindex )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_name_local * name_loc ;
struct xfs_attr_leaf_name_remote * name_rmt ;
struct xfs_mount * mp ;
int tmp ;
int i ;
2005-04-16 15:20:36 -07:00
2012-11-12 22:53:53 +11:00
trace_xfs_attr_leaf_add_work ( args ) ;
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2013-04-24 18:58:55 +10:00
ASSERT ( mapindex > = 0 & & mapindex < XFS_ATTR_LEAF_MAPSIZE ) ;
ASSERT ( args - > index > = 0 & & args - > index < = ichdr - > count ) ;
2005-04-16 15:20:36 -07:00
/*
* Force open some space in the entry array and fill it in .
*/
2013-04-24 18:58:55 +10:00
entry = & xfs_attr3_leaf_entryp ( leaf ) [ args - > index ] ;
if ( args - > index < ichdr - > count ) {
tmp = ichdr - > count - args - > index ;
2005-04-16 15:20:36 -07:00
tmp * = sizeof ( xfs_attr_leaf_entry_t ) ;
2013-04-24 18:58:55 +10:00
memmove ( entry + 1 , entry , tmp ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , entry , tmp + sizeof ( * entry ) ) ) ;
}
2013-04-24 18:58:55 +10:00
ichdr - > count + + ;
2005-04-16 15:20:36 -07:00
/*
* Allocate space for the new string ( at the end of the run ) .
*/
mp = args - > trans - > t_mountp ;
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr - > freemap [ mapindex ] . base < args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ( ichdr - > freemap [ mapindex ] . base & 0x3 ) = = 0 ) ;
ASSERT ( ichdr - > freemap [ mapindex ] . size > =
2014-06-06 15:21:27 +10:00
xfs_attr_leaf_newentsize ( args , NULL ) ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr - > freemap [ mapindex ] . size < args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ( ichdr - > freemap [ mapindex ] . size & 0x3 ) = = 0 ) ;
2014-06-06 15:21:27 +10:00
ichdr - > freemap [ mapindex ] . size - = xfs_attr_leaf_newentsize ( args , & tmp ) ;
2013-04-24 18:58:55 +10:00
entry - > nameidx = cpu_to_be16 ( ichdr - > freemap [ mapindex ] . base +
ichdr - > freemap [ mapindex ] . size ) ;
2006-03-17 17:29:02 +11:00
entry - > hashval = cpu_to_be32 ( args - > hashval ) ;
2020-02-26 17:30:42 -08:00
entry - > flags = args - > attr_filter ;
if ( tmp )
entry - > flags | = XFS_ATTR_LOCAL ;
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but don't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
if ( args - > op_flags & XFS_DA_OP_REPLACE ) {
xfs: fix TOCTOU race involving the new logged xattrs control knob
I found a race involving the larp control knob, aka the debugging knob
that lets developers enable logging of extended attribute updates:
Thread 1 Thread 2
echo 0 > /sys/fs/xfs/debug/larp
setxattr(REPLACE)
xfs_has_larp (returns false)
xfs_attr_set
echo 1 > /sys/fs/xfs/debug/larp
xfs_attr_defer_replace
xfs_attr_init_replace_state
xfs_has_larp (returns true)
xfs_attr_init_remove_state
<oops, wrong DAS state!>
This isn't a particularly severe problem right now because xattr logging
is only enabled when CONFIG_XFS_DEBUG=y, and developers *should* know
what they're doing.
However, the eventual intent is that callers should be able to ask for
the assistance of the log in persisting xattr updates. This capability
might not be required for /all/ callers, which means that dynamic
control must work correctly. Once an xattr update has decided whether
or not to use logged xattrs, it needs to stay in that mode until the end
of the operation regardless of what subsequent parallel operations might
do.
Therefore, it is an error to continue sampling xfs_globals.larp once
xfs_attr_change has made a decision about larp, and it was not correct
for me to have told Allison that ->create_intent functions can sample
the global log incompat feature bitfield to decide to elide a log item.
Instead, create a new op flag for the xfs_da_args structure, and convert
all other callers of xfs_has_larp and xfs_sb_version_haslogxattrs within
the attr update state machine to look for the operations flag.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
2022-06-05 18:51:22 -07:00
if ( ! ( args - > op_flags & XFS_DA_OP_LOGGED ) )
2022-05-09 19:09:10 +10:00
entry - > flags | = XFS_ATTR_INCOMPLETE ;
2005-04-16 15:20:36 -07:00
if ( ( args - > blkno2 = = args - > blkno ) & &
( args - > index2 < = args - > index ) ) {
args - > index2 + + ;
}
}
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , entry , sizeof ( * entry ) ) ) ;
2006-03-17 17:29:02 +11:00
ASSERT ( ( args - > index = = 0 ) | |
( be32_to_cpu ( entry - > hashval ) > = be32_to_cpu ( ( entry - 1 ) - > hashval ) ) ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ( args - > index = = ichdr - > count - 1 ) | |
2006-03-17 17:29:02 +11:00
( be32_to_cpu ( entry - > hashval ) < = be32_to_cpu ( ( entry + 1 ) - > hashval ) ) ) ;
2005-04-16 15:20:36 -07:00
/*
* For " remote " attribute values , simply note that we need to
* allocate space for the " remote " value . We can ' t actually
* allocate the extents in this transaction , and we can ' t decide
* which blocks they should be as we might allocate more blocks
* as part of this transaction ( a split operation for example ) .
*/
if ( entry - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
name_loc - > namelen = args - > namelen ;
2006-03-17 17:29:09 +11:00
name_loc - > valuelen = cpu_to_be16 ( args - > valuelen ) ;
2005-04-16 15:20:36 -07:00
memcpy ( ( char * ) name_loc - > nameval , args - > name , args - > namelen ) ;
memcpy ( ( char * ) & name_loc - > nameval [ args - > namelen ] , args - > value ,
2006-03-17 17:29:09 +11:00
be16_to_cpu ( name_loc - > valuelen ) ) ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
name_rmt - > namelen = args - > namelen ;
memcpy ( ( char * ) name_rmt - > name , args - > name , args - > namelen ) ;
entry - > flags | = XFS_ATTR_INCOMPLETE ;
/* just in case */
name_rmt - > valuelen = 0 ;
name_rmt - > valueblk = 0 ;
args - > rmtblkno = 1 ;
xfs: rework remote attr CRCs
Note: this changes the on-disk remote attribute format. I assert
that this is OK to do as CRCs are marked experimental and the first
kernel it is included in has not yet reached release yet. Further,
the userspace utilities are still evolving and so anyone using this
stuff right now is a developer or tester using volatile filesystems
for testing this feature. Hence changing the format right now to
save longer term pain is the right thing to do.
The fundamental change is to move from a header per extent in the
attribute to a header per filesytem block in the attribute. This
means there are more header blocks and the parsing of the attribute
data is slightly more complex, but it has the advantage that we
always know the size of the attribute on disk based on the length of
the data it contains.
This is where the header-per-extent method has problems. We don't
know the size of the attribute on disk without first knowing how
many extents are used to hold it. And we can't tell from a
mapping lookup, either, because remote attributes can be allocated
contiguously with other attribute blocks and so there is no obvious
way of determining the actual size of the atribute on disk short of
walking and mapping buffers.
The problem with this approach is that if we map a buffer
incorrectly (e.g. we make the last buffer for the attribute data too
long), we then get buffer cache lookup failure when we map it
correctly. i.e. we get a size mismatch on lookup. This is not
necessarily fatal, but it's a cache coherency problem that can lead
to returning the wrong data to userspace or writing the wrong data
to disk. And debug kernels will assert fail if this occurs.
I found lots of niggly little problems trying to fix this issue on a
4k block size filesystem, finally getting it to pass with lots of
fixes. The thing is, 1024 byte filesystems still failed, and it was
getting really complex handling all the corner cases that were
showing up. And there were clearly more that I hadn't found yet.
It is complex, fragile code, and if we don't fix it now, it will be
complex, fragile code forever more.
Hence the simple fix is to add a header to each filesystem block.
This gives us the same relationship between the attribute data
length and the number of blocks on disk as we have without CRCs -
it's a linear mapping and doesn't require us to guess anything. It
is simple to implement, too - the remote block count calculated at
lookup time can be used by the remote attribute set/get/remove code
without modification for both CRC and non-CRC filesystems. The world
becomes sane again.
Because the copy-in and copy-out now need to iterate over each
filesystem block, I moved them into helper functions so we separate
the block mapping and buffer manupulations from the attribute data
and CRC header manipulations. The code becomes much clearer as a
result, and it is a lot easier to understand and debug. It also
appears to be much more robust - once it worked on 4k block size
filesystems, it has worked without failure on 1k block size
filesystems, too.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-05-21 18:02:08 +10:00
args - > rmtblkcnt = xfs_attr3_rmt_blocks ( mp , args - > valuelen ) ;
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
args - > rmtvaluelen = args - > valuelen ;
2005-04-16 15:20:36 -07:00
}
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2013-04-24 18:58:55 +10:00
XFS_DA_LOGRANGE ( leaf , xfs_attr3_leaf_name ( leaf , args - > index ) ,
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_entsize ( leaf , args - > index ) ) ) ;
/*
* Update the control info for this leaf node
*/
2013-04-24 18:58:55 +10:00
if ( be16_to_cpu ( entry - > nameidx ) < ichdr - > firstused )
ichdr - > firstused = be16_to_cpu ( entry - > nameidx ) ;
ASSERT ( ichdr - > firstused > = ichdr - > count * sizeof ( xfs_attr_leaf_entry_t )
+ xfs_attr3_leaf_hdr_size ( leaf ) ) ;
tmp = ( ichdr - > count - 1 ) * sizeof ( xfs_attr_leaf_entry_t )
+ xfs_attr3_leaf_hdr_size ( leaf ) ;
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
if ( ichdr - > freemap [ i ] . base = = tmp ) {
ichdr - > freemap [ i ] . base + = sizeof ( xfs_attr_leaf_entry_t ) ;
2019-11-15 21:15:08 -08:00
ichdr - > freemap [ i ] . size - =
min_t ( uint16_t , ichdr - > freemap [ i ] . size ,
sizeof ( xfs_attr_leaf_entry_t ) ) ;
2005-04-16 15:20:36 -07:00
}
}
2013-04-24 18:58:55 +10:00
ichdr - > usedbytes + = xfs_attr_leaf_entsize ( leaf , args - > index ) ;
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Garbage collect a leaf attribute list block by copying it to a new buffer .
*/
STATIC void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_compact (
2012-11-12 22:53:53 +11:00
struct xfs_da_args * args ,
2013-05-21 18:02:06 +10:00
struct xfs_attr3_icleaf_hdr * ichdr_dst ,
2012-11-12 22:53:53 +11:00
struct xfs_buf * bp )
2005-04-16 15:20:36 -07:00
{
2013-05-21 18:02:06 +10:00
struct xfs_attr_leafblock * leaf_src ;
struct xfs_attr_leafblock * leaf_dst ;
struct xfs_attr3_icleaf_hdr ichdr_src ;
2012-11-12 22:53:53 +11:00
struct xfs_trans * trans = args - > trans ;
char * tmpbuffer ;
trace_xfs_attr_leaf_compact ( args ) ;
2005-04-16 15:20:36 -07:00
2024-01-16 09:59:40 +11:00
tmpbuffer = kmalloc ( args - > geo - > blksize , GFP_KERNEL | __GFP_NOFAIL ) ;
2014-06-06 15:21:45 +10:00
memcpy ( tmpbuffer , bp - > b_addr , args - > geo - > blksize ) ;
memset ( bp - > b_addr , 0 , args - > geo - > blksize ) ;
2013-05-21 18:02:06 +10:00
leaf_src = ( xfs_attr_leafblock_t * ) tmpbuffer ;
leaf_dst = bp - > b_addr ;
2005-04-16 15:20:36 -07:00
/*
2013-05-21 18:02:06 +10:00
* Copy the on - disk header back into the destination buffer to ensure
* all the information in the header that is not part of the incore
* header structure is preserved .
2005-04-16 15:20:36 -07:00
*/
2013-05-21 18:02:06 +10:00
memcpy ( bp - > b_addr , tmpbuffer , xfs_attr3_leaf_hdr_size ( leaf_src ) ) ;
/* Initialise the incore headers */
ichdr_src = * ichdr_dst ; /* struct copy */
2014-06-06 15:21:45 +10:00
ichdr_dst - > firstused = args - > geo - > blksize ;
2013-05-21 18:02:06 +10:00
ichdr_dst - > usedbytes = 0 ;
ichdr_dst - > count = 0 ;
ichdr_dst - > holes = 0 ;
ichdr_dst - > freemap [ 0 ] . base = xfs_attr3_leaf_hdr_size ( leaf_src ) ;
ichdr_dst - > freemap [ 0 ] . size = ichdr_dst - > firstused -
ichdr_dst - > freemap [ 0 ] . base ;
/* write the header back to initialise the underlying buffer */
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( args - > geo , leaf_dst , ichdr_dst ) ;
2005-04-16 15:20:36 -07:00
/*
* Copy all entry ' s in the same ( sorted ) order ,
* but allocate name / value pairs packed and in sequence .
*/
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( args , leaf_src , & ichdr_src , 0 ,
leaf_dst , ichdr_dst , 0 , ichdr_src . count ) ;
2013-04-24 18:58:55 +10:00
/*
* this logs the entire buffer , but the caller must write the header
* back to the buffer when it is finished modifying it .
*/
2014-06-06 15:21:45 +10:00
xfs_trans_log_buf ( trans , bp , 0 , args - > geo - > blksize - 1 ) ;
2005-04-16 15:20:36 -07:00
2024-01-16 09:59:43 +11:00
kfree ( tmpbuffer ) ;
2005-04-16 15:20:36 -07:00
}
2013-04-24 18:58:55 +10:00
/*
* Compare two leaf blocks " order " .
* Return 0 unless leaf2 should go before leaf1 .
*/
static int
xfs_attr3_leaf_order (
struct xfs_buf * leaf1_bp ,
struct xfs_attr3_icleaf_hdr * leaf1hdr ,
struct xfs_buf * leaf2_bp ,
struct xfs_attr3_icleaf_hdr * leaf2hdr )
{
struct xfs_attr_leaf_entry * entries1 ;
struct xfs_attr_leaf_entry * entries2 ;
entries1 = xfs_attr3_leaf_entryp ( leaf1_bp - > b_addr ) ;
entries2 = xfs_attr3_leaf_entryp ( leaf2_bp - > b_addr ) ;
if ( leaf1hdr - > count > 0 & & leaf2hdr - > count > 0 & &
( ( be32_to_cpu ( entries2 [ 0 ] . hashval ) <
be32_to_cpu ( entries1 [ 0 ] . hashval ) ) | |
( be32_to_cpu ( entries2 [ leaf2hdr - > count - 1 ] . hashval ) <
be32_to_cpu ( entries1 [ leaf1hdr - > count - 1 ] . hashval ) ) ) ) {
return 1 ;
}
return 0 ;
}
int
xfs_attr_leaf_order (
struct xfs_buf * leaf1_bp ,
struct xfs_buf * leaf2_bp )
{
struct xfs_attr3_icleaf_hdr ichdr1 ;
struct xfs_attr3_icleaf_hdr ichdr2 ;
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = leaf1_bp - > b_mount ;
2013-04-24 18:58:55 +10:00
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( mp - > m_attr_geo , & ichdr1 , leaf1_bp - > b_addr ) ;
xfs_attr3_leaf_hdr_from_disk ( mp - > m_attr_geo , & ichdr2 , leaf2_bp - > b_addr ) ;
2013-04-24 18:58:55 +10:00
return xfs_attr3_leaf_order ( leaf1_bp , & ichdr1 , leaf2_bp , & ichdr2 ) ;
}
2005-04-16 15:20:36 -07:00
/*
* Redistribute the attribute list entries between two leaf nodes ,
* taking into account the size of the new entry .
*
* NOTE : if new block is empty , then it will get the upper half of the
* old block . At present , all ( one ) callers pass in an empty second block .
*
* This code adjusts the args - > index / blkno and args - > index2 / blkno2 fields
* to match what it is doing in splitting the attribute leaf block . Those
* values are used in " atomic rename " operations on attributes . Note that
* the " new " and " old " values can end up in different blocks .
*/
STATIC void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_rebalance (
struct xfs_da_state * state ,
struct xfs_da_state_blk * blk1 ,
struct xfs_da_state_blk * blk2 )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_da_args * args ;
struct xfs_attr_leafblock * leaf1 ;
struct xfs_attr_leafblock * leaf2 ;
struct xfs_attr3_icleaf_hdr ichdr1 ;
struct xfs_attr3_icleaf_hdr ichdr2 ;
struct xfs_attr_leaf_entry * entries1 ;
struct xfs_attr_leaf_entry * entries2 ;
int count ;
int totallen ;
int max ;
int space ;
int swap ;
2005-04-16 15:20:36 -07:00
/*
* Set up environment .
*/
ASSERT ( blk1 - > magic = = XFS_ATTR_LEAF_MAGIC ) ;
ASSERT ( blk2 - > magic = = XFS_ATTR_LEAF_MAGIC ) ;
2012-06-22 18:50:14 +10:00
leaf1 = blk1 - > bp - > b_addr ;
leaf2 = blk2 - > bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & ichdr1 , leaf1 ) ;
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & ichdr2 , leaf2 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ichdr2 . count = = 0 ) ;
2005-04-16 15:20:36 -07:00
args = state - > args ;
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_rebalance ( args ) ;
2005-04-16 15:20:36 -07:00
/*
* Check ordering of blocks , reverse if it makes things simpler .
*
* NOTE : Given that all ( current ) callers pass in an empty
* second block , this code should never set " swap " .
*/
swap = 0 ;
2013-04-24 18:58:55 +10:00
if ( xfs_attr3_leaf_order ( blk1 - > bp , & ichdr1 , blk2 - > bp , & ichdr2 ) ) {
2018-07-11 22:26:38 -07:00
swap ( blk1 , blk2 ) ;
2013-04-24 18:58:55 +10:00
2018-07-11 22:26:38 -07:00
/* swap structures rather than reconverting them */
swap ( ichdr1 , ichdr2 ) ;
2013-04-24 18:58:55 +10:00
2012-06-22 18:50:14 +10:00
leaf1 = blk1 - > bp - > b_addr ;
leaf2 = blk2 - > bp - > b_addr ;
2005-04-16 15:20:36 -07:00
swap = 1 ;
}
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum . Then get
* the direction to copy and the number of elements to move .
*
* " inleaf " is true if the new entry should be inserted into blk1 .
* If " swap " is also true , then reverse the sense of " inleaf " .
*/
2013-04-24 18:58:55 +10:00
state - > inleaf = xfs_attr3_leaf_figure_balance ( state , blk1 , & ichdr1 ,
blk2 , & ichdr2 ,
& count , & totallen ) ;
2005-04-16 15:20:36 -07:00
if ( swap )
state - > inleaf = ! state - > inleaf ;
/*
* Move any entries required from leaf to leaf :
*/
2013-04-24 18:58:55 +10:00
if ( count < ichdr1 . count ) {
2005-04-16 15:20:36 -07:00
/*
* Figure the total bytes to be added to the destination leaf .
*/
/* number entries being moved */
2013-04-24 18:58:55 +10:00
count = ichdr1 . count - count ;
space = ichdr1 . usedbytes - totallen ;
2005-04-16 15:20:36 -07:00
space + = count * sizeof ( xfs_attr_leaf_entry_t ) ;
/*
* leaf2 is the destination , compact it if it looks tight .
*/
2013-04-24 18:58:55 +10:00
max = ichdr2 . firstused - xfs_attr3_leaf_hdr_size ( leaf1 ) ;
max - = ichdr2 . count * sizeof ( xfs_attr_leaf_entry_t ) ;
2012-11-12 22:53:53 +11:00
if ( space > max )
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_compact ( args , & ichdr2 , blk2 - > bp ) ;
2005-04-16 15:20:36 -07:00
/*
* Move high entries from leaf1 to low end of leaf2 .
*/
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( args , leaf1 , & ichdr1 ,
ichdr1 . count - count , leaf2 , & ichdr2 , 0 , count ) ;
2005-04-16 15:20:36 -07:00
2013-04-24 18:58:55 +10:00
} else if ( count > ichdr1 . count ) {
2005-04-16 15:20:36 -07:00
/*
* I assert that since all callers pass in an empty
* second buffer , this code should never execute .
*/
xfs: fix attr tree double split corruption
In certain circumstances, a double split of an attribute tree is
needed to insert or replace an attribute. In rare situations, this
can go wrong, leaving the attribute tree corrupted. In this case,
the attr being replaced is the last attr in a leaf node, and the
replacement is larger so doesn't fit in the same leaf node.
When we have the initial condition of a node format attribute
btree with two leaves at index 1 and 2. Call them L1 and L2. The
leaf L1 is completely full, there is not a single byte of free space
in it. L2 is mostly empty. The attribute being replaced - call it X
- is the last attribute in L1.
The way an attribute replace is executed is that the replacement
attribute - call it Y - is first inserted into the tree, but has an
INCOMPLETE flag set on it so that list traversals ignore it. Once
this transaction is committed, a second transaction it run to
atomically mark Y as COMPLETE and X as INCOMPLETE, so that a
traversal will now find Y and skip X. Once that transaction is
committed, attribute X is then removed.
So, the initial condition is:
+--------+ +--------+
| L1 | | L2 |
| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |
| fsp: 0 | | fsp: N |
|--------| |--------|
| attr A | | attr 1 |
|--------| |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr X | | attr n |
+--------+ +--------+
So now we go to replace X, and see that L1:fsp = 0 - it is full so
we can't insert Y in the same leaf. So we record the the location of
attribute X so we can track it for later use, then we split L1 into
L1 and L3 and reblance across the two leafs. We end with:
+--------+ +--------+ +--------+
| L1 | | L3 | | L2 |
| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: N |
|--------| |--------| |--------|
| attr A | | attr X | | attr 1 |
|--------| +--------+ |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And we track that the original attribute is now at L3:0.
We then try to insert Y into L1 again, and find that there isn't
enough room because the new attribute is larger than the old one.
Hence we have to split again to make room for Y. We end up with
this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| + INCOMP + +--------+ |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And now we have the new (incomplete) attribute @ L4:0, and the
original attribute at L3:0. At this point, the first transaction is
committed, and we move to the flipping of the flags.
This is where we are supposed to end up with this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| +--------+ + INCOMP + |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
But that doesn't happen properly - the attribute tracking indexes
are not pointing to the right locations. What we end up with is both
the old attribute to be removed pointing at L4:0 and the new
attribute at L4:1. On a debug kernel, this assert fails like so:
XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725
because the new attribute location does not exist. On a production
kernel, this goes unnoticed and the code proceeds ahead merrily and
removes L4 because it thinks that is the block that is no longer
needed. This leaves the hash index node pointing to entries
L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the
leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free
space, and so everything is busted. This corruption is caused by the
removal of the old attribute triggering a join - it joins everything
correctly but then frees the wrong block.
xfs_repair will report something like:
bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0
The problem lies in the assignment of the old/new blocks for
tracking purposes when the double leaf split occurs. The first split
tries to place the new attribute inside the current leaf (i.e.
"inleaf == true") and moves the old attribute (X) to the new block.
This sets up the old block/index to L1:X, and newly allocated
block to L3:0. It then moves attr X to the new block and tries to
insert attr Y at the old index. That fails, so it splits again.
With the second split, the rebalance ends up placing the new attr in
the second new block - L4:0 - and this is where the code goes wrong.
What is does is it sets both the new and old block index to the
second new block. Hence it inserts attr Y at the right place (L4:0)
but overwrites the current location of the attr to replace that is
held in the new block index (currently L3:0). It over writes it with
L4:1 - the index we later assert fail on.
Hopefully this table will show this in a foramt that is a bit easier
to understand:
Split old attr index new attr index
vanilla patched vanilla patched
before 1st L1:26 L1:26 N/A N/A
after 1st L3:0 L3:0 L1:26 L1:26
after 2nd L4:0 L3:0 L4:1 L4:0
^^^^ ^^^^
wrong wrong
The fix is surprisingly simple, for all this analysis - just stop
the rebalance on the out-of leaf case from overwriting the new attr
index - it's already correct for the double split case.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-11-12 22:09:44 +11:00
ASSERT ( 0 ) ;
2005-04-16 15:20:36 -07:00
/*
* Figure the total bytes to be added to the destination leaf .
*/
/* number entries being moved */
2013-04-24 18:58:55 +10:00
count - = ichdr1 . count ;
space = totallen - ichdr1 . usedbytes ;
2005-04-16 15:20:36 -07:00
space + = count * sizeof ( xfs_attr_leaf_entry_t ) ;
/*
* leaf1 is the destination , compact it if it looks tight .
*/
2013-04-24 18:58:55 +10:00
max = ichdr1 . firstused - xfs_attr3_leaf_hdr_size ( leaf1 ) ;
max - = ichdr1 . count * sizeof ( xfs_attr_leaf_entry_t ) ;
2012-11-12 22:53:53 +11:00
if ( space > max )
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_compact ( args , & ichdr1 , blk1 - > bp ) ;
2005-04-16 15:20:36 -07:00
/*
* Move low entries from leaf2 to high end of leaf1 .
*/
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( args , leaf2 , & ichdr2 , 0 , leaf1 , & ichdr1 ,
ichdr1 . count , count ) ;
2005-04-16 15:20:36 -07:00
}
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( state - > args - > geo , leaf1 , & ichdr1 ) ;
xfs_attr3_leaf_hdr_to_disk ( state - > args - > geo , leaf2 , & ichdr2 ) ;
2014-06-06 15:22:04 +10:00
xfs_trans_log_buf ( args - > trans , blk1 - > bp , 0 , args - > geo - > blksize - 1 ) ;
xfs_trans_log_buf ( args - > trans , blk2 - > bp , 0 , args - > geo - > blksize - 1 ) ;
2013-04-24 18:58:55 +10:00
2005-04-16 15:20:36 -07:00
/*
* Copy out last hashval in each block for B - tree code .
*/
2013-04-24 18:58:55 +10:00
entries1 = xfs_attr3_leaf_entryp ( leaf1 ) ;
entries2 = xfs_attr3_leaf_entryp ( leaf2 ) ;
blk1 - > hashval = be32_to_cpu ( entries1 [ ichdr1 . count - 1 ] . hashval ) ;
blk2 - > hashval = be32_to_cpu ( entries2 [ ichdr2 . count - 1 ] . hashval ) ;
2005-04-16 15:20:36 -07:00
/*
* Adjust the expected index for insertion .
* NOTE : this code depends on the ( current ) situation that the
* second block was originally empty .
*
* If the insertion point moved to the 2 nd block , we must adjust
* the index . We must also track the entry just following the
* new entry for use in an " atomic rename " operation , that entry
* is always the " old " entry and the " new " entry is what we are
* inserting . The index / blkno fields refer to the " old " entry ,
* while the index2 / blkno2 fields refer to the " new " entry .
*/
2013-04-24 18:58:55 +10:00
if ( blk1 - > index > ichdr1 . count ) {
2005-04-16 15:20:36 -07:00
ASSERT ( state - > inleaf = = 0 ) ;
2013-04-24 18:58:55 +10:00
blk2 - > index = blk1 - > index - ichdr1 . count ;
2005-04-16 15:20:36 -07:00
args - > index = args - > index2 = blk2 - > index ;
args - > blkno = args - > blkno2 = blk2 - > blkno ;
2013-04-24 18:58:55 +10:00
} else if ( blk1 - > index = = ichdr1 . count ) {
2005-04-16 15:20:36 -07:00
if ( state - > inleaf ) {
args - > index = blk1 - > index ;
args - > blkno = blk1 - > blkno ;
args - > index2 = 0 ;
args - > blkno2 = blk2 - > blkno ;
} else {
xfs: fix attr tree double split corruption
In certain circumstances, a double split of an attribute tree is
needed to insert or replace an attribute. In rare situations, this
can go wrong, leaving the attribute tree corrupted. In this case,
the attr being replaced is the last attr in a leaf node, and the
replacement is larger so doesn't fit in the same leaf node.
When we have the initial condition of a node format attribute
btree with two leaves at index 1 and 2. Call them L1 and L2. The
leaf L1 is completely full, there is not a single byte of free space
in it. L2 is mostly empty. The attribute being replaced - call it X
- is the last attribute in L1.
The way an attribute replace is executed is that the replacement
attribute - call it Y - is first inserted into the tree, but has an
INCOMPLETE flag set on it so that list traversals ignore it. Once
this transaction is committed, a second transaction it run to
atomically mark Y as COMPLETE and X as INCOMPLETE, so that a
traversal will now find Y and skip X. Once that transaction is
committed, attribute X is then removed.
So, the initial condition is:
+--------+ +--------+
| L1 | | L2 |
| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |
| fsp: 0 | | fsp: N |
|--------| |--------|
| attr A | | attr 1 |
|--------| |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr X | | attr n |
+--------+ +--------+
So now we go to replace X, and see that L1:fsp = 0 - it is full so
we can't insert Y in the same leaf. So we record the the location of
attribute X so we can track it for later use, then we split L1 into
L1 and L3 and reblance across the two leafs. We end with:
+--------+ +--------+ +--------+
| L1 | | L3 | | L2 |
| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: N |
|--------| |--------| |--------|
| attr A | | attr X | | attr 1 |
|--------| +--------+ |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And we track that the original attribute is now at L3:0.
We then try to insert Y into L1 again, and find that there isn't
enough room because the new attribute is larger than the old one.
Hence we have to split again to make room for Y. We end up with
this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| + INCOMP + +--------+ |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And now we have the new (incomplete) attribute @ L4:0, and the
original attribute at L3:0. At this point, the first transaction is
committed, and we move to the flipping of the flags.
This is where we are supposed to end up with this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| +--------+ + INCOMP + |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
But that doesn't happen properly - the attribute tracking indexes
are not pointing to the right locations. What we end up with is both
the old attribute to be removed pointing at L4:0 and the new
attribute at L4:1. On a debug kernel, this assert fails like so:
XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725
because the new attribute location does not exist. On a production
kernel, this goes unnoticed and the code proceeds ahead merrily and
removes L4 because it thinks that is the block that is no longer
needed. This leaves the hash index node pointing to entries
L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the
leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free
space, and so everything is busted. This corruption is caused by the
removal of the old attribute triggering a join - it joins everything
correctly but then frees the wrong block.
xfs_repair will report something like:
bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0
The problem lies in the assignment of the old/new blocks for
tracking purposes when the double leaf split occurs. The first split
tries to place the new attribute inside the current leaf (i.e.
"inleaf == true") and moves the old attribute (X) to the new block.
This sets up the old block/index to L1:X, and newly allocated
block to L3:0. It then moves attr X to the new block and tries to
insert attr Y at the old index. That fails, so it splits again.
With the second split, the rebalance ends up placing the new attr in
the second new block - L4:0 - and this is where the code goes wrong.
What is does is it sets both the new and old block index to the
second new block. Hence it inserts attr Y at the right place (L4:0)
but overwrites the current location of the attr to replace that is
held in the new block index (currently L3:0). It over writes it with
L4:1 - the index we later assert fail on.
Hopefully this table will show this in a foramt that is a bit easier
to understand:
Split old attr index new attr index
vanilla patched vanilla patched
before 1st L1:26 L1:26 N/A N/A
after 1st L3:0 L3:0 L1:26 L1:26
after 2nd L4:0 L3:0 L4:1 L4:0
^^^^ ^^^^
wrong wrong
The fix is surprisingly simple, for all this analysis - just stop
the rebalance on the out-of leaf case from overwriting the new attr
index - it's already correct for the double split case.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-11-12 22:09:44 +11:00
/*
* On a double leaf split , the original attr location
* is already stored in blkno2 / index2 , so don ' t
* overwrite it overwise we corrupt the tree .
*/
2013-04-24 18:58:55 +10:00
blk2 - > index = blk1 - > index - ichdr1 . count ;
xfs: fix attr tree double split corruption
In certain circumstances, a double split of an attribute tree is
needed to insert or replace an attribute. In rare situations, this
can go wrong, leaving the attribute tree corrupted. In this case,
the attr being replaced is the last attr in a leaf node, and the
replacement is larger so doesn't fit in the same leaf node.
When we have the initial condition of a node format attribute
btree with two leaves at index 1 and 2. Call them L1 and L2. The
leaf L1 is completely full, there is not a single byte of free space
in it. L2 is mostly empty. The attribute being replaced - call it X
- is the last attribute in L1.
The way an attribute replace is executed is that the replacement
attribute - call it Y - is first inserted into the tree, but has an
INCOMPLETE flag set on it so that list traversals ignore it. Once
this transaction is committed, a second transaction it run to
atomically mark Y as COMPLETE and X as INCOMPLETE, so that a
traversal will now find Y and skip X. Once that transaction is
committed, attribute X is then removed.
So, the initial condition is:
+--------+ +--------+
| L1 | | L2 |
| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |
| fsp: 0 | | fsp: N |
|--------| |--------|
| attr A | | attr 1 |
|--------| |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr X | | attr n |
+--------+ +--------+
So now we go to replace X, and see that L1:fsp = 0 - it is full so
we can't insert Y in the same leaf. So we record the the location of
attribute X so we can track it for later use, then we split L1 into
L1 and L3 and reblance across the two leafs. We end with:
+--------+ +--------+ +--------+
| L1 | | L3 | | L2 |
| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: N |
|--------| |--------| |--------|
| attr A | | attr X | | attr 1 |
|--------| +--------+ |--------|
| attr B | | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And we track that the original attribute is now at L3:0.
We then try to insert Y into L1 again, and find that there isn't
enough room because the new attribute is larger than the old one.
Hence we have to split again to make room for Y. We end up with
this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| + INCOMP + +--------+ |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
And now we have the new (incomplete) attribute @ L4:0, and the
original attribute at L3:0. At this point, the first transaction is
committed, and we move to the flipping of the flags.
This is where we are supposed to end up with this:
+--------+ +--------+ +--------+ +--------+
| L1 | | L4 | | L3 | | L2 |
| fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
| bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
| fsp: M | | fsp: J | | fsp: J | | fsp: N |
|--------| |--------| |--------| |--------|
| attr A | | attr Y | | attr X | | attr 1 |
|--------| +--------+ + INCOMP + |--------|
| attr B | +--------+ | attr 2 |
|--------| |--------|
.......... ..........
|--------| |--------|
| attr W | | attr n |
+--------+ +--------+
But that doesn't happen properly - the attribute tracking indexes
are not pointing to the right locations. What we end up with is both
the old attribute to be removed pointing at L4:0 and the new
attribute at L4:1. On a debug kernel, this assert fails like so:
XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725
because the new attribute location does not exist. On a production
kernel, this goes unnoticed and the code proceeds ahead merrily and
removes L4 because it thinks that is the block that is no longer
needed. This leaves the hash index node pointing to entries
L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the
leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free
space, and so everything is busted. This corruption is caused by the
removal of the old attribute triggering a join - it joins everything
correctly but then frees the wrong block.
xfs_repair will report something like:
bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0
The problem lies in the assignment of the old/new blocks for
tracking purposes when the double leaf split occurs. The first split
tries to place the new attribute inside the current leaf (i.e.
"inleaf == true") and moves the old attribute (X) to the new block.
This sets up the old block/index to L1:X, and newly allocated
block to L3:0. It then moves attr X to the new block and tries to
insert attr Y at the old index. That fails, so it splits again.
With the second split, the rebalance ends up placing the new attr in
the second new block - L4:0 - and this is where the code goes wrong.
What is does is it sets both the new and old block index to the
second new block. Hence it inserts attr Y at the right place (L4:0)
but overwrites the current location of the attr to replace that is
held in the new block index (currently L3:0). It over writes it with
L4:1 - the index we later assert fail on.
Hopefully this table will show this in a foramt that is a bit easier
to understand:
Split old attr index new attr index
vanilla patched vanilla patched
before 1st L1:26 L1:26 N/A N/A
after 1st L3:0 L3:0 L1:26 L1:26
after 2nd L4:0 L3:0 L4:1 L4:0
^^^^ ^^^^
wrong wrong
The fix is surprisingly simple, for all this analysis - just stop
the rebalance on the out-of leaf case from overwriting the new attr
index - it's already correct for the double split case.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-11-12 22:09:44 +11:00
args - > index = blk2 - > index ;
args - > blkno = blk2 - > blkno ;
if ( ! state - > extravalid ) {
/*
* set the new attr location to match the old
* one and let the higher level split code
* decide where in the leaf to place it .
*/
args - > index2 = blk2 - > index ;
args - > blkno2 = blk2 - > blkno ;
}
2005-04-16 15:20:36 -07:00
}
} else {
ASSERT ( state - > inleaf = = 1 ) ;
args - > index = args - > index2 = blk1 - > index ;
args - > blkno = args - > blkno2 = blk1 - > blkno ;
}
}
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum .
* GROT : Is this really necessary ? With other than a 512 byte blocksize ,
* GROT : there will always be enough room in either block for a new entry .
* GROT : Do a double - split for this case ?
*/
STATIC int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_figure_balance (
struct xfs_da_state * state ,
struct xfs_da_state_blk * blk1 ,
struct xfs_attr3_icleaf_hdr * ichdr1 ,
struct xfs_da_state_blk * blk2 ,
struct xfs_attr3_icleaf_hdr * ichdr2 ,
int * countarg ,
int * usedbytesarg )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf1 = blk1 - > bp - > b_addr ;
struct xfs_attr_leafblock * leaf2 = blk2 - > bp - > b_addr ;
struct xfs_attr_leaf_entry * entry ;
int count ;
int max ;
int index ;
int totallen = 0 ;
int half ;
int lastdelta ;
int foundit = 0 ;
int tmp ;
2005-04-16 15:20:36 -07:00
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum .
*/
2013-04-24 18:58:55 +10:00
max = ichdr1 - > count + ichdr2 - > count ;
half = ( max + 1 ) * sizeof ( * entry ) ;
half + = ichdr1 - > usedbytes + ichdr2 - > usedbytes +
2014-06-06 15:21:27 +10:00
xfs_attr_leaf_newentsize ( state - > args , NULL ) ;
2005-04-16 15:20:36 -07:00
half / = 2 ;
2014-06-06 15:22:04 +10:00
lastdelta = state - > args - > geo - > blksize ;
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( leaf1 ) ;
2005-04-16 15:20:36 -07:00
for ( count = index = 0 ; count < max ; entry + + , index + + , count + + ) {
# define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
/*
* The new entry is in the first block , account for it .
*/
if ( count = = blk1 - > index ) {
tmp = totallen + sizeof ( * entry ) +
2014-06-06 15:21:27 +10:00
xfs_attr_leaf_newentsize ( state - > args , NULL ) ;
2005-04-16 15:20:36 -07:00
if ( XFS_ATTR_ABS ( half - tmp ) > lastdelta )
break ;
lastdelta = XFS_ATTR_ABS ( half - tmp ) ;
totallen = tmp ;
foundit = 1 ;
}
/*
* Wrap around into the second block if necessary .
*/
2013-04-24 18:58:55 +10:00
if ( count = = ichdr1 - > count ) {
2005-04-16 15:20:36 -07:00
leaf1 = leaf2 ;
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( leaf1 ) ;
2005-04-16 15:20:36 -07:00
index = 0 ;
}
/*
* Figure out if next leaf entry would be too much .
*/
tmp = totallen + sizeof ( * entry ) + xfs_attr_leaf_entsize ( leaf1 ,
index ) ;
if ( XFS_ATTR_ABS ( half - tmp ) > lastdelta )
break ;
lastdelta = XFS_ATTR_ABS ( half - tmp ) ;
totallen = tmp ;
# undef XFS_ATTR_ABS
}
/*
* Calculate the number of usedbytes that will end up in lower block .
* If new entry not in lower block , fix up the count .
*/
totallen - = count * sizeof ( * entry ) ;
if ( foundit ) {
totallen - = sizeof ( * entry ) +
2014-06-06 15:21:27 +10:00
xfs_attr_leaf_newentsize ( state - > args , NULL ) ;
2005-04-16 15:20:36 -07:00
}
* countarg = count ;
* usedbytesarg = totallen ;
2013-04-24 18:58:55 +10:00
return foundit ;
2005-04-16 15:20:36 -07:00
}
/*========================================================================
* Routines used for shrinking the Btree .
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
/*
* Check a leaf block and its neighbors to see if the block should be
* collapsed into one or the other neighbor . Always keep the block
* with the smaller block number .
* If the current block is over 50 % full , don ' t try to join it , return 0.
* If the block is empty , fill in the state structure and return 2.
* If it can be collapsed , fill in the state structure and return 1.
* If nothing can be done , return 0.
*
* GROT : allow for INCOMPLETE entries in calculation .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_toosmall (
struct xfs_da_state * state ,
int * action )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_da_state_blk * blk ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_buf * bp ;
xfs_dablk_t blkno ;
int bytes ;
int forward ;
int error ;
int retval ;
int i ;
2005-04-16 15:20:36 -07:00
2012-11-12 22:53:53 +11:00
trace_xfs_attr_leaf_toosmall ( state - > args ) ;
2005-04-16 15:20:36 -07:00
/*
* Check for the degenerate case of the block being over 50 % full .
* If so , it ' s not worth even looking to see if we might be able
* to coalesce with a sibling .
*/
blk = & state - > path . blk [ state - > path . active - 1 ] ;
2013-04-24 18:58:55 +10:00
leaf = blk - > bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
bytes = xfs_attr3_leaf_hdr_size ( leaf ) +
ichdr . count * sizeof ( xfs_attr_leaf_entry_t ) +
ichdr . usedbytes ;
2014-06-06 15:22:04 +10:00
if ( bytes > ( state - > args - > geo - > blksize > > 1 ) ) {
2005-04-16 15:20:36 -07:00
* action = 0 ; /* blk over 50%, don't try to join */
2014-06-22 15:03:54 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Check for the degenerate case of the block being empty .
* If the block is empty , we ' ll simply delete it , no need to
2006-03-29 08:55:14 +10:00
* coalesce it with a sibling block . We choose ( arbitrarily )
2005-04-16 15:20:36 -07:00
* to merge with the forward block unless it is NULL .
*/
2013-04-24 18:58:55 +10:00
if ( ichdr . count = = 0 ) {
2005-04-16 15:20:36 -07:00
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop ( this one ) .
*/
2013-04-24 18:58:55 +10:00
forward = ( ichdr . forw ! = 0 ) ;
2005-04-16 15:20:36 -07:00
memcpy ( & state - > altpath , & state - > path , sizeof ( state - > path ) ) ;
2013-04-24 18:58:02 +10:00
error = xfs_da3_path_shift ( state , & state - > altpath , forward ,
2005-04-16 15:20:36 -07:00
0 , & retval ) ;
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
if ( retval ) {
* action = 0 ;
} else {
* action = 2 ;
}
2013-04-24 18:58:55 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Examine each sibling block to see if we can coalesce with
* at least 25 % free space to spare . We need to figure out
* whether to merge with the forward or the backward block .
* We prefer coalescing with the lower numbered sibling so as
* to shrink an attribute list over time .
*/
/* start with smaller blk num */
2013-04-24 18:58:55 +10:00
forward = ichdr . forw < ichdr . back ;
2005-04-16 15:20:36 -07:00
for ( i = 0 ; i < 2 ; forward = ! forward , i + + ) {
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr ichdr2 ;
2005-04-16 15:20:36 -07:00
if ( forward )
2013-04-24 18:58:55 +10:00
blkno = ichdr . forw ;
2005-04-16 15:20:36 -07:00
else
2013-04-24 18:58:55 +10:00
blkno = ichdr . back ;
2005-04-16 15:20:36 -07:00
if ( blkno = = 0 )
continue ;
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_read ( state - > args - > trans , state - > args - > dp ,
2019-11-20 09:46:02 -08:00
blkno , & bp ) ;
2005-04-16 15:20:36 -07:00
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & ichdr2 , bp - > b_addr ) ;
2013-04-24 18:58:55 +10:00
2014-06-06 15:22:04 +10:00
bytes = state - > args - > geo - > blksize -
( state - > args - > geo - > blksize > > 2 ) -
2013-04-24 18:58:55 +10:00
ichdr . usedbytes - ichdr2 . usedbytes -
( ( ichdr . count + ichdr2 . count ) *
sizeof ( xfs_attr_leaf_entry_t ) ) -
xfs_attr3_leaf_hdr_size ( leaf ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_brelse ( state - > args - > trans , bp ) ;
2005-04-16 15:20:36 -07:00
if ( bytes > = 0 )
break ; /* fits with at least 25% to spare */
}
if ( i > = 2 ) {
* action = 0 ;
2014-06-22 15:03:54 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Make altpath point to the block we want to keep ( the lower
* numbered block ) and path point to the block we want to drop .
*/
memcpy ( & state - > altpath , & state - > path , sizeof ( state - > path ) ) ;
if ( blkno < blk - > blkno ) {
2013-04-24 18:58:02 +10:00
error = xfs_da3_path_shift ( state , & state - > altpath , forward ,
2005-04-16 15:20:36 -07:00
0 , & retval ) ;
} else {
2013-04-24 18:58:02 +10:00
error = xfs_da3_path_shift ( state , & state - > path , forward ,
2005-04-16 15:20:36 -07:00
0 , & retval ) ;
}
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
if ( retval ) {
* action = 0 ;
} else {
* action = 1 ;
}
2014-06-22 15:03:54 +10:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Remove a name from the leaf attribute list structure .
*
* Return 1 if leaf is less than 37 % full , 0 if > = 37 % full .
* If two leaves are 37 % full , when combined they will leave 25 % free .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_remove (
struct xfs_buf * bp ,
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_attr_leaf_entry * entry ;
int before ;
int after ;
int smallest ;
int entsize ;
int tablesize ;
int tmp ;
int i ;
2005-04-16 15:20:36 -07:00
2012-11-12 22:53:53 +11:00
trace_xfs_attr_leaf_remove ( args ) ;
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr . count > 0 & & ichdr . count < args - > geo - > blksize / 8 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index > = 0 & & args - > index < ichdr . count ) ;
ASSERT ( ichdr . firstused > = ichdr . count * sizeof ( * entry ) +
xfs_attr3_leaf_hdr_size ( leaf ) ) ;
entry = & xfs_attr3_leaf_entryp ( leaf ) [ args - > index ] ;
ASSERT ( be16_to_cpu ( entry - > nameidx ) > = ichdr . firstused ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( be16_to_cpu ( entry - > nameidx ) < args - > geo - > blksize ) ;
2005-04-16 15:20:36 -07:00
/*
* Scan through free region table :
* check for adjacency of free ' d entry with an existing one ,
* find smallest free region in case we need to replace it ,
* adjust any map that borders the entry table ,
*/
2013-04-24 18:58:55 +10:00
tablesize = ichdr . count * sizeof ( xfs_attr_leaf_entry_t )
+ xfs_attr3_leaf_hdr_size ( leaf ) ;
tmp = ichdr . freemap [ 0 ] . size ;
2005-04-16 15:20:36 -07:00
before = after = - 1 ;
smallest = XFS_ATTR_LEAF_MAPSIZE - 1 ;
entsize = xfs_attr_leaf_entsize ( leaf , args - > index ) ;
2013-04-24 18:58:55 +10:00
for ( i = 0 ; i < XFS_ATTR_LEAF_MAPSIZE ; i + + ) {
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr . freemap [ i ] . base < args - > geo - > blksize ) ;
ASSERT ( ichdr . freemap [ i ] . size < args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ i ] . base = = tablesize ) {
ichdr . freemap [ i ] . base - = sizeof ( xfs_attr_leaf_entry_t ) ;
ichdr . freemap [ i ] . size + = sizeof ( xfs_attr_leaf_entry_t ) ;
2005-04-16 15:20:36 -07:00
}
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ i ] . base + ichdr . freemap [ i ] . size = =
be16_to_cpu ( entry - > nameidx ) ) {
2005-04-16 15:20:36 -07:00
before = i ;
2013-04-24 18:58:55 +10:00
} else if ( ichdr . freemap [ i ] . base = =
( be16_to_cpu ( entry - > nameidx ) + entsize ) ) {
2005-04-16 15:20:36 -07:00
after = i ;
2013-04-24 18:58:55 +10:00
} else if ( ichdr . freemap [ i ] . size < tmp ) {
tmp = ichdr . freemap [ i ] . size ;
2005-04-16 15:20:36 -07:00
smallest = i ;
}
}
/*
* Coalesce adjacent freemap regions ,
* or replace the smallest region .
*/
if ( ( before > = 0 ) | | ( after > = 0 ) ) {
if ( ( before > = 0 ) & & ( after > = 0 ) ) {
2013-04-24 18:58:55 +10:00
ichdr . freemap [ before ] . size + = entsize ;
ichdr . freemap [ before ] . size + = ichdr . freemap [ after ] . size ;
ichdr . freemap [ after ] . base = 0 ;
ichdr . freemap [ after ] . size = 0 ;
2005-04-16 15:20:36 -07:00
} else if ( before > = 0 ) {
2013-04-24 18:58:55 +10:00
ichdr . freemap [ before ] . size + = entsize ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-24 18:58:55 +10:00
ichdr . freemap [ after ] . base = be16_to_cpu ( entry - > nameidx ) ;
ichdr . freemap [ after ] . size + = entsize ;
2005-04-16 15:20:36 -07:00
}
} else {
/*
* Replace smallest region ( if it is smaller than free ' d entry )
*/
2013-04-24 18:58:55 +10:00
if ( ichdr . freemap [ smallest ] . size < entsize ) {
ichdr . freemap [ smallest ] . base = be16_to_cpu ( entry - > nameidx ) ;
ichdr . freemap [ smallest ] . size = entsize ;
2005-04-16 15:20:36 -07:00
}
}
/*
* Did we remove the first entry ?
*/
2013-04-24 18:58:55 +10:00
if ( be16_to_cpu ( entry - > nameidx ) = = ichdr . firstused )
2005-04-16 15:20:36 -07:00
smallest = 1 ;
else
smallest = 0 ;
/*
* Compress the remaining entries and zero out the removed stuff .
*/
2013-04-24 18:58:55 +10:00
memset ( xfs_attr3_leaf_name ( leaf , args - > index ) , 0 , entsize ) ;
ichdr . usedbytes - = entsize ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2013-04-24 18:58:55 +10:00
XFS_DA_LOGRANGE ( leaf , xfs_attr3_leaf_name ( leaf , args - > index ) ,
2005-04-16 15:20:36 -07:00
entsize ) ) ;
2013-04-24 18:58:55 +10:00
tmp = ( ichdr . count - args - > index ) * sizeof ( xfs_attr_leaf_entry_t ) ;
memmove ( entry , entry + 1 , tmp ) ;
ichdr . count - - ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2013-04-24 18:58:55 +10:00
XFS_DA_LOGRANGE ( leaf , entry , tmp + sizeof ( xfs_attr_leaf_entry_t ) ) ) ;
entry = & xfs_attr3_leaf_entryp ( leaf ) [ ichdr . count ] ;
memset ( entry , 0 , sizeof ( xfs_attr_leaf_entry_t ) ) ;
2005-04-16 15:20:36 -07:00
/*
* If we removed the first entry , re - find the first used byte
* in the name area . Note that if the entry was the " firstused " ,
* then we don ' t have a " hole " in our block resulting from
* removing the name .
*/
if ( smallest ) {
2014-06-06 15:21:45 +10:00
tmp = args - > geo - > blksize ;
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( leaf ) ;
for ( i = ichdr . count - 1 ; i > = 0 ; entry + + , i - - ) {
ASSERT ( be16_to_cpu ( entry - > nameidx ) > = ichdr . firstused ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( be16_to_cpu ( entry - > nameidx ) < args - > geo - > blksize ) ;
2006-03-17 17:29:02 +11:00
if ( be16_to_cpu ( entry - > nameidx ) < tmp )
tmp = be16_to_cpu ( entry - > nameidx ) ;
2005-04-16 15:20:36 -07:00
}
2013-04-24 18:58:55 +10:00
ichdr . firstused = tmp ;
2015-04-13 11:27:59 +10:00
ASSERT ( ichdr . firstused ! = 0 ) ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-24 18:58:55 +10:00
ichdr . holes = 1 ; /* mark as needing compaction */
2005-04-16 15:20:36 -07:00
}
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( args - > geo , leaf , & ichdr ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2013-04-24 18:58:55 +10:00
XFS_DA_LOGRANGE ( leaf , & leaf - > hdr ,
xfs_attr3_leaf_hdr_size ( leaf ) ) ) ;
2005-04-16 15:20:36 -07:00
/*
* Check if leaf is less than 50 % full , caller may want to
* " join " the leaf with a sibling if so .
*/
2013-04-24 18:58:55 +10:00
tmp = ichdr . usedbytes + xfs_attr3_leaf_hdr_size ( leaf ) +
ichdr . count * sizeof ( xfs_attr_leaf_entry_t ) ;
2014-06-06 15:18:10 +10:00
return tmp < args - > geo - > magicpct ; /* leaf is < 37% full */
2005-04-16 15:20:36 -07:00
}
/*
* Move all the attribute list entries from drop_leaf into save_leaf .
*/
void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_unbalance (
struct xfs_da_state * state ,
struct xfs_da_state_blk * drop_blk ,
struct xfs_da_state_blk * save_blk )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * drop_leaf = drop_blk - > bp - > b_addr ;
struct xfs_attr_leafblock * save_leaf = save_blk - > bp - > b_addr ;
struct xfs_attr3_icleaf_hdr drophdr ;
struct xfs_attr3_icleaf_hdr savehdr ;
struct xfs_attr_leaf_entry * entry ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_unbalance ( state - > args ) ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & drophdr , drop_leaf ) ;
xfs_attr3_leaf_hdr_from_disk ( state - > args - > geo , & savehdr , save_leaf ) ;
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( drop_leaf ) ;
2005-04-16 15:20:36 -07:00
/*
* Save last hashval from dying block for later Btree fixup .
*/
2013-04-24 18:58:55 +10:00
drop_blk - > hashval = be32_to_cpu ( entry [ drophdr . count - 1 ] . hashval ) ;
2005-04-16 15:20:36 -07:00
/*
* Check if we need a temp buffer , or can we do it in place .
* Note that we don ' t check " leaf " for holes because we will
* always be dropping it , toosmall ( ) decided that for us already .
*/
2013-04-24 18:58:55 +10:00
if ( savehdr . holes = = 0 ) {
2005-04-16 15:20:36 -07:00
/*
* dest leaf has no holes , so we add there . May need
* to make some room in the entry array .
*/
2013-04-24 18:58:55 +10:00
if ( xfs_attr3_leaf_order ( save_blk - > bp , & savehdr ,
drop_blk - > bp , & drophdr ) ) {
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( state - > args ,
drop_leaf , & drophdr , 0 ,
2013-04-24 18:58:55 +10:00
save_leaf , & savehdr , 0 ,
2014-06-06 15:21:45 +10:00
drophdr . count ) ;
2005-04-16 15:20:36 -07:00
} else {
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( state - > args ,
drop_leaf , & drophdr , 0 ,
2013-04-24 18:58:55 +10:00
save_leaf , & savehdr ,
2014-06-06 15:21:45 +10:00
savehdr . count , drophdr . count ) ;
2005-04-16 15:20:36 -07:00
}
} else {
/*
* Destination has holes , so we make a temporary copy
* of the leaf and add them both to that .
*/
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * tmp_leaf ;
struct xfs_attr3_icleaf_hdr tmphdr ;
2024-01-16 09:59:39 +11:00
tmp_leaf = kzalloc ( state - > args - > geo - > blksize ,
GFP_KERNEL | __GFP_NOFAIL ) ;
2013-05-21 18:02:05 +10:00
/*
* Copy the header into the temp leaf so that all the stuff
* not in the incore header is present and gets copied back in
* once we ' ve moved all the entries .
*/
memcpy ( tmp_leaf , save_leaf , xfs_attr3_leaf_hdr_size ( save_leaf ) ) ;
2013-04-24 18:58:55 +10:00
2013-05-21 18:02:05 +10:00
memset ( & tmphdr , 0 , sizeof ( tmphdr ) ) ;
2013-04-24 18:58:55 +10:00
tmphdr . magic = savehdr . magic ;
tmphdr . forw = savehdr . forw ;
tmphdr . back = savehdr . back ;
2014-06-06 15:22:04 +10:00
tmphdr . firstused = state - > args - > geo - > blksize ;
2013-05-21 18:02:05 +10:00
/* write the header to the temp buffer to initialise it */
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( state - > args - > geo , tmp_leaf , & tmphdr ) ;
2013-05-21 18:02:05 +10:00
2013-04-24 18:58:55 +10:00
if ( xfs_attr3_leaf_order ( save_blk - > bp , & savehdr ,
drop_blk - > bp , & drophdr ) ) {
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( state - > args ,
drop_leaf , & drophdr , 0 ,
2013-04-24 18:58:55 +10:00
tmp_leaf , & tmphdr , 0 ,
2014-06-06 15:21:45 +10:00
drophdr . count ) ;
xfs_attr3_leaf_moveents ( state - > args ,
save_leaf , & savehdr , 0 ,
2013-04-24 18:58:55 +10:00
tmp_leaf , & tmphdr , tmphdr . count ,
2014-06-06 15:21:45 +10:00
savehdr . count ) ;
2005-04-16 15:20:36 -07:00
} else {
2014-06-06 15:21:45 +10:00
xfs_attr3_leaf_moveents ( state - > args ,
save_leaf , & savehdr , 0 ,
2013-04-24 18:58:55 +10:00
tmp_leaf , & tmphdr , 0 ,
2014-06-06 15:21:45 +10:00
savehdr . count ) ;
xfs_attr3_leaf_moveents ( state - > args ,
drop_leaf , & drophdr , 0 ,
2013-04-24 18:58:55 +10:00
tmp_leaf , & tmphdr , tmphdr . count ,
2014-06-06 15:21:45 +10:00
drophdr . count ) ;
2005-04-16 15:20:36 -07:00
}
2014-06-06 15:22:04 +10:00
memcpy ( save_leaf , tmp_leaf , state - > args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
savehdr = tmphdr ; /* struct copy */
2024-01-16 09:59:43 +11:00
kfree ( tmp_leaf ) ;
2005-04-16 15:20:36 -07:00
}
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_to_disk ( state - > args - > geo , save_leaf , & savehdr ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( state - > args - > trans , save_blk - > bp , 0 ,
2014-06-06 15:22:04 +10:00
state - > args - > geo - > blksize - 1 ) ;
2005-04-16 15:20:36 -07:00
/*
* Copy out last hashval in each block for B - tree code .
*/
2013-04-24 18:58:55 +10:00
entry = xfs_attr3_leaf_entryp ( save_leaf ) ;
save_blk - > hashval = be32_to_cpu ( entry [ savehdr . count - 1 ] . hashval ) ;
2005-04-16 15:20:36 -07:00
}
/*========================================================================
 * Routines used for finding things in the Btree.
 *========================================================================*/
/*
* Look up a name in a leaf attribute list structure .
* This is the internal routine , it uses the caller ' s buffer .
*
* Note that duplicate keys are allowed , but only check within the
* current leaf node . The Btree code must check in adjacent leaf nodes .
*
* Return in args - > index the index into the entry [ ] array of either
* the found entry , or where the entry should have been ( insert before
* that entry ) .
*
* Don ' t change the args - > value unless we find the attribute .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_lookup_int (
struct xfs_buf * bp ,
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_entry * entries ;
struct xfs_attr_leaf_name_local * name_loc ;
struct xfs_attr_leaf_name_remote * name_rmt ;
xfs_dahash_t hashval ;
int probe ;
int span ;
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_lookup ( args ) ;
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
entries = xfs_attr3_leaf_entryp ( leaf ) ;
2019-11-02 09:40:53 -07:00
if ( ichdr . count > = args - > geo - > blksize / 8 ) {
2020-03-11 10:37:54 -07:00
xfs_buf_mark_corrupt ( bp ) ;
2024-02-22 12:32:18 -08:00
xfs_da_mark_sick ( args ) ;
2018-01-08 10:51:07 -08:00
return - EFSCORRUPTED ;
2019-11-02 09:40:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*
* Binary search . ( note : small blocks will skip this loop )
*/
hashval = args - > hashval ;
2013-04-24 18:58:55 +10:00
probe = span = ichdr . count / 2 ;
for ( entry = & entries [ probe ] ; span > 4 ; entry = & entries [ probe ] ) {
2005-04-16 15:20:36 -07:00
span / = 2 ;
2006-03-17 17:29:02 +11:00
if ( be32_to_cpu ( entry - > hashval ) < hashval )
2005-04-16 15:20:36 -07:00
probe + = span ;
2006-03-17 17:29:02 +11:00
else if ( be32_to_cpu ( entry - > hashval ) > hashval )
2005-04-16 15:20:36 -07:00
probe - = span ;
else
break ;
}
2019-11-02 09:40:53 -07:00
if ( ! ( probe > = 0 & & ( ! ichdr . count | | probe < ichdr . count ) ) ) {
2020-03-11 10:37:54 -07:00
xfs_buf_mark_corrupt ( bp ) ;
2024-02-22 12:32:18 -08:00
xfs_da_mark_sick ( args ) ;
2018-01-08 10:51:07 -08:00
return - EFSCORRUPTED ;
2019-11-02 09:40:53 -07:00
}
if ( ! ( span < = 4 | | be32_to_cpu ( entry - > hashval ) = = hashval ) ) {
2020-03-11 10:37:54 -07:00
xfs_buf_mark_corrupt ( bp ) ;
2024-02-22 12:32:18 -08:00
xfs_da_mark_sick ( args ) ;
2018-01-08 10:51:07 -08:00
return - EFSCORRUPTED ;
2019-11-02 09:40:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*
* Since we may have duplicate hashval ' s , find the first matching
* hashval in the leaf .
*/
2013-04-24 18:58:55 +10:00
while ( probe > 0 & & be32_to_cpu ( entry - > hashval ) > = hashval ) {
2005-04-16 15:20:36 -07:00
entry - - ;
probe - - ;
}
2013-04-24 18:58:55 +10:00
while ( probe < ichdr . count & &
be32_to_cpu ( entry - > hashval ) < hashval ) {
2005-04-16 15:20:36 -07:00
entry + + ;
probe + + ;
}
2013-04-24 18:58:55 +10:00
if ( probe = = ichdr . count | | be32_to_cpu ( entry - > hashval ) ! = hashval ) {
2005-04-16 15:20:36 -07:00
args - > index = probe ;
2014-06-25 14:58:08 +10:00
return - ENOATTR ;
2005-04-16 15:20:36 -07:00
}
/*
* Duplicate keys may be present , so search all of them for a match .
*/
2013-04-24 18:58:55 +10:00
for ( ; probe < ichdr . count & & ( be32_to_cpu ( entry - > hashval ) = = hashval ) ;
2005-04-16 15:20:36 -07:00
entry + + , probe + + ) {
/*
* GROT : Add code to remove incomplete entries .
*/
if ( entry - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , probe ) ;
2020-02-26 17:30:36 -08:00
if ( ! xfs_attr_match ( args , name_loc - > namelen ,
name_loc - > nameval , entry - > flags ) )
2005-04-16 15:20:36 -07:00
continue ;
args - > index = probe ;
2014-06-25 14:58:08 +10:00
return - EEXIST ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , probe ) ;
2020-02-26 17:30:36 -08:00
if ( ! xfs_attr_match ( args , name_rmt - > namelen ,
name_rmt - > name , entry - > flags ) )
2005-04-16 15:20:36 -07:00
continue ;
args - > index = probe ;
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
args - > rmtvaluelen = be32_to_cpu ( name_rmt - > valuelen ) ;
2006-03-17 17:29:18 +11:00
args - > rmtblkno = be32_to_cpu ( name_rmt - > valueblk ) ;
xfs: rework remote attr CRCs
Note: this changes the on-disk remote attribute format. I assert
that this is OK to do as CRCs are marked experimental and the first
kernel it is included in has not yet reached release yet. Further,
the userspace utilities are still evolving and so anyone using this
stuff right now is a developer or tester using volatile filesystems
for testing this feature. Hence changing the format right now to
save longer term pain is the right thing to do.
The fundamental change is to move from a header per extent in the
attribute to a header per filesytem block in the attribute. This
means there are more header blocks and the parsing of the attribute
data is slightly more complex, but it has the advantage that we
always know the size of the attribute on disk based on the length of
the data it contains.
This is where the header-per-extent method has problems. We don't
know the size of the attribute on disk without first knowing how
many extents are used to hold it. And we can't tell from a
mapping lookup, either, because remote attributes can be allocated
contiguously with other attribute blocks and so there is no obvious
way of determining the actual size of the atribute on disk short of
walking and mapping buffers.
The problem with this approach is that if we map a buffer
incorrectly (e.g. we make the last buffer for the attribute data too
long), we then get buffer cache lookup failure when we map it
correctly. i.e. we get a size mismatch on lookup. This is not
necessarily fatal, but it's a cache coherency problem that can lead
to returning the wrong data to userspace or writing the wrong data
to disk. And debug kernels will assert fail if this occurs.
I found lots of niggly little problems trying to fix this issue on a
4k block size filesystem, finally getting it to pass with lots of
fixes. The thing is, 1024 byte filesystems still failed, and it was
getting really complex handling all the corner cases that were
showing up. And there were clearly more that I hadn't found yet.
It is complex, fragile code, and if we don't fix it now, it will be
complex, fragile code forever more.
Hence the simple fix is to add a header to each filesystem block.
This gives us the same relationship between the attribute data
length and the number of blocks on disk as we have without CRCs -
it's a linear mapping and doesn't require us to guess anything. It
is simple to implement, too - the remote block count calculated at
lookup time can be used by the remote attribute set/get/remove code
without modification for both CRC and non-CRC filesystems. The world
becomes sane again.
Because the copy-in and copy-out now need to iterate over each
filesystem block, I moved them into helper functions so we separate
the block mapping and buffer manupulations from the attribute data
and CRC header manipulations. The code becomes much clearer as a
result, and it is a lot easier to understand and debug. It also
appears to be much more robust - once it worked on 4k block size
filesystems, it has worked without failure on 1k block size
filesystems, too.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-05-21 18:02:08 +10:00
args - > rmtblkcnt = xfs_attr3_rmt_blocks (
args - > dp - > i_mount ,
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
args - > rmtvaluelen ) ;
2014-06-25 14:58:08 +10:00
return - EEXIST ;
2005-04-16 15:20:36 -07:00
}
}
args - > index = probe ;
2014-06-25 14:58:08 +10:00
return - ENOATTR ;
2005-04-16 15:20:36 -07:00
}
/*
* Get the value associated with an attribute name from a leaf attribute
* list structure .
2019-08-29 09:04:08 -07:00
*
2020-02-26 17:30:35 -08:00
* If args - > valuelen is zero , only the length needs to be returned . Unlike a
* lookup , we only return an error if the attribute does not exist or we can ' t
* retrieve the value .
2005-04-16 15:20:36 -07:00
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_getvalue (
struct xfs_buf * bp ,
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_name_local * name_loc ;
struct xfs_attr_leaf_name_remote * name_rmt ;
2005-04-16 15:20:36 -07:00
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr . count < args - > geo - > blksize / 8 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index < ichdr . count ) ;
2005-04-16 15:20:36 -07:00
2013-04-24 18:58:55 +10:00
entry = & xfs_attr3_leaf_entryp ( leaf ) [ args - > index ] ;
2005-04-16 15:20:36 -07:00
if ( entry - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
ASSERT ( name_loc - > namelen = = args - > namelen ) ;
ASSERT ( memcmp ( args - > name , name_loc - > nameval , args - > namelen ) = = 0 ) ;
2019-08-29 09:04:10 -07:00
return xfs_attr_copy_value ( args ,
& name_loc - > nameval [ args - > namelen ] ,
be16_to_cpu ( name_loc - > valuelen ) ) ;
2019-08-29 09:04:09 -07:00
}
name_rmt = xfs_attr3_leaf_name_remote ( leaf , args - > index ) ;
ASSERT ( name_rmt - > namelen = = args - > namelen ) ;
ASSERT ( memcmp ( args - > name , name_rmt - > name , args - > namelen ) = = 0 ) ;
args - > rmtvaluelen = be32_to_cpu ( name_rmt - > valuelen ) ;
args - > rmtblkno = be32_to_cpu ( name_rmt - > valueblk ) ;
args - > rmtblkcnt = xfs_attr3_rmt_blocks ( args - > dp - > i_mount ,
args - > rmtvaluelen ) ;
2019-08-29 09:04:10 -07:00
return xfs_attr_copy_value ( args , NULL , args - > rmtvaluelen ) ;
2005-04-16 15:20:36 -07:00
}
/*========================================================================
 * Utility routines.
 *========================================================================*/
/*
* Move the indicated entries from one leaf to another .
* NOTE : this routine modifies both source and destination leaves .
*/
/*ARGSUSED*/
STATIC void
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_moveents (
2014-06-06 15:21:45 +10:00
struct xfs_da_args * args ,
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf_s ,
struct xfs_attr3_icleaf_hdr * ichdr_s ,
int start_s ,
struct xfs_attr_leafblock * leaf_d ,
struct xfs_attr3_icleaf_hdr * ichdr_d ,
int start_d ,
2014-06-06 15:21:45 +10:00
int count )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leaf_entry * entry_s ;
struct xfs_attr_leaf_entry * entry_d ;
int desti ;
int tmp ;
int i ;
2005-04-16 15:20:36 -07:00
/*
* Check for nothing to do .
*/
if ( count = = 0 )
return ;
/*
* Set up environment .
*/
2013-04-24 18:58:55 +10:00
ASSERT ( ichdr_s - > magic = = XFS_ATTR_LEAF_MAGIC | |
ichdr_s - > magic = = XFS_ATTR3_LEAF_MAGIC ) ;
ASSERT ( ichdr_s - > magic = = ichdr_d - > magic ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr_s - > count > 0 & & ichdr_s - > count < args - > geo - > blksize / 8 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ichdr_s - > firstused > = ( ichdr_s - > count * sizeof ( * entry_s ) )
+ xfs_attr3_leaf_hdr_size ( leaf_s ) ) ;
2014-06-06 15:21:45 +10:00
ASSERT ( ichdr_d - > count < args - > geo - > blksize / 8 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( ichdr_d - > firstused > = ( ichdr_d - > count * sizeof ( * entry_d ) )
+ xfs_attr3_leaf_hdr_size ( leaf_d ) ) ;
ASSERT ( start_s < ichdr_s - > count ) ;
ASSERT ( start_d < = ichdr_d - > count ) ;
ASSERT ( count < = ichdr_s - > count ) ;
2005-04-16 15:20:36 -07:00
/*
* Move the entries in the destination leaf up to make a hole ?
*/
2013-04-24 18:58:55 +10:00
if ( start_d < ichdr_d - > count ) {
tmp = ichdr_d - > count - start_d ;
2005-04-16 15:20:36 -07:00
tmp * = sizeof ( xfs_attr_leaf_entry_t ) ;
2013-04-24 18:58:55 +10:00
entry_s = & xfs_attr3_leaf_entryp ( leaf_d ) [ start_d ] ;
entry_d = & xfs_attr3_leaf_entryp ( leaf_d ) [ start_d + count ] ;
memmove ( entry_d , entry_s , tmp ) ;
2005-04-16 15:20:36 -07:00
}
/*
* Copy all entry ' s in the same ( sorted ) order ,
* but allocate attribute info packed and in sequence .
*/
2013-04-24 18:58:55 +10:00
entry_s = & xfs_attr3_leaf_entryp ( leaf_s ) [ start_s ] ;
entry_d = & xfs_attr3_leaf_entryp ( leaf_d ) [ start_d ] ;
2005-04-16 15:20:36 -07:00
desti = start_d ;
for ( i = 0 ; i < count ; entry_s + + , entry_d + + , desti + + , i + + ) {
2013-04-24 18:58:55 +10:00
ASSERT ( be16_to_cpu ( entry_s - > nameidx ) > = ichdr_s - > firstused ) ;
2005-04-16 15:20:36 -07:00
tmp = xfs_attr_leaf_entsize ( leaf_s , start_s + i ) ;
# ifdef GROT
/*
* Code to drop INCOMPLETE entries . Difficult to use as we
* may also need to change the insertion index . Code turned
* off for 6.2 , should be revisited later .
*/
if ( entry_s - > flags & XFS_ATTR_INCOMPLETE ) { /* skip partials? */
2013-04-24 18:58:55 +10:00
memset ( xfs_attr3_leaf_name ( leaf_s , start_s + i ) , 0 , tmp ) ;
ichdr_s - > usedbytes - = tmp ;
ichdr_s - > count - = 1 ;
2005-04-16 15:20:36 -07:00
entry_d - - ; /* to compensate for ++ in loop hdr */
desti - - ;
if ( ( start_s + i ) < offset )
result + + ; /* insertion index adjustment */
} else {
# endif /* GROT */
2013-04-24 18:58:55 +10:00
ichdr_d - > firstused - = tmp ;
2005-04-16 15:20:36 -07:00
/* both on-disk, don't endian flip twice */
entry_d - > hashval = entry_s - > hashval ;
2013-04-24 18:58:55 +10:00
entry_d - > nameidx = cpu_to_be16 ( ichdr_d - > firstused ) ;
2005-04-16 15:20:36 -07:00
entry_d - > flags = entry_s - > flags ;
2006-03-17 17:29:02 +11:00
ASSERT ( be16_to_cpu ( entry_d - > nameidx ) + tmp
2014-06-06 15:21:45 +10:00
< = args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
memmove ( xfs_attr3_leaf_name ( leaf_d , desti ) ,
xfs_attr3_leaf_name ( leaf_s , start_s + i ) , tmp ) ;
2006-03-17 17:29:02 +11:00
ASSERT ( be16_to_cpu ( entry_s - > nameidx ) + tmp
2014-06-06 15:21:45 +10:00
< = args - > geo - > blksize ) ;
2013-04-24 18:58:55 +10:00
memset ( xfs_attr3_leaf_name ( leaf_s , start_s + i ) , 0 , tmp ) ;
ichdr_s - > usedbytes - = tmp ;
ichdr_d - > usedbytes + = tmp ;
ichdr_s - > count - = 1 ;
ichdr_d - > count + = 1 ;
tmp = ichdr_d - > count * sizeof ( xfs_attr_leaf_entry_t )
+ xfs_attr3_leaf_hdr_size ( leaf_d ) ;
ASSERT ( ichdr_d - > firstused > = tmp ) ;
2005-04-16 15:20:36 -07:00
# ifdef GROT
}
# endif /* GROT */
}
/*
* Zero out the entries we just copied .
*/
2013-04-24 18:58:55 +10:00
if ( start_s = = ichdr_s - > count ) {
2005-04-16 15:20:36 -07:00
tmp = count * sizeof ( xfs_attr_leaf_entry_t ) ;
2013-04-24 18:58:55 +10:00
entry_s = & xfs_attr3_leaf_entryp ( leaf_s ) [ start_s ] ;
2005-04-16 15:20:36 -07:00
ASSERT ( ( ( char * ) entry_s + tmp ) < =
2014-06-06 15:21:45 +10:00
( ( char * ) leaf_s + args - > geo - > blksize ) ) ;
2013-04-24 18:58:55 +10:00
memset ( entry_s , 0 , tmp ) ;
2005-04-16 15:20:36 -07:00
} else {
/*
* Move the remaining entries down to fill the hole ,
* then zero the entries at the top .
*/
2013-04-24 18:58:55 +10:00
tmp = ( ichdr_s - > count - count ) * sizeof ( xfs_attr_leaf_entry_t ) ;
entry_s = & xfs_attr3_leaf_entryp ( leaf_s ) [ start_s + count ] ;
entry_d = & xfs_attr3_leaf_entryp ( leaf_s ) [ start_s ] ;
memmove ( entry_d , entry_s , tmp ) ;
2005-04-16 15:20:36 -07:00
tmp = count * sizeof ( xfs_attr_leaf_entry_t ) ;
2013-04-24 18:58:55 +10:00
entry_s = & xfs_attr3_leaf_entryp ( leaf_s ) [ ichdr_s - > count ] ;
2005-04-16 15:20:36 -07:00
ASSERT ( ( ( char * ) entry_s + tmp ) < =
2014-06-06 15:21:45 +10:00
( ( char * ) leaf_s + args - > geo - > blksize ) ) ;
2013-04-24 18:58:55 +10:00
memset ( entry_s , 0 , tmp ) ;
2005-04-16 15:20:36 -07:00
}
/*
* Fill in the freemap information
*/
2013-04-24 18:58:55 +10:00
ichdr_d - > freemap [ 0 ] . base = xfs_attr3_leaf_hdr_size ( leaf_d ) ;
ichdr_d - > freemap [ 0 ] . base + = ichdr_d - > count * sizeof ( xfs_attr_leaf_entry_t ) ;
ichdr_d - > freemap [ 0 ] . size = ichdr_d - > firstused - ichdr_d - > freemap [ 0 ] . base ;
ichdr_d - > freemap [ 1 ] . base = 0 ;
ichdr_d - > freemap [ 2 ] . base = 0 ;
ichdr_d - > freemap [ 1 ] . size = 0 ;
ichdr_d - > freemap [ 2 ] . size = 0 ;
ichdr_s - > holes = 1 ; /* leaf may not be compact */
2005-04-16 15:20:36 -07:00
}
/*
* Pick up the last hashvalue from a leaf block .
*/
xfs_dahash_t
2012-06-22 18:50:14 +10:00
xfs_attr_leaf_lasthash (
struct xfs_buf * bp ,
int * count )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr ichdr ;
struct xfs_attr_leaf_entry * entries ;
2019-06-28 19:27:29 -07:00
struct xfs_mount * mp = bp - > b_mount ;
2005-04-16 15:20:36 -07:00
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( mp - > m_attr_geo , & ichdr , bp - > b_addr ) ;
2013-04-24 18:58:55 +10:00
entries = xfs_attr3_leaf_entryp ( bp - > b_addr ) ;
2005-04-16 15:20:36 -07:00
if ( count )
2013-04-24 18:58:55 +10:00
* count = ichdr . count ;
if ( ! ichdr . count )
return 0 ;
return be32_to_cpu ( entries [ ichdr . count - 1 ] . hashval ) ;
2005-04-16 15:20:36 -07:00
}
/*
* Calculate the number of bytes used to store the indicated attribute
* ( whether local or remote only calculate bytes in this block ) .
*/
2005-06-21 15:36:52 +10:00
STATIC int
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_entsize ( xfs_attr_leafblock_t * leaf , int index )
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leaf_entry * entries ;
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_name_local_t * name_loc ;
xfs_attr_leaf_name_remote_t * name_rmt ;
int size ;
2013-04-24 18:58:55 +10:00
entries = xfs_attr3_leaf_entryp ( leaf ) ;
if ( entries [ index ] . flags & XFS_ATTR_LOCAL ) {
name_loc = xfs_attr3_leaf_name_local ( leaf , index ) ;
2009-01-01 16:40:11 -06:00
size = xfs_attr_leaf_entsize_local ( name_loc - > namelen ,
2006-03-17 17:29:09 +11:00
be16_to_cpu ( name_loc - > valuelen ) ) ;
2005-04-16 15:20:36 -07:00
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , index ) ;
2009-01-01 16:40:11 -06:00
size = xfs_attr_leaf_entsize_remote ( name_rmt - > namelen ) ;
2005-04-16 15:20:36 -07:00
}
2013-04-24 18:58:55 +10:00
return size ;
2005-04-16 15:20:36 -07:00
}
/*
* Calculate the number of bytes that would be required to store the new
* attribute ( whether local or remote only calculate bytes in this block ) .
* This routine decides as a side effect whether the attribute will be
* a " local " or a " remote " attribute .
*/
int
2014-06-06 15:21:27 +10:00
xfs_attr_leaf_newentsize (
struct xfs_da_args * args ,
int * local )
2005-04-16 15:20:36 -07:00
{
2014-06-06 15:21:27 +10:00
int size ;
2005-04-16 15:20:36 -07:00
2014-06-06 15:21:27 +10:00
size = xfs_attr_leaf_entsize_local ( args - > namelen , args - > valuelen ) ;
if ( size < xfs_attr_leaf_entsize_local_max ( args - > geo - > blksize ) ) {
if ( local )
2005-04-16 15:20:36 -07:00
* local = 1 ;
2014-06-06 15:21:27 +10:00
return size ;
2005-04-16 15:20:36 -07:00
}
2014-06-06 15:21:27 +10:00
if ( local )
* local = 0 ;
return xfs_attr_leaf_entsize_remote ( args - > namelen ) ;
2005-04-16 15:20:36 -07:00
}
/*========================================================================
* Manage the INCOMPLETE flag in a leaf entry
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
/*
* Clear the INCOMPLETE flag on an entry in a leaf block .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_clearflag (
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_name_remote * name_rmt ;
struct xfs_buf * bp ;
int error ;
2005-04-16 15:20:36 -07:00
# ifdef DEBUG
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr ichdr ;
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_name_local_t * name_loc ;
int namelen ;
char * name ;
# endif /* DEBUG */
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_clearflag ( args ) ;
2005-04-16 15:20:36 -07:00
/*
* Set up the operation .
*/
2019-11-20 09:46:02 -08:00
error = xfs_attr3_leaf_read ( args - > trans , args - > dp , args - > blkno , & bp ) ;
2012-11-12 22:54:16 +11:00
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2013-04-24 18:58:55 +10:00
entry = & xfs_attr3_leaf_entryp ( leaf ) [ args - > index ] ;
2005-04-16 15:20:36 -07:00
ASSERT ( entry - > flags & XFS_ATTR_INCOMPLETE ) ;
# ifdef DEBUG
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index < ichdr . count ) ;
ASSERT ( args - > index > = 0 ) ;
2005-04-16 15:20:36 -07:00
if ( entry - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
namelen = name_loc - > namelen ;
name = ( char * ) name_loc - > nameval ;
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
namelen = name_rmt - > namelen ;
name = ( char * ) name_rmt - > name ;
}
2006-03-17 17:29:02 +11:00
ASSERT ( be32_to_cpu ( entry - > hashval ) = = args - > hashval ) ;
2005-04-16 15:20:36 -07:00
ASSERT ( namelen = = args - > namelen ) ;
ASSERT ( memcmp ( name , args - > name , namelen ) = = 0 ) ;
# endif /* DEBUG */
entry - > flags & = ~ XFS_ATTR_INCOMPLETE ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , entry , sizeof ( * entry ) ) ) ;
if ( args - > rmtblkno ) {
ASSERT ( ( entry - > flags & XFS_ATTR_LOCAL ) = = 0 ) ;
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , args - > index ) ;
2006-03-17 17:29:18 +11:00
name_rmt - > valueblk = cpu_to_be32 ( args - > rmtblkno ) ;
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
name_rmt - > valuelen = cpu_to_be32 ( args - > rmtvaluelen ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , name_rmt , sizeof ( * name_rmt ) ) ) ;
}
2020-07-20 21:47:26 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* Set the INCOMPLETE flag on an entry in a leaf block .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_setflag (
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf ;
struct xfs_attr_leaf_entry * entry ;
struct xfs_attr_leaf_name_remote * name_rmt ;
struct xfs_buf * bp ;
2005-04-16 15:20:36 -07:00
int error ;
2013-04-24 18:58:55 +10:00
# ifdef DEBUG
struct xfs_attr3_icleaf_hdr ichdr ;
# endif
2005-04-16 15:20:36 -07:00
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_setflag ( args ) ;
2005-04-16 15:20:36 -07:00
/*
* Set up the operation .
*/
2019-11-20 09:46:02 -08:00
error = xfs_attr3_leaf_read ( args - > trans , args - > dp , args - > blkno , & bp ) ;
2012-11-12 22:54:16 +11:00
if ( error )
2014-06-22 15:03:54 +10:00
return error ;
2005-04-16 15:20:36 -07:00
2012-06-22 18:50:14 +10:00
leaf = bp - > b_addr ;
2013-04-24 18:58:55 +10:00
# ifdef DEBUG
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr , leaf ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index < ichdr . count ) ;
2005-04-16 15:20:36 -07:00
ASSERT ( args - > index > = 0 ) ;
2013-04-24 18:58:55 +10:00
# endif
entry = & xfs_attr3_leaf_entryp ( leaf ) [ args - > index ] ;
2005-04-16 15:20:36 -07:00
ASSERT ( ( entry - > flags & XFS_ATTR_INCOMPLETE ) = = 0 ) ;
entry - > flags | = XFS_ATTR_INCOMPLETE ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , entry , sizeof ( * entry ) ) ) ;
if ( ( entry - > flags & XFS_ATTR_LOCAL ) = = 0 ) {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf , args - > index ) ;
2005-04-16 15:20:36 -07:00
name_rmt - > valueblk = 0 ;
name_rmt - > valuelen = 0 ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf , name_rmt , sizeof ( * name_rmt ) ) ) ;
}
2020-07-20 21:47:25 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}
/*
* In a single transaction , clear the INCOMPLETE flag on the leaf entry
* given by args - > blkno / index and set the INCOMPLETE flag on the leaf
* entry given by args - > blkno2 / index2 .
*
* Note that they could be in different blocks , or in the same block .
*/
int
2013-04-24 18:58:55 +10:00
xfs_attr3_leaf_flipflags (
struct xfs_da_args * args )
2005-04-16 15:20:36 -07:00
{
2013-04-24 18:58:55 +10:00
struct xfs_attr_leafblock * leaf1 ;
struct xfs_attr_leafblock * leaf2 ;
struct xfs_attr_leaf_entry * entry1 ;
struct xfs_attr_leaf_entry * entry2 ;
struct xfs_attr_leaf_name_remote * name_rmt ;
struct xfs_buf * bp1 ;
struct xfs_buf * bp2 ;
2005-04-16 15:20:36 -07:00
int error ;
# ifdef DEBUG
2013-04-24 18:58:55 +10:00
struct xfs_attr3_icleaf_hdr ichdr1 ;
struct xfs_attr3_icleaf_hdr ichdr2 ;
2005-04-16 15:20:36 -07:00
xfs_attr_leaf_name_local_t * name_loc ;
int namelen1 , namelen2 ;
char * name1 , * name2 ;
# endif /* DEBUG */
2012-03-22 05:15:13 +00:00
trace_xfs_attr_leaf_flipflags ( args ) ;
2005-04-16 15:20:36 -07:00
/*
* Read the block containing the " old " attr
*/
2019-11-20 09:46:02 -08:00
error = xfs_attr3_leaf_read ( args - > trans , args - > dp , args - > blkno , & bp1 ) ;
2012-11-12 22:54:16 +11:00
if ( error )
return error ;
2005-04-16 15:20:36 -07:00
/*
* Read the block containing the " new " attr , if it is different
*/
if ( args - > blkno2 ! = args - > blkno ) {
2013-04-24 18:58:55 +10:00
error = xfs_attr3_leaf_read ( args - > trans , args - > dp , args - > blkno2 ,
2019-11-20 09:46:02 -08:00
& bp2 ) ;
2012-11-12 22:54:16 +11:00
if ( error )
return error ;
2005-04-16 15:20:36 -07:00
} else {
bp2 = bp1 ;
}
2012-06-22 18:50:14 +10:00
leaf1 = bp1 - > b_addr ;
2013-04-24 18:58:55 +10:00
entry1 = & xfs_attr3_leaf_entryp ( leaf1 ) [ args - > index ] ;
2005-04-16 15:20:36 -07:00
2012-06-22 18:50:14 +10:00
leaf2 = bp2 - > b_addr ;
2013-04-24 18:58:55 +10:00
entry2 = & xfs_attr3_leaf_entryp ( leaf2 ) [ args - > index2 ] ;
2005-04-16 15:20:36 -07:00
# ifdef DEBUG
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr1 , leaf1 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index < ichdr1 . count ) ;
ASSERT ( args - > index > = 0 ) ;
2015-04-13 11:26:02 +10:00
xfs_attr3_leaf_hdr_from_disk ( args - > geo , & ichdr2 , leaf2 ) ;
2013-04-24 18:58:55 +10:00
ASSERT ( args - > index2 < ichdr2 . count ) ;
ASSERT ( args - > index2 > = 0 ) ;
2005-04-16 15:20:36 -07:00
if ( entry1 - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf1 , args - > index ) ;
2005-04-16 15:20:36 -07:00
namelen1 = name_loc - > namelen ;
name1 = ( char * ) name_loc - > nameval ;
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf1 , args - > index ) ;
2005-04-16 15:20:36 -07:00
namelen1 = name_rmt - > namelen ;
name1 = ( char * ) name_rmt - > name ;
}
if ( entry2 - > flags & XFS_ATTR_LOCAL ) {
2013-04-24 18:58:55 +10:00
name_loc = xfs_attr3_leaf_name_local ( leaf2 , args - > index2 ) ;
2005-04-16 15:20:36 -07:00
namelen2 = name_loc - > namelen ;
name2 = ( char * ) name_loc - > nameval ;
} else {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf2 , args - > index2 ) ;
2005-04-16 15:20:36 -07:00
namelen2 = name_rmt - > namelen ;
name2 = ( char * ) name_rmt - > name ;
}
2006-03-17 17:29:02 +11:00
ASSERT ( be32_to_cpu ( entry1 - > hashval ) = = be32_to_cpu ( entry2 - > hashval ) ) ;
2005-04-16 15:20:36 -07:00
ASSERT ( namelen1 = = namelen2 ) ;
ASSERT ( memcmp ( name1 , name2 , namelen1 ) = = 0 ) ;
# endif /* DEBUG */
ASSERT ( entry1 - > flags & XFS_ATTR_INCOMPLETE ) ;
ASSERT ( ( entry2 - > flags & XFS_ATTR_INCOMPLETE ) = = 0 ) ;
entry1 - > flags & = ~ XFS_ATTR_INCOMPLETE ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp1 ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf1 , entry1 , sizeof ( * entry1 ) ) ) ;
if ( args - > rmtblkno ) {
ASSERT ( ( entry1 - > flags & XFS_ATTR_LOCAL ) = = 0 ) ;
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf1 , args - > index ) ;
2006-03-17 17:29:18 +11:00
name_rmt - > valueblk = cpu_to_be32 ( args - > rmtblkno ) ;
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. The enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removes with the state of the new
attribute that we just added. [Spotted by Brain Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
name_rmt - > valuelen = cpu_to_be32 ( args - > rmtvaluelen ) ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp1 ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf1 , name_rmt , sizeof ( * name_rmt ) ) ) ;
}
entry2 - > flags | = XFS_ATTR_INCOMPLETE ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp2 ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf2 , entry2 , sizeof ( * entry2 ) ) ) ;
if ( ( entry2 - > flags & XFS_ATTR_LOCAL ) = = 0 ) {
2013-04-24 18:58:55 +10:00
name_rmt = xfs_attr3_leaf_name_remote ( leaf2 , args - > index2 ) ;
2005-04-16 15:20:36 -07:00
name_rmt - > valueblk = 0 ;
name_rmt - > valuelen = 0 ;
2012-06-22 18:50:14 +10:00
xfs_trans_log_buf ( args - > trans , bp2 ,
2005-04-16 15:20:36 -07:00
XFS_DA_LOGRANGE ( leaf2 , name_rmt , sizeof ( * name_rmt ) ) ) ;
}
2020-07-20 21:47:23 -07:00
return 0 ;
2005-04-16 15:20:36 -07:00
}