2018-06-06 05:42:14 +03:00
// SPDX-License-Identifier: GPL-2.0
2009-12-15 02:14:59 +03:00
/*
* Copyright ( c ) 2009 , Christoph Hellwig
* All Rights Reserved .
2021-08-19 06:04:44 +03:00
*
* NOTE : none of these tracepoints shall be considered a stable kernel ABI
* as they can change at any time .
*
* Current conventions for printing numbers measuring specific units :
*
* agno : allocation group number
*
* agino : per - AG inode number
* ino : filesystem inode number
*
* agbno : per - AG block number in fs blocks
* startblock : physical block number for file mappings . This is either a
* segmented fsblock for data device mappings , or a rfsblock
* for realtime device mappings
* fsbcount : number of blocks in an extent , in fs blocks
*
* daddr : physical block number in 512 b blocks
* bbcount : number of blocks in a physical extent , in 512 b blocks
*
* owner : reverse - mapping owner , usually inodes
*
* fileoff : file offset , in fs blocks
* pos : file offset , in bytes
* bytecount : number of bytes
*
* disize : ondisk file size , in bytes
* isize : incore file size , in bytes
*
* forkoff : inode fork offset , in bytes
*
* ireccount : number of inode records
*
* Numbers describing space allocations ( blocks , extents , inodes ) should be
* formatted in hexadecimal .
2009-12-15 02:14:59 +03:00
*/
# undef TRACE_SYSTEM
# define TRACE_SYSTEM xfs
# if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
# define _TRACE_XFS_H
# include <linux/tracepoint.h>
/* Opaque forward declarations; the full definitions are not in this header. */
struct xfs_agf;
struct xfs_alloc_arg;
struct xfs_attr_list_context;
struct xfs_buf_log_item;
struct xfs_da_args;
struct xfs_da_node_entry;
struct xfs_dquot;
struct xfs_log_item;
struct xlog;
struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
struct xlog_rec_header;
struct xlog_in_core;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 04:08:36 +03:00
/* More opaque forward declarations (note xfs_btree_ptr is a union). */
struct xfs_btree_cur;
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
struct xfs_icreate_log;
struct xfs_owner_info;
struct xfs_trans_res;
struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
struct xfs_perag;
2009-12-15 02:14:59 +03:00
2020-02-27 04:30:42 +03:00
/*
 * Flag/name pairs used with __print_flags() to decode an attr_filter
 * bitmask into text.
 */
#define XFS_ATTR_FILTER_FLAGS \
	{ XFS_ATTR_ROOT,	" ROOT " }, \
	{ XFS_ATTR_SECURE,	" SECURE " }, \
	{ XFS_ATTR_INCOMPLETE,	" INCOMPLETE " }
2020-02-27 04:30:42 +03:00
2009-12-21 17:03:03 +03:00
/*
 * Event class for extended attribute listing.  Records the list cursor
 * (hashval/blkno/offset) together with the output buffer bookkeeping held
 * in the attr list context, and decodes attr_filter symbolically.
 */
DECLARE_EVENT_CLASS(xfs_attr_list_class,
	TP_PROTO(struct xfs_attr_list_context *ctx),
	TP_ARGS(ctx),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(u32, hashval)
		__field(u32, blkno)
		__field(u32, offset)
		__field(void *, buffer)
		__field(int, bufsize)
		__field(int, count)
		__field(int, firstu)
		__field(int, dupcnt)
		__field(unsigned int, attr_filter)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
		__entry->ino = ctx->dp->i_ino;
		__entry->hashval = ctx->cursor.hashval;
		__entry->blkno = ctx->cursor.blkno;
		__entry->offset = ctx->cursor.offset;
		__entry->buffer = ctx->buffer;
		__entry->bufsize = ctx->bufsize;
		__entry->count = ctx->count;
		__entry->firstu = ctx->firstu;
		/*
		 * dupcnt is printed below, so it has to be recorded here;
		 * previously the field was never assigned.
		 * NOTE(review): assumes the context carries a dupcnt member.
		 */
		__entry->dupcnt = ctx->dupcnt;
		__entry->attr_filter = ctx->attr_filter;
	),
	TP_printk(" dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
		  " buffer %p size %u count %u firstu %u filter %s ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __entry->hashval,
		  __entry->blkno,
		  __entry->offset,
		  __entry->dupcnt,
		  __entry->buffer,
		  __entry->bufsize,
		  __entry->count,
		  __entry->firstu,
		  __print_flags(__entry->attr_filter, " | ",
				XFS_ATTR_FILTER_FLAGS)
	)
)
2009-12-15 02:14:59 +03:00
# define DEFINE_ATTR_LIST_EVENT(name) \
2009-12-21 17:03:03 +03:00
DEFINE_EVENT ( xfs_attr_list_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_attr_list_context * ctx ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( ctx ) )
2009-12-15 02:14:59 +03:00
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_sf ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_sf_all ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_leaf ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_leaf_end ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_full ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_add ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_wrong_blk ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_list_notfound ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_LIST_EVENT ( xfs_attr_leaf_list ) ;
DEFINE_ATTR_LIST_EVENT ( xfs_attr_node_list ) ;
2009-12-15 02:14:59 +03:00
2020-11-30 03:33:39 +03:00
TRACE_EVENT ( xlog_intent_recovery_failed ,
TP_PROTO ( struct xfs_mount * mp , int error , void * function ) ,
TP_ARGS ( mp , error , function ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( int , error )
__field ( void * , function )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > error = error ;
__entry - > function = function ;
) ,
TP_printk ( " dev %d:%d error %d function %pS " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > error , __entry - > function )
) ;
2010-05-24 12:25:57 +04:00
DECLARE_EVENT_CLASS ( xfs_perag_class ,
2023-02-13 01:14:52 +03:00
TP_PROTO ( struct xfs_perag * pag , unsigned long caller_ip ) ,
TP_ARGS ( pag , caller_ip ) ,
2010-05-24 12:25:57 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( int , refcount )
2023-02-13 01:14:52 +03:00
__field ( int , active_refcount )
2010-05-24 12:25:57 +04:00
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
2023-02-13 01:14:52 +03:00
__entry - > dev = pag - > pag_mount - > m_super - > s_dev ;
__entry - > agno = pag - > pag_agno ;
__entry - > refcount = atomic_read ( & pag - > pag_ref ) ;
__entry - > active_refcount = atomic_read ( & pag - > pag_active_ref ) ;
2010-05-24 12:25:57 +04:00
__entry - > caller_ip = caller_ip ;
) ,
2023-02-13 01:14:52 +03:00
TP_printk ( " dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS " ,
2010-05-24 12:25:57 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > refcount ,
2023-02-13 01:14:52 +03:00
__entry - > active_refcount ,
2010-05-24 12:25:57 +04:00
( char * ) __entry - > caller_ip )
) ;
# define DEFINE_PERAG_REF_EVENT(name) \
DEFINE_EVENT ( xfs_perag_class , name , \
2023-02-13 01:14:52 +03:00
TP_PROTO ( struct xfs_perag * pag , unsigned long caller_ip ) , \
TP_ARGS ( pag , caller_ip ) )
2010-05-24 12:25:57 +04:00
DEFINE_PERAG_REF_EVENT ( xfs_perag_get ) ;
2010-09-24 12:40:15 +04:00
DEFINE_PERAG_REF_EVENT ( xfs_perag_get_tag ) ;
2010-05-24 12:25:57 +04:00
DEFINE_PERAG_REF_EVENT ( xfs_perag_put ) ;
2023-02-13 01:14:42 +03:00
DEFINE_PERAG_REF_EVENT ( xfs_perag_grab ) ;
DEFINE_PERAG_REF_EVENT ( xfs_perag_grab_tag ) ;
DEFINE_PERAG_REF_EVENT ( xfs_perag_rele ) ;
2021-05-31 21:32:02 +03:00
DEFINE_PERAG_REF_EVENT ( xfs_perag_set_inode_tag ) ;
DEFINE_PERAG_REF_EVENT ( xfs_perag_clear_inode_tag ) ;
2010-05-24 12:25:57 +04:00
2021-08-06 21:05:43 +03:00
/* Records the device and the shrinker_hits value handed to the event. */
TRACE_EVENT(xfs_inodegc_worker,
	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
	TP_ARGS(mp, shrinker_hits),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned int, shrinker_hits)
	),
	TP_fast_assign(
		__entry->shrinker_hits = shrinker_hits;
		__entry->dev = mp->m_super->s_dev;
	),
	TP_printk(" dev %d:%d shrinker_hits %u ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->shrinker_hits)
);
2021-08-06 21:05:39 +03:00
/*
 * Event class for filesystem-wide events.  Captures the mount's feature
 * bits, operational state and superblock flags plus the caller address.
 * @mp may be NULL, in which case the mount-derived fields are recorded
 * as zero.
 */
DECLARE_EVENT_CLASS(xfs_fs_class,
	TP_PROTO(struct xfs_mount *mp, void *caller_ip),
	TP_ARGS(mp, caller_ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned long long, mflags)
		__field(unsigned long, opstate)
		__field(unsigned long, sbflags)
		__field(void *, caller_ip)
	),
	TP_fast_assign(
		if (mp) {
			__entry->dev = mp->m_super->s_dev;
			__entry->mflags = mp->m_features;
			__entry->opstate = mp->m_opstate;
			__entry->sbflags = mp->m_super->s_flags;
		} else {
			/*
			 * Every field is printed unconditionally below, so
			 * give them defined values rather than leaving them
			 * unassigned.
			 */
			__entry->dev = 0;
			__entry->mflags = 0;
			__entry->opstate = 0;
			__entry->sbflags = 0;
		}
		__entry->caller_ip = caller_ip;
	),
	TP_printk(" dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->mflags,
		  __print_flags(__entry->opstate, " | ", XFS_OPSTATE_STRINGS),
		  __entry->sbflags,
		  __entry->caller_ip)
);
/* Instantiate one xfs_fs_class tracepoint called @name. */
#define DEFINE_FS_EVENT(name)	\
DEFINE_EVENT(xfs_fs_class, name,	\
	TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
	TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);
DEFINE_FS_EVENT(xfs_inodegc_throttle);
DEFINE_FS_EVENT(xfs_fs_sync_fs);
DEFINE_FS_EVENT(xfs_blockgc_start);
DEFINE_FS_EVENT(xfs_blockgc_stop);
DEFINE_FS_EVENT(xfs_blockgc_worker);
DEFINE_FS_EVENT(xfs_blockgc_flush_all);
2021-08-06 21:05:39 +03:00
2021-08-06 21:05:43 +03:00
/*
 * Records the device, the nr_to_scan value from the shrink_control and the
 * caller address.
 */
TRACE_EVENT(xfs_inodegc_shrinker_scan,
	TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
		 void *caller_ip),
	TP_ARGS(mp, sc, caller_ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(unsigned long, nr_to_scan)
		__field(void *, caller_ip)
	),
	TP_fast_assign(
		__entry->caller_ip = caller_ip;
		__entry->nr_to_scan = sc->nr_to_scan;
		__entry->dev = mp->m_super->s_dev;
	),
	TP_printk(" dev %d:%d nr_to_scan %lu caller %pS ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->nr_to_scan,
		  __entry->caller_ip)
);
2013-11-01 08:27:19 +04:00
/* Event class that records a device and an allocation group number. */
DECLARE_EVENT_CLASS(xfs_ag_class,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
	TP_ARGS(mp, agno),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->agno = agno;
	),
	TP_printk(" dev %d:%d agno 0x%x ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno)
);
/* Instantiate one xfs_ag_class tracepoint called @name. */
#define DEFINE_AG_EVENT(name)	\
DEFINE_EVENT(xfs_ag_class, name,	\
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),	\
	TP_ARGS(mp, agno))
DEFINE_AG_EVENT(xfs_read_agf);
DEFINE_AG_EVENT(xfs_alloc_read_agf);
DEFINE_AG_EVENT(xfs_read_agi);
DEFINE_AG_EVENT(xfs_ialloc_read_agi);
2009-12-15 02:14:59 +03:00
/*
 * Same payload as xfs_attr_list_class, extended with the hashval and
 * "before" pointer of the da-btree node entry being descended into (both
 * converted from big-endian).
 */
TRACE_EVENT(xfs_attr_list_node_descend,
	TP_PROTO(struct xfs_attr_list_context *ctx,
		 struct xfs_da_node_entry *btree),
	TP_ARGS(ctx, btree),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(u32, hashval)
		__field(u32, blkno)
		__field(u32, offset)
		__field(void *, buffer)
		__field(int, bufsize)
		__field(int, count)
		__field(int, firstu)
		__field(int, dupcnt)
		__field(unsigned int, attr_filter)
		__field(u32, bt_hashval)
		__field(u32, bt_before)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
		__entry->ino = ctx->dp->i_ino;
		__entry->hashval = ctx->cursor.hashval;
		__entry->blkno = ctx->cursor.blkno;
		__entry->offset = ctx->cursor.offset;
		__entry->buffer = ctx->buffer;
		__entry->bufsize = ctx->bufsize;
		__entry->count = ctx->count;
		__entry->firstu = ctx->firstu;
		/*
		 * dupcnt is printed below, so it has to be recorded here;
		 * previously the field was never assigned.
		 * NOTE(review): assumes the context carries a dupcnt member.
		 */
		__entry->dupcnt = ctx->dupcnt;
		__entry->attr_filter = ctx->attr_filter;
		__entry->bt_hashval = be32_to_cpu(btree->hashval);
		__entry->bt_before = be32_to_cpu(btree->before);
	),
	TP_printk(" dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
		  " buffer %p size %u count %u firstu %u filter %s "
		  " node hashval %u, node before %u ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __entry->hashval,
		  __entry->blkno,
		  __entry->offset,
		  __entry->dupcnt,
		  __entry->buffer,
		  __entry->bufsize,
		  __entry->count,
		  __entry->firstu,
		  __print_flags(__entry->attr_filter, " | ",
				XFS_ATTR_FILTER_FLAGS),
		  __entry->bt_hashval,
		  __entry->bt_before)
);
2009-12-21 17:03:03 +03:00
DECLARE_EVENT_CLASS ( xfs_bmap_class ,
2017-11-03 20:34:43 +03:00
TP_PROTO ( struct xfs_inode * ip , struct xfs_iext_cursor * cur , int state ,
2009-12-21 17:03:03 +03:00
unsigned long caller_ip ) ,
2017-11-03 20:34:43 +03:00
TP_ARGS ( ip , cur , state , caller_ip ) ,
2009-12-21 17:03:03 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
2019-10-24 23:26:59 +03:00
__field ( void * , leaf )
__field ( int , pos )
2009-12-21 17:03:03 +03:00
__field ( xfs_fileoff_t , startoff )
__field ( xfs_fsblock_t , startblock )
__field ( xfs_filblks_t , blockcount )
__field ( xfs_exntst_t , state )
__field ( int , bmap_state )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
2016-10-03 19:11:32 +03:00
struct xfs_ifork * ifp ;
2009-12-21 17:03:03 +03:00
struct xfs_bmbt_irec r ;
2016-10-03 19:11:32 +03:00
ifp = xfs_iext_state_to_fork ( ip , state ) ;
2017-11-03 20:34:43 +03:00
xfs_iext_get_extent ( ifp , cur , & r ) ;
2009-12-21 17:03:03 +03:00
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2017-11-03 20:34:46 +03:00
__entry - > leaf = cur - > leaf ;
__entry - > pos = cur - > pos ;
2009-12-21 17:03:03 +03:00
__entry - > startoff = r . br_startoff ;
__entry - > startblock = r . br_startblock ;
__entry - > blockcount = r . br_blockcount ;
__entry - > state = r . br_state ;
__entry - > bmap_state = state ;
__entry - > caller_ip = caller_ip ;
) ,
2018-01-09 22:43:36 +03:00
TP_printk ( " dev %d:%d ino 0x%llx state %s cur %p/%d "
2021-08-17 23:00:13 +03:00
" fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__print_flags ( __entry - > bmap_state , " | " , XFS_BMAP_EXT_FLAGS ) ,
2017-11-03 20:34:46 +03:00
__entry - > leaf ,
__entry - > pos ,
2009-12-21 17:03:03 +03:00
__entry - > startoff ,
2017-06-16 21:00:05 +03:00
( int64_t ) __entry - > startblock ,
2009-12-21 17:03:03 +03:00
__entry - > blockcount ,
__entry - > state ,
( char * ) __entry - > caller_ip )
)
2009-12-15 02:14:59 +03:00
# define DEFINE_BMAP_EVENT(name) \
2009-12-21 17:03:03 +03:00
DEFINE_EVENT ( xfs_bmap_class , name , \
2017-11-03 20:34:43 +03:00
TP_PROTO ( struct xfs_inode * ip , struct xfs_iext_cursor * cur , int state , \
2009-12-15 02:14:59 +03:00
unsigned long caller_ip ) , \
2017-11-03 20:34:43 +03:00
TP_ARGS ( ip , cur , state , caller_ip ) )
2017-11-03 20:34:46 +03:00
DEFINE_BMAP_EVENT ( xfs_iext_insert ) ;
2009-12-15 02:14:59 +03:00
DEFINE_BMAP_EVENT ( xfs_iext_remove ) ;
DEFINE_BMAP_EVENT ( xfs_bmap_pre_update ) ;
DEFINE_BMAP_EVENT ( xfs_bmap_post_update ) ;
2017-10-19 21:06:29 +03:00
DEFINE_BMAP_EVENT ( xfs_read_extent ) ;
DEFINE_BMAP_EVENT ( xfs_write_extent ) ;
2009-12-15 02:14:59 +03:00
2009-12-21 17:03:03 +03:00
/*
 * Event class for buffer operations.  Records the buffer's disk address and
 * length, hold and pin counts, the semaphore count, the buffer's own flags,
 * its ops pointer and the caller address.
 */
DECLARE_EVENT_CLASS(xfs_buf_class,
	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
	TP_ARGS(bp, caller_ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_daddr_t, bno)
		__field(int, nblks)
		__field(int, hold)
		__field(int, pincount)
		__field(unsigned, lockval)
		__field(unsigned, flags)
		__field(unsigned long, caller_ip)
		__field(const void *, buf_ops)
	),
	TP_fast_assign(
		__entry->dev = bp->b_target->bt_dev;
		__entry->bno = xfs_buf_daddr(bp);
		__entry->nblks = bp->b_length;
		__entry->hold = atomic_read(&bp->b_hold);
		__entry->pincount = atomic_read(&bp->b_pin_count);
		__entry->lockval = bp->b_sema.count;
		__entry->flags = bp->b_flags;
		__entry->caller_ip = caller_ip;
		__entry->buf_ops = bp->b_ops;
	),
	TP_printk(" dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
		  " lock %d flags %s bufops %pS caller %pS ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long long)__entry->bno,
		  __entry->nblks,
		  __entry->hold,
		  __entry->pincount,
		  __entry->lockval,
		  __print_flags(__entry->flags, " | ", XFS_BUF_FLAGS),
		  __entry->buf_ops,
		  (void *)__entry->caller_ip)
)
2009-12-21 17:03:03 +03:00
/* Instantiate one xfs_buf_class tracepoint called @name. */
#define DEFINE_BUF_EVENT(name) \
DEFINE_EVENT(xfs_buf_class, name, \
	TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
	TP_ARGS(bp, caller_ip))
DEFINE_BUF_EVENT(xfs_buf_init);
DEFINE_BUF_EVENT(xfs_buf_free);
DEFINE_BUF_EVENT(xfs_buf_hold);
DEFINE_BUF_EVENT(xfs_buf_rele);
DEFINE_BUF_EVENT(xfs_buf_iodone);
DEFINE_BUF_EVENT(xfs_buf_submit);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
DEFINE_BUF_EVENT(xfs_buf_trylock);
DEFINE_BUF_EVENT(xfs_buf_unlock);
DEFINE_BUF_EVENT(xfs_buf_iowait);
DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.
This is now easily doable given that we have very few places left that write
delwri buffers:
- log recovery:
Only done at mount time, and already forcing out the buffers
synchronously using xfs_flush_buftarg
- quotacheck:
Same story.
- dquot reclaim:
Writes out dirty dquots on the LRU under memory pressure. We might
want to look into doing more of this via xfsaild, but it's already
more optimal than the synchronous inode reclaim that writes each
buffer synchronously.
- xfsaild:
This is the main beneficiary of the change. By keeping a local list
of buffers to write we reduce latency of writing out buffers, and
more importantly we can remove all the delwri list promotions which
were hitting the buffer cache hard under sustained metadata loads.
The implementation is very straightforward - xfs_buf_delwri_queue now gets
a new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwri_submit or
xfs_buf_delwri_submit_nowait. Buffers that already are on a delwri list are
skipped in xfs_buf_delwri_queue, assuming they already are on another delwri
list. The biggest change to pass down the buffer list was done to the AIL
pushing. Now that we operate on buffers the trylock, push and pushbuf log
item methods are merged into a single push routine, which tries to lock the
item, and if possible add the buffer that needs writeback to the buffer list.
This leads to much simpler code than the previous split but requires the
individual IOP_PUSH instances to unlock and reacquire the AIL around calls
to blocking routines.
Given that xfsailds now also handle writing out buffers, the conditions for
log forcing and the sleep times needed some small changes. The most
important one is that we consider an AIL busy as long as we still have buffers
to push, and the other one is that we do increment the pushed LSN for
buffers that are under flushing at this moment, but still count them towards
the stuck items for restart purposes. Without this we could hammer on stuck
items without ever forcing the log and not make progress under heavy random
delete workloads on fast flash storage devices.
[ Dave Chinner:
- rebase on previous patches.
- improved comments for XBF_DELWRI_Q handling
- fix XBF_ASYNC handling in queue submission (test 106 failure)
- rename delwri submit function buffer list parameters for clarity
- xfs_efd_item_push() should return XFS_ITEM_PINNED ]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-04-23 09:58:39 +04:00
/* Remaining tracepoints that share the xfs_buf_class payload. */
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
DEFINE_BUF_EVENT(xfs_reset_dqcounts);
/* pass flags explicitly */
2009-12-21 17:03:03 +03:00
DECLARE_EVENT_CLASS ( xfs_buf_flags_class ,
TP_PROTO ( struct xfs_buf * bp , unsigned flags , unsigned long caller_ip ) ,
TP_ARGS ( bp , flags , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_daddr_t , bno )
2021-08-17 23:15:53 +03:00
__field ( unsigned int , length )
2009-12-21 17:03:03 +03:00
__field ( int , hold )
__field ( int , pincount )
__field ( unsigned , lockval )
__field ( unsigned , flags )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = bp - > b_target - > bt_dev ;
2021-08-19 04:47:05 +03:00
__entry - > bno = xfs_buf_daddr ( bp ) ;
2021-08-17 23:15:53 +03:00
__entry - > length = bp - > b_length ;
2009-12-21 17:03:03 +03:00
__entry - > flags = flags ;
__entry - > hold = atomic_read ( & bp - > b_hold ) ;
__entry - > pincount = atomic_read ( & bp - > b_pin_count ) ;
2011-07-08 16:36:19 +04:00
__entry - > lockval = bp - > b_sema . count ;
2009-12-21 17:03:03 +03:00
__entry - > caller_ip = caller_ip ;
) ,
2021-08-17 23:15:53 +03:00
TP_printk ( " dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
2018-01-09 22:46:05 +03:00
" lock %d flags %s caller %pS " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
( unsigned long long ) __entry - > bno ,
2021-08-17 23:15:53 +03:00
__entry - > length ,
2009-12-21 17:03:03 +03:00
__entry - > hold ,
__entry - > pincount ,
__entry - > lockval ,
__print_flags ( __entry - > flags , " | " , XFS_BUF_FLAGS ) ,
( void * ) __entry - > caller_ip )
2009-12-15 02:14:59 +03:00
)
2009-12-21 17:03:03 +03:00
/* Instantiate one xfs_buf_flags_class tracepoint called @name. */
#define DEFINE_BUF_FLAGS_EVENT(name) \
DEFINE_EVENT(xfs_buf_flags_class, name, \
	TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
	TP_ARGS(bp, flags, caller_ip))
DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
TRACE_EVENT ( xfs_buf_ioerror ,
2018-01-08 21:51:02 +03:00
TP_PROTO ( struct xfs_buf * bp , int error , xfs_failaddr_t caller_ip ) ,
2009-12-15 02:14:59 +03:00
TP_ARGS ( bp , error , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_daddr_t , bno )
2021-08-17 23:15:53 +03:00
__field ( unsigned int , length )
2009-12-15 02:14:59 +03:00
__field ( unsigned , flags )
__field ( int , hold )
__field ( int , pincount )
__field ( unsigned , lockval )
__field ( int , error )
2018-01-08 21:51:02 +03:00
__field ( xfs_failaddr_t , caller_ip )
2009-12-15 02:14:59 +03:00
) ,
TP_fast_assign (
__entry - > dev = bp - > b_target - > bt_dev ;
2021-08-19 04:47:05 +03:00
__entry - > bno = xfs_buf_daddr ( bp ) ;
2021-08-17 23:15:53 +03:00
__entry - > length = bp - > b_length ;
2009-12-15 02:14:59 +03:00
__entry - > hold = atomic_read ( & bp - > b_hold ) ;
__entry - > pincount = atomic_read ( & bp - > b_pin_count ) ;
2011-07-08 16:36:19 +04:00
__entry - > lockval = bp - > b_sema . count ;
2009-12-15 02:14:59 +03:00
__entry - > error = error ;
__entry - > flags = bp - > b_flags ;
__entry - > caller_ip = caller_ip ;
) ,
2021-08-17 23:15:53 +03:00
TP_printk ( " dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
2018-01-08 21:51:02 +03:00
" lock %d error %d flags %s caller %pS " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
( unsigned long long ) __entry - > bno ,
2021-08-17 23:15:53 +03:00
__entry - > length ,
2009-12-15 02:14:59 +03:00
__entry - > hold ,
__entry - > pincount ,
__entry - > lockval ,
__entry - > error ,
__print_flags ( __entry - > flags , " | " , XFS_BUF_FLAGS ) ,
( void * ) __entry - > caller_ip )
) ;
2009-12-21 17:03:03 +03:00
/*
 * Event class for buffer log items.  Records the state of the underlying
 * buffer (bip->bli_buf) alongside the log item's recursion count, reference
 * count, bli flags and generic log item flags.
 */
DECLARE_EVENT_CLASS(xfs_buf_item_class,
	TP_PROTO(struct xfs_buf_log_item *bip),
	TP_ARGS(bip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_daddr_t, buf_bno)
		__field(unsigned int, buf_len)
		__field(int, buf_hold)
		__field(int, buf_pincount)
		__field(int, buf_lockval)
		__field(unsigned, buf_flags)
		__field(unsigned, bli_recur)
		__field(int, bli_refcount)
		__field(unsigned, bli_flags)
		__field(unsigned long, li_flags)
	),
	TP_fast_assign(
		__entry->dev = bip->bli_buf->b_target->bt_dev;
		__entry->bli_flags = bip->bli_flags;
		__entry->bli_recur = bip->bli_recur;
		__entry->bli_refcount = atomic_read(&bip->bli_refcount);
		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
		__entry->buf_len = bip->bli_buf->b_length;
		__entry->buf_flags = bip->bli_buf->b_flags;
		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
		__entry->buf_lockval = bip->bli_buf->b_sema.count;
		__entry->li_flags = bip->bli_item.li_flags;
	),
	TP_printk(" dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
		  " lock %d flags %s recur %d refcount %d bliflags %s "
		  " liflags %s ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long long)__entry->buf_bno,
		  __entry->buf_len,
		  __entry->buf_hold,
		  __entry->buf_pincount,
		  __entry->buf_lockval,
		  __print_flags(__entry->buf_flags, " | ", XFS_BUF_FLAGS),
		  __entry->bli_recur,
		  __entry->bli_refcount,
		  __print_flags(__entry->bli_flags, " | ", XFS_BLI_FLAGS),
		  __print_flags(__entry->li_flags, " | ", XFS_LI_FLAGS))
)
2009-12-21 17:03:03 +03:00
/* Instantiate one xfs_buf_item_class tracepoint called @name. */
#define DEFINE_BUF_ITEM_EVENT(name) \
DEFINE_EVENT(xfs_buf_item_class, name, \
	TP_PROTO(struct xfs_buf_log_item *bip), \
	TP_ARGS(bip))
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
2014-04-23 01:11:52 +04:00
DECLARE_EVENT_CLASS ( xfs_filestream_class ,
2023-02-13 01:14:56 +03:00
TP_PROTO ( struct xfs_perag * pag , xfs_ino_t ino ) ,
TP_ARGS ( pag , ino ) ,
2014-04-23 01:11:52 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_agnumber_t , agno )
__field ( int , streams )
) ,
TP_fast_assign (
2023-02-13 01:14:56 +03:00
__entry - > dev = pag - > pag_mount - > m_super - > s_dev ;
2018-04-09 20:23:39 +03:00
__entry - > ino = ino ;
2023-02-13 01:14:56 +03:00
__entry - > agno = pag - > pag_agno ;
__entry - > streams = atomic_read ( & pag - > pagf_fstrms ) ;
2014-04-23 01:11:52 +04:00
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d ino 0x%llx agno 0x%x streams %d " ,
2014-04-23 01:11:52 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > agno ,
__entry - > streams )
)
# define DEFINE_FILESTREAM_EVENT(name) \
DEFINE_EVENT ( xfs_filestream_class , name , \
2023-02-13 01:14:56 +03:00
TP_PROTO ( struct xfs_perag * pag , xfs_ino_t ino ) , \
TP_ARGS ( pag , ino ) )
2014-04-23 01:11:52 +04:00
DEFINE_FILESTREAM_EVENT ( xfs_filestream_free ) ;
DEFINE_FILESTREAM_EVENT ( xfs_filestream_lookup ) ;
DEFINE_FILESTREAM_EVENT ( xfs_filestream_scan ) ;
TRACE_EVENT ( xfs_filestream_pick ,
2023-02-13 01:14:56 +03:00
TP_PROTO ( struct xfs_perag * pag , xfs_ino_t ino , xfs_extlen_t free ) ,
TP_ARGS ( pag , ino , free ) ,
2014-04-23 01:11:52 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_agnumber_t , agno )
__field ( int , streams )
__field ( xfs_extlen_t , free )
) ,
TP_fast_assign (
2023-02-13 01:14:56 +03:00
__entry - > dev = pag - > pag_mount - > m_super - > s_dev ;
__entry - > ino = ino ;
2023-02-13 01:14:56 +03:00
if ( pag ) {
__entry - > agno = pag - > pag_agno ;
__entry - > streams = atomic_read ( & pag - > pagf_fstrms ) ;
} else {
__entry - > agno = NULLAGNUMBER ;
__entry - > streams = 0 ;
}
2014-04-23 01:11:52 +04:00
__entry - > free = free ;
) ,
2023-02-13 01:14:56 +03:00
TP_printk ( " dev %d:%d ino 0x%llx agno 0x%x streams %d free %d " ,
2014-04-23 01:11:52 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > agno ,
__entry - > streams ,
2023-02-13 01:14:56 +03:00
__entry - > free )
2014-04-23 01:11:52 +04:00
) ;
2009-12-21 17:03:03 +03:00
/*
 * Event class for inode lock operations.  Records the inode, the lock
 * flags (decoded through XFS_LOCK_FLAGS) and the caller address.
 */
DECLARE_EVENT_CLASS(xfs_lock_class,
	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
		 unsigned long caller_ip),
	TP_ARGS(ip, lock_flags, caller_ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(int, lock_flags)
		__field(unsigned long, caller_ip)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(ip)->i_sb->s_dev;
		__entry->ino = ip->i_ino;
		__entry->lock_flags = lock_flags;
		__entry->caller_ip = caller_ip;
	),
	TP_printk(" dev %d:%d ino 0x%llx flags %s caller %pS ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __print_flags(__entry->lock_flags, " | ", XFS_LOCK_FLAGS),
		  (void *)__entry->caller_ip)
)
2009-12-15 02:14:59 +03:00
# define DEFINE_LOCK_EVENT(name) \
2009-12-21 17:03:03 +03:00
DEFINE_EVENT ( xfs_lock_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_inode * ip , unsigned lock_flags , \
unsigned long caller_ip ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( ip , lock_flags , caller_ip ) )
2009-12-15 02:14:59 +03:00
DEFINE_LOCK_EVENT ( xfs_ilock ) ;
DEFINE_LOCK_EVENT ( xfs_ilock_nowait ) ;
DEFINE_LOCK_EVENT ( xfs_ilock_demote ) ;
DEFINE_LOCK_EVENT ( xfs_iunlock ) ;
2010-06-24 05:57:09 +04:00
DECLARE_EVENT_CLASS ( xfs_inode_class ,
2009-12-21 17:03:03 +03:00
TP_PROTO ( struct xfs_inode * ip ) ,
TP_ARGS ( ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
2021-08-06 21:05:39 +03:00
__field ( unsigned long , iflags )
2009-12-21 17:03:03 +03:00
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2021-08-06 21:05:39 +03:00
__entry - > iflags = ip - > i_flags ;
2009-12-21 17:03:03 +03:00
) ,
2021-08-06 21:05:39 +03:00
TP_printk ( " dev %d:%d ino 0x%llx iflags 0x%lx " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
2021-08-06 21:05:39 +03:00
__entry - > ino ,
__entry - > iflags )
2009-12-21 17:03:03 +03:00
)
2010-06-24 05:57:09 +04:00
# define DEFINE_INODE_EVENT(name) \
DEFINE_EVENT ( xfs_inode_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_inode * ip ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( ip ) )
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_iget_skip ) ;
2021-06-18 21:57:05 +03:00
DEFINE_INODE_EVENT ( xfs_iget_recycle ) ;
DEFINE_INODE_EVENT ( xfs_iget_recycle_fail ) ;
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_iget_hit ) ;
DEFINE_INODE_EVENT ( xfs_iget_miss ) ;
2009-12-15 02:14:59 +03:00
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_getattr ) ;
DEFINE_INODE_EVENT ( xfs_setattr ) ;
DEFINE_INODE_EVENT ( xfs_readlink ) ;
2013-06-18 00:35:57 +04:00
DEFINE_INODE_EVENT ( xfs_inactive_symlink ) ;
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_alloc_file_space ) ;
DEFINE_INODE_EVENT ( xfs_free_file_space ) ;
2014-04-14 12:15:11 +04:00
DEFINE_INODE_EVENT ( xfs_zero_file_space ) ;
2014-02-24 03:58:19 +04:00
DEFINE_INODE_EVENT ( xfs_collapse_file_space ) ;
2015-03-25 07:08:56 +03:00
DEFINE_INODE_EVENT ( xfs_insert_file_space ) ;
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_readdir ) ;
2010-07-20 11:54:41 +04:00
# ifdef CONFIG_XFS_POSIX_ACL
2011-07-23 19:37:31 +04:00
DEFINE_INODE_EVENT ( xfs_get_acl ) ;
2010-07-20 11:54:41 +04:00
# endif
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_vm_bmap ) ;
DEFINE_INODE_EVENT ( xfs_file_ioctl ) ;
DEFINE_INODE_EVENT ( xfs_file_compat_ioctl ) ;
DEFINE_INODE_EVENT ( xfs_ioctl_setattr ) ;
2011-10-02 18:25:16 +04:00
DEFINE_INODE_EVENT ( xfs_dir_fsync ) ;
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_file_fsync ) ;
DEFINE_INODE_EVENT ( xfs_destroy_inode ) ;
2012-06-07 01:01:28 +04:00
DEFINE_INODE_EVENT ( xfs_update_time ) ;
2010-06-24 05:57:09 +04:00
DEFINE_INODE_EVENT ( xfs_dquot_dqalloc ) ;
DEFINE_INODE_EVENT ( xfs_dquot_dqdetach ) ;
2012-11-06 18:50:38 +04:00
DEFINE_INODE_EVENT ( xfs_inode_set_eofblocks_tag ) ;
DEFINE_INODE_EVENT ( xfs_inode_clear_eofblocks_tag ) ;
2012-11-06 18:50:42 +04:00
DEFINE_INODE_EVENT ( xfs_inode_free_eofblocks_invalid ) ;
2016-10-03 19:11:46 +03:00
DEFINE_INODE_EVENT ( xfs_inode_set_cowblocks_tag ) ;
DEFINE_INODE_EVENT ( xfs_inode_clear_cowblocks_tag ) ;
DEFINE_INODE_EVENT ( xfs_inode_free_cowblocks_invalid ) ;
2021-08-06 21:05:39 +03:00
DEFINE_INODE_EVENT ( xfs_inode_set_reclaimable ) ;
DEFINE_INODE_EVENT ( xfs_inode_reclaiming ) ;
DEFINE_INODE_EVENT ( xfs_inode_set_need_inactive ) ;
DEFINE_INODE_EVENT ( xfs_inode_inactivating ) ;
2012-11-06 18:50:38 +04:00
2018-12-19 01:32:29 +03:00
/*
 * Enum values handed to ftrace's __print_symbolic must each be wrapped in
 * TRACE_DEFINE_ENUM so that the value can be encoded in the ftrace ring
 * buffer.  This requirement is only mentioned in the ftrace sample code.
 */
TRACE_DEFINE_ENUM(PE_SIZE_PTE);
TRACE_DEFINE_ENUM(PE_SIZE_PMD);
TRACE_DEFINE_ENUM(PE_SIZE_PUD);

TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
2017-08-29 20:08:41 +03:00
/*
 * Trace a page fault on an XFS file mapping: which inode faulted, the
 * page entry size of the fault (printed as PTE/PMD/PUD) and whether it
 * was a write fault.
 */
TRACE_EVENT(xfs_filemap_fault,
	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
		 bool write_fault),
	TP_ARGS(ip, pe_size, write_fault),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(enum page_entry_size, pe_size)
		__field(bool, write_fault)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(ip)->i_sb->s_dev;
		__entry->ino = ip->i_ino;
		__entry->pe_size = pe_size;
		__entry->write_fault = write_fault;
	),
	TP_printk(" dev %d:%d ino 0x%llx %s write_fault %d ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __print_symbolic(__entry->pe_size,
			{ PE_SIZE_PTE, " PTE " },
			{ PE_SIZE_PMD, " PMD " },
			{ PE_SIZE_PUD, " PUD " }),
		  __entry->write_fault)
)
2010-06-24 05:57:09 +04:00
DECLARE_EVENT_CLASS ( xfs_iref_class ,
2009-12-21 17:03:03 +03:00
TP_PROTO ( struct xfs_inode * ip , unsigned long caller_ip ) ,
TP_ARGS ( ip , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( int , count )
2010-03-08 03:24:07 +03:00
__field ( int , pincount )
2009-12-21 17:03:03 +03:00
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > count = atomic_read ( & VFS_I ( ip ) - > i_count ) ;
2010-03-08 03:24:07 +03:00
__entry - > pincount = atomic_read ( & ip - > i_pincount ) ;
2009-12-21 17:03:03 +03:00
__entry - > caller_ip = caller_ip ;
) ,
2018-01-09 22:46:05 +03:00
TP_printk ( " dev %d:%d ino 0x%llx count %d pincount %d caller %pS " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > count ,
2010-03-08 03:24:07 +03:00
__entry - > pincount ,
2009-12-21 17:03:03 +03:00
( char * ) __entry - > caller_ip )
2013-03-18 18:51:48 +04:00
)
/*
 * Trace a preallocation size computation for an inode: the block count,
 * the shift value and the writeio block count supplied by the caller
 * (printed under the label m_allocsize_blocks).
 */
TRACE_EVENT(xfs_iomap_prealloc_size,
	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
		 unsigned int writeio_blocks),
	TP_ARGS(ip, blocks, shift, writeio_blocks),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(xfs_fsblock_t, blocks)
		__field(int, shift)
		__field(unsigned int, writeio_blocks)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(ip)->i_sb->s_dev;
		__entry->ino = ip->i_ino;
		__entry->blocks = blocks;
		__entry->shift = shift;
		__entry->writeio_blocks = writeio_blocks;
	),
	TP_printk(" dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
		  " m_allocsize_blocks %u ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __entry->blocks,
		  __entry->shift,
		  __entry->writeio_blocks)
)
2015-05-29 02:18:32 +03:00
/*
 * Trace the two inode records about to be merged.
 *
 * @agino/@holemask describe the first record and @nagino/@nholemask the
 * second ("new_agino"/"new_holemask" in the output).  Each holemask is
 * stored from its own argument so both records are reported accurately.
 */
TRACE_EVENT(xfs_irec_merge_pre,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
		 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
	TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(xfs_agino_t, agino)
		__field(uint16_t, holemask)
		__field(xfs_agino_t, nagino)
		__field(uint16_t, nholemask)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->agno = agno;
		__entry->agino = agino;
		__entry->holemask = holemask;
		__entry->nagino = nagino;
		__entry->nholemask = nholemask;
	),
	TP_printk(" dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno,
		  __entry->agino,
		  __entry->holemask,
		  __entry->nagino,
		  __entry->nholemask)
)
/*
 * Trace an inode record after a merge: the AG number, the AG inode
 * number and the holemask passed in by the caller.
 */
TRACE_EVENT(xfs_irec_merge_post,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
		 uint16_t holemask),
	TP_ARGS(mp, agno, agino, holemask),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(xfs_agino_t, agino)
		__field(uint16_t, holemask)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->agno = agno;
		__entry->agino = agino;
		__entry->holemask = holemask;
	),
	TP_printk(" dev %d:%d agno 0x%x agino 0x%x holemask 0x%x ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno, __entry->agino, __entry->holemask)
)
2010-06-24 05:57:09 +04:00
# define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT ( xfs_iref_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_inode * ip , unsigned long caller_ip ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( ip , caller_ip ) )
2010-06-24 05:57:09 +04:00
DEFINE_IREF_EVENT ( xfs_irele ) ;
DEFINE_IREF_EVENT ( xfs_inode_pin ) ;
DEFINE_IREF_EVENT ( xfs_inode_unpin ) ;
DEFINE_IREF_EVENT ( xfs_inode_unpin_nowait ) ;
DECLARE_EVENT_CLASS ( xfs_namespace_class ,
2022-03-09 21:16:09 +03:00
TP_PROTO ( struct xfs_inode * dp , const struct xfs_name * name ) ,
2010-06-24 05:57:09 +04:00
TP_ARGS ( dp , name ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , dp_ino )
2012-02-28 15:01:40 +04:00
__field ( int , namelen )
2010-06-24 05:57:09 +04:00
__dynamic_array ( char , name , name - > len )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( dp ) - > i_sb - > s_dev ;
__entry - > dp_ino = dp - > i_ino ;
2012-02-28 15:01:40 +04:00
__entry - > namelen = name - > len ;
2010-06-24 05:57:09 +04:00
memcpy ( __get_str ( name ) , name - > name , name - > len ) ;
) ,
2012-02-28 15:01:40 +04:00
TP_printk ( " dev %d:%d dp ino 0x%llx name %.*s " ,
2010-06-24 05:57:09 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > dp_ino ,
2012-02-28 15:01:40 +04:00
__entry - > namelen ,
2010-06-24 05:57:09 +04:00
__get_str ( name ) )
)
# define DEFINE_NAMESPACE_EVENT(name) \
DEFINE_EVENT ( xfs_namespace_class , name , \
2022-03-09 21:16:09 +03:00
TP_PROTO ( struct xfs_inode * dp , const struct xfs_name * name ) , \
2010-06-24 05:57:09 +04:00
TP_ARGS ( dp , name ) )
DEFINE_NAMESPACE_EVENT ( xfs_remove ) ;
DEFINE_NAMESPACE_EVENT ( xfs_link ) ;
DEFINE_NAMESPACE_EVENT ( xfs_lookup ) ;
DEFINE_NAMESPACE_EVENT ( xfs_create ) ;
DEFINE_NAMESPACE_EVENT ( xfs_symlink ) ;
2010-03-08 03:24:07 +03:00
2010-06-24 05:57:09 +04:00
/*
 * Trace a rename: source and target directory inode numbers plus both
 * names.  Each name is copied with its own length and printed with a
 * length-bounded %.*s.  The device is taken from the source directory.
 */
TRACE_EVENT(xfs_rename,
	TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
		 struct xfs_name *src_name, struct xfs_name *target_name),
	TP_ARGS(src_dp, target_dp, src_name, target_name),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, src_dp_ino)
		__field(xfs_ino_t, target_dp_ino)
		__field(int, src_namelen)
		__field(int, target_namelen)
		__dynamic_array(char, src_name, src_name->len)
		__dynamic_array(char, target_name, target_name->len)
	),
	TP_fast_assign(
		__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
		__entry->src_dp_ino = src_dp->i_ino;
		__entry->target_dp_ino = target_dp->i_ino;
		__entry->src_namelen = src_name->len;
		__entry->target_namelen = target_name->len;
		memcpy(__get_str(src_name), src_name->name, src_name->len);
		memcpy(__get_str(target_name), target_name->name,
		       target_name->len);
	),
	TP_printk(" dev %d:%d src dp ino 0x%llx target dp ino 0x%llx "
		  " src name %.*s target name %.*s ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->src_dp_ino,
		  __entry->target_dp_ino,
		  __entry->src_namelen, __get_str(src_name),
		  __entry->target_namelen, __get_str(target_name))
)
2009-12-15 02:14:59 +03:00
2009-12-21 17:03:03 +03:00
/*
 * Event class dumping the state of a dquot.
 *
 * For each of the three resources (blocks, realtime blocks, inodes) the
 * event records the reserved amount, the current count and the hard and
 * soft limits, alongside the dquot id, type, flags and reference count.
 * The format string carries no stray bracket at its end.
 */
DECLARE_EVENT_CLASS(xfs_dquot_class,
	TP_PROTO(struct xfs_dquot *dqp),
	TP_ARGS(dqp),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(u32, id)
		__field(xfs_dqtype_t, type)
		__field(unsigned, flags)
		__field(unsigned, nrefs)
		__field(unsigned long long, res_bcount)
		__field(unsigned long long, res_rtbcount)
		__field(unsigned long long, res_icount)

		__field(unsigned long long, bcount)
		__field(unsigned long long, rtbcount)
		__field(unsigned long long, icount)

		__field(unsigned long long, blk_hardlimit)
		__field(unsigned long long, blk_softlimit)
		__field(unsigned long long, rtb_hardlimit)
		__field(unsigned long long, rtb_softlimit)
		__field(unsigned long long, ino_hardlimit)
		__field(unsigned long long, ino_softlimit)
	),
	TP_fast_assign(
		__entry->dev = dqp->q_mount->m_super->s_dev;
		__entry->id = dqp->q_id;
		__entry->type = dqp->q_type;
		__entry->flags = dqp->q_flags;
		__entry->nrefs = dqp->q_nrefs;

		/* Reservations. */
		__entry->res_bcount = dqp->q_blk.reserved;
		__entry->res_rtbcount = dqp->q_rtb.reserved;
		__entry->res_icount = dqp->q_ino.reserved;

		/* Current usage counts. */
		__entry->bcount = dqp->q_blk.count;
		__entry->rtbcount = dqp->q_rtb.count;
		__entry->icount = dqp->q_ino.count;

		/* Hard and soft limits. */
		__entry->blk_hardlimit = dqp->q_blk.hardlimit;
		__entry->blk_softlimit = dqp->q_blk.softlimit;
		__entry->rtb_hardlimit = dqp->q_rtb.hardlimit;
		__entry->rtb_softlimit = dqp->q_rtb.softlimit;
		__entry->ino_hardlimit = dqp->q_ino.hardlimit;
		__entry->ino_softlimit = dqp->q_ino.softlimit;
	),
	TP_printk(" dev %d:%d id 0x%x type %s flags %s nrefs %u "
		  " res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
		  " bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
		  " rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
		  " icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->id,
		  __print_flags(__entry->type, " | ", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, " | ", XFS_DQFLAG_STRINGS),
		  __entry->nrefs,
		  __entry->res_bcount,
		  __entry->res_rtbcount,
		  __entry->res_icount,
		  __entry->bcount,
		  __entry->blk_hardlimit,
		  __entry->blk_softlimit,
		  __entry->rtbcount,
		  __entry->rtb_hardlimit,
		  __entry->rtb_softlimit,
		  __entry->icount,
		  __entry->ino_hardlimit,
		  __entry->ino_softlimit)
)
2009-12-21 17:03:03 +03:00
/* Instantiate the xfs_dquot_class tracepoints. */
#define DEFINE_DQUOT_EVENT(name) \
DEFINE_EVENT(xfs_dquot_class, name, \
	TP_PROTO(struct xfs_dquot *dqp), \
	TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
DEFINE_DQUOT_EVENT(xfs_dqread);
DEFINE_DQUOT_EVENT(xfs_dqread_fail);
DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
DEFINE_DQUOT_EVENT(xfs_dqget_dup);
DEFINE_DQUOT_EVENT(xfs_dqput);
DEFINE_DQUOT_EVENT(xfs_dqput_free);
DEFINE_DQUOT_EVENT(xfs_dqrele);
DEFINE_DQUOT_EVENT(xfs_dqflush);
DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
/*
 * Trace a modification of a dquot within a transaction: which dquot
 * (id, type, flags), which field is being changed (decoded through
 * XFS_QMOPT_FLAGS) and the signed delta being applied.  The device is
 * taken from the transaction's mount.
 */
TRACE_EVENT(xfs_trans_mod_dquot,
	TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
		 unsigned int field, int64_t delta),
	TP_ARGS(tp, dqp, field, delta),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_dqtype_t, type)
		__field(unsigned int, flags)
		__field(unsigned int, dqid)
		__field(unsigned int, field)
		__field(int64_t, delta)
	),
	TP_fast_assign(
		__entry->dev = tp->t_mountp->m_super->s_dev;
		__entry->type = dqp->q_type;
		__entry->flags = dqp->q_flags;
		__entry->dqid = dqp->q_id;
		__entry->field = field;
		__entry->delta = delta;
	),
	TP_printk(" dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->dqid,
		  __print_flags(__entry->type, " | ", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, " | ", XFS_DQFLAG_STRINGS),
		  __print_flags(__entry->field, " | ", XFS_QMOPT_FLAGS),
		  __entry->delta)
);
/*
 * Event class dumping a per-transaction dquot record (struct xfs_dqtrx).
 *
 * Identifies the dquot the record points at (id, type, flags) and copies
 * the record's reservation, reservation-used and delta counters for
 * blocks, realtime blocks and inodes.
 */
DECLARE_EVENT_CLASS(xfs_dqtrx_class,
	TP_PROTO(struct xfs_dqtrx *qtrx),
	TP_ARGS(qtrx),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_dqtype_t, type)
		__field(unsigned int, flags)
		__field(u32, dqid)

		__field(uint64_t, blk_res)
		__field(int64_t, bcount_delta)
		__field(int64_t, delbcnt_delta)

		__field(uint64_t, rtblk_res)
		__field(uint64_t, rtblk_res_used)
		__field(int64_t, rtbcount_delta)
		__field(int64_t, delrtb_delta)

		__field(uint64_t, ino_res)
		__field(uint64_t, ino_res_used)
		__field(int64_t, icount_delta)
	),
	TP_fast_assign(
		/* Identity of the dquot this record refers to. */
		__entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev;
		__entry->type = qtrx->qt_dquot->q_type;
		__entry->flags = qtrx->qt_dquot->q_flags;
		__entry->dqid = qtrx->qt_dquot->q_id;

		/* Data device blocks. */
		__entry->blk_res = qtrx->qt_blk_res;
		__entry->bcount_delta = qtrx->qt_bcount_delta;
		__entry->delbcnt_delta = qtrx->qt_delbcnt_delta;

		/* Realtime blocks. */
		__entry->rtblk_res = qtrx->qt_rtblk_res;
		__entry->rtblk_res_used = qtrx->qt_rtblk_res_used;
		__entry->rtbcount_delta = qtrx->qt_rtbcount_delta;
		__entry->delrtb_delta = qtrx->qt_delrtb_delta;

		/* Inodes. */
		__entry->ino_res = qtrx->qt_ino_res;
		__entry->ino_res_used = qtrx->qt_ino_res_used;
		__entry->icount_delta = qtrx->qt_icount_delta;
	),
	TP_printk(" dev %d:%d dquot id 0x%x type %s flags %s "
		  " blk_res %llu bcount_delta %lld delbcnt_delta %lld "
		  " rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
		  " ino_res %llu ino_res_used %llu icount_delta %lld ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->dqid,
		  __print_flags(__entry->type, " | ", XFS_DQTYPE_STRINGS),
		  __print_flags(__entry->flags, " | ", XFS_DQFLAG_STRINGS),
		  __entry->blk_res,
		  __entry->bcount_delta,
		  __entry->delbcnt_delta,
		  __entry->rtblk_res,
		  __entry->rtblk_res_used,
		  __entry->rtbcount_delta,
		  __entry->delrtb_delta,
		  __entry->ino_res,
		  __entry->ino_res_used,
		  __entry->icount_delta)
)
/* Instantiate the xfs_dqtrx_class tracepoints. */
#define DEFINE_DQTRX_EVENT(name) \
DEFINE_EVENT(xfs_dqtrx_class, name, \
	TP_PROTO(struct xfs_dqtrx *qtrx), \
	TP_ARGS(qtrx))
DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);
2009-12-15 02:14:59 +03:00
2009-12-21 17:03:03 +03:00
DECLARE_EVENT_CLASS ( xfs_loggrant_class ,
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xlog_ticket * tic ) ,
2009-12-21 17:03:03 +03:00
TP_ARGS ( log , tic ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( char , ocnt )
__field ( char , cnt )
__field ( int , curr_res )
__field ( int , unit_res )
__field ( unsigned int , flags )
2010-12-21 04:02:25 +03:00
__field ( int , reserveq )
__field ( int , writeq )
2009-12-21 17:03:03 +03:00
__field ( int , grant_reserve_cycle )
__field ( int , grant_reserve_bytes )
__field ( int , grant_write_cycle )
__field ( int , grant_write_bytes )
__field ( int , curr_cycle )
__field ( int , curr_block )
__field ( xfs_lsn_t , tail_lsn )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > ocnt = tic - > t_ocnt ;
__entry - > cnt = tic - > t_cnt ;
__entry - > curr_res = tic - > t_curr_res ;
__entry - > unit_res = tic - > t_unit_res ;
__entry - > flags = tic - > t_flags ;
2012-02-20 06:31:25 +04:00
__entry - > reserveq = list_empty ( & log - > l_reserve_head . waiters ) ;
__entry - > writeq = list_empty ( & log - > l_write_head . waiters ) ;
xlog_crack_grant_head ( & log - > l_reserve_head . grant ,
2010-12-21 04:08:20 +03:00
& __entry - > grant_reserve_cycle ,
& __entry - > grant_reserve_bytes ) ;
2012-02-20 06:31:25 +04:00
xlog_crack_grant_head ( & log - > l_write_head . grant ,
2010-12-21 04:08:20 +03:00
& __entry - > grant_write_cycle ,
& __entry - > grant_write_bytes ) ;
2009-12-21 17:03:03 +03:00
__entry - > curr_cycle = log - > l_curr_cycle ;
__entry - > curr_block = log - > l_curr_block ;
2010-12-21 04:28:39 +03:00
__entry - > tail_lsn = atomic64_read ( & log - > l_tail_lsn ) ;
2009-12-21 17:03:03 +03:00
) ,
2016-04-06 02:20:36 +03:00
TP_printk ( " dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
2010-12-21 04:02:25 +03:00
" t_unit_res %u t_flags %s reserveq %s "
" writeq %s grant_reserve_cycle %d "
2009-12-21 17:03:03 +03:00
" grant_reserve_bytes %d grant_write_cycle %d "
" grant_write_bytes %d curr_cycle %d curr_block %d "
" tail_cycle %d tail_block %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ocnt ,
__entry - > cnt ,
__entry - > curr_res ,
__entry - > unit_res ,
__print_flags ( __entry - > flags , " | " , XLOG_TIC_FLAGS ) ,
2010-12-21 04:02:25 +03:00
__entry - > reserveq ? " empty " : " active " ,
__entry - > writeq ? " empty " : " active " ,
2009-12-21 17:03:03 +03:00
__entry - > grant_reserve_cycle ,
__entry - > grant_reserve_bytes ,
__entry - > grant_write_cycle ,
__entry - > grant_write_bytes ,
__entry - > curr_cycle ,
__entry - > curr_block ,
CYCLE_LSN ( __entry - > tail_lsn ) ,
BLOCK_LSN ( __entry - > tail_lsn )
)
)
2009-12-15 02:14:59 +03:00
2009-12-21 17:03:03 +03:00
# define DEFINE_LOGGRANT_EVENT(name) \
DEFINE_EVENT ( xfs_loggrant_class , name , \
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xlog_ticket * tic ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( log , tic ) )
2009-12-15 02:14:59 +03:00
DEFINE_LOGGRANT_EVENT ( xfs_log_umount_write ) ;
2011-11-28 12:17:36 +04:00
DEFINE_LOGGRANT_EVENT ( xfs_log_grant_sleep ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_grant_wake ) ;
2010-12-21 04:29:01 +03:00
DEFINE_LOGGRANT_EVENT ( xfs_log_grant_wake_up ) ;
2012-02-20 06:31:31 +04:00
DEFINE_LOGGRANT_EVENT ( xfs_log_reserve ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_reserve_exit ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_regrant ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_regrant_exit ) ;
2020-03-26 04:18:23 +03:00
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_regrant ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_regrant_exit ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_regrant_sub ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_ungrant ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_ungrant_sub ) ;
DEFINE_LOGGRANT_EVENT ( xfs_log_ticket_ungrant_exit ) ;
2020-03-25 06:10:27 +03:00
DEFINE_LOGGRANT_EVENT ( xfs_log_cil_wait ) ;
2009-12-15 02:14:59 +03:00
2011-10-11 19:14:11 +04:00
/*
 * Event class for log item tracing.
 *
 * Records the log item's address, its type (decoded through
 * XFS_LI_TYPE_DESC), its flags (decoded through XFS_LI_FLAGS) and its
 * LSN, printed as cycle/block.  The device is reached via the item's log.
 */
DECLARE_EVENT_CLASS(xfs_log_item_class,
	TP_PROTO(struct xfs_log_item *lip),
	TP_ARGS(lip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(void *, lip)
		__field(uint, type)
		__field(unsigned long, flags)
		__field(xfs_lsn_t, lsn)
	),
	TP_fast_assign(
		__entry->dev = lip->li_log->l_mp->m_super->s_dev;
		__entry->lip = lip;
		__entry->type = lip->li_type;
		__entry->flags = lip->li_flags;
		__entry->lsn = lip->li_lsn;
	),
	TP_printk(" dev %d:%d lip %p lsn %d/%d type %s flags %s ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->lip,
		  CYCLE_LSN(__entry->lsn),
		  BLOCK_LSN(__entry->lsn),
		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
		  __print_flags(__entry->flags, " | ", XFS_LI_FLAGS))
)
2012-04-24 10:33:31 +04:00
TRACE_EVENT ( xfs_log_force ,
2016-04-06 02:46:30 +03:00
TP_PROTO ( struct xfs_mount * mp , xfs_lsn_t lsn , unsigned long caller_ip ) ,
TP_ARGS ( mp , lsn , caller_ip ) ,
2012-04-24 10:33:31 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_lsn_t , lsn )
2016-04-06 02:46:30 +03:00
__field ( unsigned long , caller_ip )
2012-04-24 10:33:31 +04:00
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > lsn = lsn ;
2016-04-06 02:46:30 +03:00
__entry - > caller_ip = caller_ip ;
2012-04-24 10:33:31 +04:00
) ,
2018-01-09 22:46:05 +03:00
TP_printk ( " dev %d:%d lsn 0x%llx caller %pS " ,
2012-04-24 10:33:31 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
2016-04-06 02:46:30 +03:00
__entry - > lsn , ( void * ) __entry - > caller_ip )
2012-04-24 10:33:31 +04:00
)
2011-10-11 19:14:11 +04:00
/* Instantiate the xfs_log_item_class tracepoints. */
#define DEFINE_LOG_ITEM_EVENT(name) \
DEFINE_EVENT(xfs_log_item_class, name, \
	TP_PROTO(struct xfs_log_item *lip), \
	TP_ARGS(lip))
DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.
This is now easily doable given that we have very few places left that write
delwri buffers:
- log recovery:
Only done at mount time, and already forcing out the buffers
synchronously using xfs_flush_buftarg
- quotacheck:
Same story.
- dquot reclaim:
Writes out dirty dquots on the LRU under memory pressure. We might
want to look into doing more of this via xfsaild, but it's already
more optimal than the synchronous inode reclaim that writes each
buffer synchronously.
- xfsaild:
This is the main beneficiary of the change. By keeping a local list
of buffers to write we reduce latency of writing out buffers, and
more importantly we can remove all the delwri list promotions which
were hitting the buffer cache hard under sustained metadata loads.
The implementation is very straight forward - xfs_buf_delwri_queue now gets
a new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwri_submit or
xfs_buf_delwri_submit_nowait. Buffers that already are on a delwri list are
skipped in xfs_buf_delwri_queue, assuming they already are on another delwri
list. The biggest change to pass down the buffer list was done to the AIL
pushing. Now that we operate on buffers the trylock, push and pushbuf log
item methods are merged into a single push routine, which tries to lock the
item, and if possible add the buffer that needs writeback to the buffer list.
This leads to much simpler code than the previous split but requires the
individual IOP_PUSH instances to unlock and reacquire the AIL around calls
to blocking routines.
Given that xfsailds now also handle writing out buffers, the conditions for
log forcing and the sleep times needed some small changes. The most
important one is that we consider an AIL busy as long as we still have buffers
to push, and the other one is that we do increment the pushed LSN for
buffers that are under flushing at this moment, but still count them towards
the stuck items for restart purposes. Without this we could hammer on stuck
items without ever forcing the log and not make progress under heavy random
delete workloads on fast flash storage devices.
[ Dave Chinner:
- rebase on previous patches.
- improved comments for XBF_DELWRI_Q handling
- fix XBF_ASYNC handling in queue submission (test 106 failure)
- rename delwri submit function buffer list parameters for clarity
- xfs_efd_item_push() should return XFS_ITEM_PINNED ]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-04-23 09:58:39 +04:00
DEFINE_LOG_ITEM_EVENT ( xfs_ail_flushing ) ;
xfs: intent item whiteouts
When we log modifications based on intents, we add both intent
and intent done items to the modification being made. These get
written to the log to ensure that the operation is re-run if the
intent done is not found in the log.
However, for operations that complete wholly within a single
checkpoint, the change in the checkpoint is atomic and will never
need replay. In this case, we don't need to actually write the
intent and intent done items to the journal because log recovery
will never need to manually restart this modification.
Log recovery currently handles intent/intent done matching by
inserting the intent into the AIL, then removing it when a matching
intent done item is found. Hence for all the intent-based operations
that complete within a checkpoint, we spend all that time parsing
the intent/intent done items just to cancel them and do nothing with
them.
Hence it follows that the only time we actually need intents in the
log is when the modification crosses checkpoint boundaries in the
log and so may only be partially complete in the journal. Hence if
we commit an intent done item to the CIL and the intent item is in
the same checkpoint, we don't actually have to write them to the
journal because log recovery will always cancel the intents.
We've never really worried about the overhead of logging intents
unnecessarily like this because the intents we log are generally
very much smaller than the change being made. e.g. freeing an extent
involves modifying at least two freespace btree blocks and the AGF,
so the EFI/EFD overhead is only a small increase in space and
processing time compared to the overall cost of freeing an extent.
However, delayed attributes change this cost equation dramatically,
especially for inline attributes. In the case of adding an inline
attribute, we only log the inode core and attribute fork at present.
With delayed attributes, we now log the attr intent which includes
the name and value, the inode core and attr fork, and finally the
attr intent done item. We increase the number of items we log from 1
to 3, and the number of log vectors (regions) goes up from 3 to 7.
Hence we triple the number of objects that the CIL has to process,
and more than double the number of log vectors that need to be
written to the journal.
At scale, this means delayed attributes cause a non-pipelined CIL to
become CPU bound processing all the extra items, resulting in a > 40%
performance degradation on 16-way file+xattr create workloads.
Pipelining the CIL (as per 5.15) reduces the performance degradation
to 20%, but now the limitation is the rate at which the log items
can be written to the iclogs and iclogs be dispatched for IO and
completed.
Even log IO completion is slowed down by these intents, because it
now has to process 3x the number of items in the checkpoint.
Processing completed intents is especially inefficient here, because
we first insert the intent into the AIL, then remove it from the AIL
when the intent done is processed. IOWs, we are also doing expensive
operations in log IO completion we could completely avoid if we
didn't log completed intent/intent done pairs.
Enter log item whiteouts.
When an intent done is committed, we can check to see if the
associated intent is in the same checkpoint as we are currently
committing the intent done to. If so, we can mark the intent log
item with a whiteout and immediately free the intent done item
rather than committing it to the CIL. We can basically skip the
entire formatting and CIL insertion steps for the intent done item.
However, we cannot remove the intent item from the CIL at this point
because the unlocked per-cpu CIL item lists do not permit removal
without holding the CIL context lock exclusively. Transaction commit
only holds the context lock shared, hence the best we can do is mark
the intent item with a whiteout so that the CIL push can release it
rather than writing it to the log.
This means we never write the intent to the log if the intent done
has also been committed to the same checkpoint, but we'll always
write the intent if the intent done has not been committed or has
been committed to a different checkpoint. This will result in
correct log recovery behaviour in all cases, without the overhead of
logging unnecessary intents.
This intent whiteout concept is generic - we can apply it to all
intent/intent done pairs that have a direct 1:1 relationship. The
way deferred ops iterate and relog intents means that all intents
currently have a 1:1 relationship with their done intent, and hence
we can apply this cancellation to all existing intent/intent done
implementations.
For delayed attributes with a 16-way 64kB xattr create workload,
whiteouts reduce the amount of journalled metadata from ~2.5GB/s
down to ~600MB/s and improve the creation rate from 9000/s to
14000/s.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-04 04:50:29 +03:00
/*
 * Tracepoints for CIL intent-item whiteout handling (mark, skip, unpin).
 * All three are instantiated through DEFINE_LOG_ITEM_EVENT, which is
 * defined outside this chunk.
 */
DEFINE_LOG_ITEM_EVENT ( xfs_cil_whiteout_mark ) ;
DEFINE_LOG_ITEM_EVENT ( xfs_cil_whiteout_skip ) ;
DEFINE_LOG_ITEM_EVENT ( xfs_cil_whiteout_unpin ) ;
2011-10-11 19:14:11 +04:00
2013-11-01 08:27:18 +04:00
/*
 * Event class for tracepoints that report a log item together with an old
 * and a new LSN.
 *
 * Records the device (reached through lip->li_log->l_mp), the log item
 * pointer, li_type and li_flags, plus both LSNs.  Each LSN is printed as a
 * cycle/block pair via CYCLE_LSN()/BLOCK_LSN(); type and flags are decoded
 * with XFS_LI_TYPE_DESC and XFS_LI_FLAGS.
 */
DECLARE_EVENT_CLASS ( xfs_ail_class ,
TP_PROTO ( struct xfs_log_item * lip , xfs_lsn_t old_lsn , xfs_lsn_t new_lsn ) ,
TP_ARGS ( lip , old_lsn , new_lsn ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( void * , lip )
__field ( uint , type )
2018-05-09 17:47:34 +03:00
__field ( unsigned long , flags )
2013-11-01 08:27:18 +04:00
__field ( xfs_lsn_t , old_lsn )
__field ( xfs_lsn_t , new_lsn )
) ,
TP_fast_assign (
2022-03-17 19:09:12 +03:00
__entry - > dev = lip - > li_log - > l_mp - > m_super - > s_dev ;
2013-11-01 08:27:18 +04:00
__entry - > lip = lip ;
__entry - > type = lip - > li_type ;
__entry - > flags = lip - > li_flags ;
__entry - > old_lsn = old_lsn ;
__entry - > new_lsn = new_lsn ;
) ,
2018-01-09 22:43:36 +03:00
TP_printk ( " dev %d:%d lip %p old lsn %d/%d new lsn %d/%d type %s flags %s " ,
2013-11-01 08:27:18 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > lip ,
CYCLE_LSN ( __entry - > old_lsn ) , BLOCK_LSN ( __entry - > old_lsn ) ,
CYCLE_LSN ( __entry - > new_lsn ) , BLOCK_LSN ( __entry - > new_lsn ) ,
__print_symbolic ( __entry - > type , XFS_LI_TYPE_DESC ) ,
__print_flags ( __entry - > flags , " | " , XFS_LI_FLAGS ) )
)
/*
 * Instantiate an xfs_ail_class tracepoint called @name.  The prototype
 * takes the log item and its old and new LSNs.
 */
# define DEFINE_AIL_EVENT(name) \
DEFINE_EVENT ( xfs_ail_class , name , \
TP_PROTO ( struct xfs_log_item * lip , xfs_lsn_t old_lsn , xfs_lsn_t new_lsn ) , \
TP_ARGS ( lip , old_lsn , new_lsn ) )
DEFINE_AIL_EVENT ( xfs_ail_insert ) ;
DEFINE_AIL_EVENT ( xfs_ail_move ) ;
DEFINE_AIL_EVENT ( xfs_ail_delete ) ;
/*
 * Tracepoint reporting a new log tail LSN.
 *
 * Alongside the caller-supplied @new_lsn it samples the log's current
 * l_tail_lsn (reported as "old lsn") and l_last_sync_lsn with
 * atomic64_read() at trace time.  All three are printed as cycle/block
 * pairs.
 */
TRACE_EVENT ( xfs_log_assign_tail_lsn ,
TP_PROTO ( struct xlog * log , xfs_lsn_t new_lsn ) ,
TP_ARGS ( log , new_lsn ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_lsn_t , new_lsn )
__field ( xfs_lsn_t , old_lsn )
__field ( xfs_lsn_t , last_sync_lsn )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > new_lsn = new_lsn ;
__entry - > old_lsn = atomic64_read ( & log - > l_tail_lsn ) ;
__entry - > last_sync_lsn = atomic64_read ( & log - > l_last_sync_lsn ) ;
) ,
TP_printk ( " dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
CYCLE_LSN ( __entry - > new_lsn ) , BLOCK_LSN ( __entry - > new_lsn ) ,
CYCLE_LSN ( __entry - > old_lsn ) , BLOCK_LSN ( __entry - > old_lsn ) ,
CYCLE_LSN ( __entry - > last_sync_lsn ) , BLOCK_LSN ( __entry - > last_sync_lsn ) )
)
2011-10-11 19:14:11 +04:00
2010-05-24 12:25:57 +04:00
/*
 * Event class for file I/O tracepoints that take a kiocb and an iov_iter.
 *
 * The inode is derived from iocb->ki_filp.  Records the device, inode
 * number, on-disk size (i_disk_size, printed as "disize"), the file
 * position iocb->ki_pos and the byte count from iov_iter_count(iter).
 */
DECLARE_EVENT_CLASS ( xfs_file_class ,
2021-01-23 21:06:29 +03:00
TP_PROTO ( struct kiocb * iocb , struct iov_iter * iter ) ,
TP_ARGS ( iocb , iter ) ,
2010-05-24 12:25:57 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_fsize_t , size )
__field ( loff_t , offset )
__field ( size_t , count )
) ,
TP_fast_assign (
2021-01-23 21:06:29 +03:00
__entry - > dev = file_inode ( iocb - > ki_filp ) - > i_sb - > s_dev ;
__entry - > ino = XFS_I ( file_inode ( iocb - > ki_filp ) ) - > i_ino ;
2021-03-29 21:11:40 +03:00
__entry - > size = XFS_I ( file_inode ( iocb - > ki_filp ) ) - > i_disk_size ;
2021-01-23 21:06:29 +03:00
__entry - > offset = iocb - > ki_pos ;
__entry - > count = iov_iter_count ( iter ) ;
2010-05-24 12:25:57 +04:00
) ,
2021-08-17 23:03:19 +03:00
TP_printk ( " dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx " ,
2010-05-24 12:25:57 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
__entry - > offset ,
2016-07-20 04:31:42 +03:00
__entry - > count )
2009-12-15 02:14:59 +03:00
)
2010-05-24 12:25:57 +04:00
/*
 * Instantiate an xfs_file_class tracepoint called @name.  The prototype
 * takes the kiocb and iov_iter describing the I/O.
 */
# define DEFINE_RW_EVENT(name) \
DEFINE_EVENT ( xfs_file_class , name , \
2021-01-23 21:06:29 +03:00
TP_PROTO ( struct kiocb * iocb , struct iov_iter * iter ) , \
TP_ARGS ( iocb , iter ) )
2016-07-20 04:31:42 +03:00
DEFINE_RW_EVENT ( xfs_file_buffered_read ) ;
DEFINE_RW_EVENT ( xfs_file_direct_read ) ;
2016-07-20 04:38:55 +03:00
DEFINE_RW_EVENT ( xfs_file_dax_read ) ;
2009-12-15 02:14:59 +03:00
DEFINE_RW_EVENT ( xfs_file_buffered_write ) ;
DEFINE_RW_EVENT ( xfs_file_direct_write ) ;
2016-07-20 04:38:55 +03:00
DEFINE_RW_EVENT ( xfs_file_dax_write ) ;
2021-01-23 21:06:30 +03:00
DEFINE_RW_EVENT ( xfs_reflink_bounce_dio_write ) ;
2009-12-15 02:14:59 +03:00
2010-12-10 11:42:20 +03:00
/*
 * Event class for block-mapping tracepoints.
 *
 * Records the device, inode number, on-disk size (i_disk_size), the byte
 * offset and count of the request, the fork (decoded with
 * XFS_WHICHFORK_STRINGS) and the extent described by @irec.  @irec may be
 * NULL, in which case startoff, startblock and blockcount are recorded
 * as 0.  startblock is passed to the format string through an (int64_t)
 * cast.
 */
DECLARE_EVENT_CLASS ( xfs_imap_class ,
2010-05-24 12:25:57 +04:00
TP_PROTO ( struct xfs_inode * ip , xfs_off_t offset , ssize_t count ,
2019-02-15 19:02:46 +03:00
int whichfork , struct xfs_bmbt_irec * irec ) ,
TP_ARGS ( ip , offset , count , whichfork , irec ) ,
2010-05-24 12:25:57 +04:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( loff_t , size )
__field ( loff_t , offset )
__field ( size_t , count )
2019-02-15 19:02:46 +03:00
__field ( int , whichfork )
2010-05-24 12:25:57 +04:00
__field ( xfs_fileoff_t , startoff )
__field ( xfs_fsblock_t , startblock )
__field ( xfs_filblks_t , blockcount )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2021-03-29 21:11:40 +03:00
__entry - > size = ip - > i_disk_size ;
2010-05-24 12:25:57 +04:00
__entry - > offset = offset ;
__entry - > count = count ;
2019-02-15 19:02:46 +03:00
__entry - > whichfork = whichfork ;
2010-05-24 12:25:57 +04:00
__entry - > startoff = irec ? irec - > br_startoff : 0 ;
__entry - > startblock = irec ? irec - > br_startblock : 0 ;
__entry - > blockcount = irec ? irec - > br_blockcount : 0 ;
) ,
2021-08-17 23:03:19 +03:00
TP_printk ( " dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx "
2021-08-17 22:45:59 +03:00
" fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx " ,
2010-05-24 12:25:57 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
__entry - > offset ,
__entry - > count ,
2021-08-17 23:09:26 +03:00
__print_symbolic ( __entry - > whichfork , XFS_WHICHFORK_STRINGS ) ,
2010-05-24 12:25:57 +04:00
__entry - > startoff ,
2017-06-16 21:00:05 +03:00
( int64_t ) __entry - > startblock ,
2010-05-24 12:25:57 +04:00
__entry - > blockcount )
2009-12-15 02:14:59 +03:00
)
2010-05-24 12:25:57 +04:00
2019-02-15 19:02:46 +03:00
/*
 * Instantiate an xfs_imap_class tracepoint called @name.  The prototype
 * takes the inode, byte offset and count, the fork selector and an
 * optional extent record.
 */
# define DEFINE_IMAP_EVENT(name) \
2010-12-10 11:42:20 +03:00
DEFINE_EVENT ( xfs_imap_class , name , \
2010-05-24 12:25:57 +04:00
TP_PROTO ( struct xfs_inode * ip , xfs_off_t offset , ssize_t count , \
2019-02-15 19:02:46 +03:00
int whichfork , struct xfs_bmbt_irec * irec ) , \
TP_ARGS ( ip , offset , count , whichfork , irec ) )
DEFINE_IMAP_EVENT ( xfs_map_blocks_found ) ;
DEFINE_IMAP_EVENT ( xfs_map_blocks_alloc ) ;
DEFINE_IMAP_EVENT ( xfs_iomap_alloc ) ;
DEFINE_IMAP_EVENT ( xfs_iomap_found ) ;
2009-12-15 02:14:59 +03:00
2010-05-24 12:25:57 +04:00
/*
 * Event class for I/O tracepoints described by an inode plus a byte
 * offset and count.
 *
 * Records both file sizes: the incore size VFS_I(ip)->i_size ("isize")
 * and the on-disk size ip->i_disk_size ("disize"), followed by the
 * offset ("pos") and count ("bytecount").
 */
DECLARE_EVENT_CLASS ( xfs_simple_io_class ,
TP_PROTO ( struct xfs_inode * ip , xfs_off_t offset , ssize_t count ) ,
TP_ARGS ( ip , offset , count ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
2011-07-18 07:40:19 +04:00
__field ( loff_t , isize )
__field ( loff_t , disize )
2010-05-24 12:25:57 +04:00
__field ( loff_t , offset )
__field ( size_t , count )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2011-12-19 00:00:11 +04:00
__entry - > isize = VFS_I ( ip ) - > i_size ;
2021-03-29 21:11:40 +03:00
__entry - > disize = ip - > i_disk_size ;
2010-05-24 12:25:57 +04:00
__entry - > offset = offset ;
__entry - > count = count ;
) ,
2011-12-19 00:00:12 +04:00
TP_printk ( " dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
2021-08-17 23:00:13 +03:00
" pos 0x%llx bytecount 0x%zx " ,
2010-05-24 12:25:57 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
2011-07-18 07:40:19 +04:00
__entry - > isize ,
__entry - > disize ,
2010-05-24 12:25:57 +04:00
__entry - > offset ,
__entry - > count )
2009-12-15 02:14:59 +03:00
) ;
2010-05-24 12:25:57 +04:00
/*
 * Instantiate an xfs_simple_io_class tracepoint called @name.  The
 * prototype takes the inode and the byte offset and count of the I/O.
 */
# define DEFINE_SIMPLE_IO_EVENT(name) \
DEFINE_EVENT ( xfs_simple_io_class , name , \
TP_PROTO ( struct xfs_inode * ip , xfs_off_t offset , ssize_t count ) , \
TP_ARGS ( ip , offset , count ) )
2009-12-15 02:14:59 +03:00
DEFINE_SIMPLE_IO_EVENT ( xfs_delalloc_enospc ) ;
DEFINE_SIMPLE_IO_EVENT ( xfs_unwritten_convert ) ;
2011-07-18 07:40:19 +04:00
DEFINE_SIMPLE_IO_EVENT ( xfs_setfilesize ) ;
2015-10-12 08:02:08 +03:00
DEFINE_SIMPLE_IO_EVENT ( xfs_zero_eof ) ;
2016-02-08 06:40:51 +03:00
DEFINE_SIMPLE_IO_EVENT ( xfs_end_io_direct_write ) ;
DEFINE_SIMPLE_IO_EVENT ( xfs_end_io_direct_write_unwritten ) ;
DEFINE_SIMPLE_IO_EVENT ( xfs_end_io_direct_write_append ) ;
2009-12-15 02:14:59 +03:00
2009-12-21 17:03:03 +03:00
/*
 * Event class for inode truncation tracepoints.
 *
 * Records the device, inode number, the inode's current on-disk size
 * (i_disk_size, printed as "disize") and the requested @new_size.
 */
DECLARE_EVENT_CLASS ( xfs_itrunc_class ,
TP_PROTO ( struct xfs_inode * ip , xfs_fsize_t new_size ) ,
TP_ARGS ( ip , new_size ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_fsize_t , size )
__field ( xfs_fsize_t , new_size )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2021-03-29 21:11:40 +03:00
__entry - > size = ip - > i_disk_size ;
2009-12-21 17:03:03 +03:00
__entry - > new_size = new_size ;
) ,
2021-08-17 23:03:19 +03:00
TP_printk ( " dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
__entry - > new_size )
)
2009-12-15 02:14:59 +03:00
/*
 * Instantiate an xfs_itrunc_class tracepoint called @name.  The prototype
 * takes the inode and the new size.
 */
# define DEFINE_ITRUNC_EVENT(name) \
2009-12-21 17:03:03 +03:00
DEFINE_EVENT ( xfs_itrunc_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_inode * ip , xfs_fsize_t new_size ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( ip , new_size ) )
2011-12-19 00:00:04 +04:00
DEFINE_ITRUNC_EVENT ( xfs_itruncate_extents_start ) ;
DEFINE_ITRUNC_EVENT ( xfs_itruncate_extents_end ) ;
2009-12-15 02:14:59 +03:00
/*
 * Tracepoint for a page cache invalidation over the range @start..@finish.
 *
 * Records the device, inode number, the inode's on-disk size
 * (i_disk_size, printed as "disize") and the two range offsets.
 */
TRACE_EVENT ( xfs_pagecache_inval ,
TP_PROTO ( struct xfs_inode * ip , xfs_off_t start , xfs_off_t finish ) ,
TP_ARGS ( ip , start , finish ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_fsize_t , size )
__field ( xfs_off_t , start )
__field ( xfs_off_t , finish )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2021-03-29 21:11:40 +03:00
__entry - > size = ip - > i_disk_size ;
2009-12-15 02:14:59 +03:00
__entry - > start = start ;
__entry - > finish = finish ;
) ,
2021-08-17 23:03:19 +03:00
TP_printk ( " dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
__entry - > start ,
__entry - > finish )
) ;
/*
 * Tracepoint for an unmap request on an inode.
 *
 * Records the device, inode number, on-disk size (i_disk_size), the
 * starting file offset and length (printed as "fileoff" and "fsbcount"),
 * the flags decoded with XFS_BMAPI_FLAGS, and the caller address, which
 * is printed symbolically with %pS.
 */
TRACE_EVENT ( xfs_bunmap ,
2021-08-17 20:03:45 +03:00
TP_PROTO ( struct xfs_inode * ip , xfs_fileoff_t fileoff , xfs_filblks_t len ,
2009-12-15 02:14:59 +03:00
int flags , unsigned long caller_ip ) ,
2021-08-17 20:03:45 +03:00
TP_ARGS ( ip , fileoff , len , flags , caller_ip ) ,
2009-12-15 02:14:59 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_fsize_t , size )
2021-08-17 20:03:45 +03:00
__field ( xfs_fileoff_t , fileoff )
2009-12-15 02:14:59 +03:00
__field ( xfs_filblks_t , len )
__field ( unsigned long , caller_ip )
__field ( int , flags )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
2021-03-29 21:11:40 +03:00
__entry - > size = ip - > i_disk_size ;
2021-08-17 20:03:45 +03:00
__entry - > fileoff = fileoff ;
2009-12-15 02:14:59 +03:00
__entry - > len = len ;
__entry - > caller_ip = caller_ip ;
__entry - > flags = flags ;
) ,
2022-09-18 23:51:14 +03:00
TP_printk ( " dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx "
2018-01-09 22:46:05 +03:00
" flags %s caller %pS " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
2021-08-17 20:03:45 +03:00
__entry - > fileoff ,
2009-12-15 02:14:59 +03:00
__entry - > len ,
__print_flags ( __entry - > flags , " | " , XFS_BMAPI_FLAGS ) ,
( void * ) __entry - > caller_ip )
) ;
2012-04-29 14:41:10 +04:00
DECLARE_EVENT_CLASS ( xfs_extent_busy_class ,
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
xfs_agblock_t agbno , xfs_extlen_t len ) ,
TP_ARGS ( mp , agno , agbno , len ) ,
2009-12-15 02:14:59 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , len )
2009-12-15 02:14:59 +03:00
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
__entry - > agbno = agbno ;
__entry - > len = len ;
2009-12-15 02:14:59 +03:00
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
__entry - > agbno ,
__entry - > len )
2009-12-15 02:14:59 +03:00
) ;
2011-04-24 23:06:16 +04:00
/*
 * DEFINE_BUSY_EVENT(name) declares one tracepoint called "name" that reuses
 * the xfs_extent_busy_class event class. Every tracepoint created this way
 * takes the same arguments: the mount, the AG number, the AG block number
 * and the extent length. The five busy-extent tracepoints are instantiated
 * directly below the macro.
 */
# define DEFINE_BUSY_EVENT(name) \
2012-04-29 14:41:10 +04:00
DEFINE_EVENT ( xfs_extent_busy_class , name , \
2011-04-24 23:06:16 +04:00
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
xfs_agblock_t agbno , xfs_extlen_t len ) , \
TP_ARGS ( mp , agno , agbno , len ) )
2012-04-29 14:41:10 +04:00
DEFINE_BUSY_EVENT ( xfs_extent_busy ) ;
DEFINE_BUSY_EVENT ( xfs_extent_busy_enomem ) ;
DEFINE_BUSY_EVENT ( xfs_extent_busy_force ) ;
DEFINE_BUSY_EVENT ( xfs_extent_busy_reuse ) ;
DEFINE_BUSY_EVENT ( xfs_extent_busy_clear ) ;
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
2012-04-29 14:41:10 +04:00
/*
 * Tracepoint xfs_extent_busy_trim.
 *
 * Arguments: the mount, an AG number, an extent given as (agbno, len) and a
 * second extent given as (tbno, tlen). The record stores the device number
 * taken from mp->m_super->s_dev together with the five values exactly as
 * they were passed in. All extent values are printed in hexadecimal.
 */
TRACE_EVENT ( xfs_extent_busy_trim ,
2011-04-24 23:06:15 +04:00
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
xfs_agblock_t agbno , xfs_extlen_t len ,
xfs_agblock_t tbno , xfs_extlen_t tlen ) ,
TP_ARGS ( mp , agno , agbno , len , tbno , tlen ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , len )
__field ( xfs_agblock_t , tbno )
__field ( xfs_extlen_t , tlen )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > agbno = agbno ;
__entry - > len = len ;
__entry - > tbno = tbno ;
__entry - > tlen = tlen ;
) ,
2021-08-17 22:45:59 +03:00
/* tbno and tlen are emitted under the labels found_agbno and found_fsbcount. */
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x " ,
2011-04-24 23:06:15 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__entry - > len ,
__entry - > tbno ,
__entry - > tlen )
) ;
2018-03-15 20:51:58 +03:00
/*
 * Event class xfs_agf_class: snapshot of an AGF header.
 *
 * Arguments: the mount, a pointer to the AGF, a flags word and the caller
 * instruction pointer. The record stores the device number taken from
 * mp->m_super->s_dev, the flags and caller_ip as passed, and a CPU-endian
 * copy of the AGF fields listed in TP_fast_assign. The AG number is read
 * from agf->agf_seqno rather than passed in. Flags are decoded for printing
 * through XFS_AGF_FLAGS and caller_ip is printed symbolically with %pS.
 */
DECLARE_EVENT_CLASS ( xfs_agf_class ,
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_mount * mp , struct xfs_agf * agf , int flags ,
unsigned long caller_ip ) ,
TP_ARGS ( mp , agf , flags , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( int , flags )
__field ( __u32 , length )
__field ( __u32 , bno_root )
__field ( __u32 , cnt_root )
__field ( __u32 , bno_level )
__field ( __u32 , cnt_level )
__field ( __u32 , flfirst )
__field ( __u32 , fllast )
__field ( __u32 , flcount )
__field ( __u32 , freeblks )
__field ( __u32 , longest )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
/*
 * Each assignment is its own statement terminated by a semicolon, so the
 * block does not depend on the comma operator to chain them together.
 * Every AGF field is converted with be32_to_cpu() before it is stored.
 */
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = be32_to_cpu ( agf - > agf_seqno ) ;
__entry - > flags = flags ;
__entry - > length = be32_to_cpu ( agf - > agf_length ) ;
__entry - > bno_root = be32_to_cpu ( agf - > agf_roots [ XFS_BTNUM_BNO ] ) ;
__entry - > cnt_root = be32_to_cpu ( agf - > agf_roots [ XFS_BTNUM_CNT ] ) ;
__entry - > bno_level =
be32_to_cpu ( agf - > agf_levels [ XFS_BTNUM_BNO ] ) ;
__entry - > cnt_level =
be32_to_cpu ( agf - > agf_levels [ XFS_BTNUM_CNT ] ) ;
__entry - > flfirst = be32_to_cpu ( agf - > agf_flfirst ) ;
__entry - > fllast = be32_to_cpu ( agf - > agf_fllast ) ;
__entry - > flcount = be32_to_cpu ( agf - > agf_flcount ) ;
__entry - > freeblks = be32_to_cpu ( agf - > agf_freeblks ) ;
__entry - > longest = be32_to_cpu ( agf - > agf_longest ) ;
__entry - > caller_ip = caller_ip ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x flags %s length %u roots b %u c %u "
2009-12-15 02:14:59 +03:00
" levels b %u c %u flfirst %u fllast %u flcount %u "
2018-01-09 22:46:05 +03:00
" freeblks %u longest %u caller %pS " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__print_flags ( __entry - > flags , " | " , XFS_AGF_FLAGS ) ,
__entry - > length ,
__entry - > bno_root ,
__entry - > cnt_root ,
__entry - > bno_level ,
__entry - > cnt_level ,
__entry - > flfirst ,
__entry - > fllast ,
__entry - > flcount ,
__entry - > freeblks ,
__entry - > longest ,
( void * ) __entry - > caller_ip )
) ;
2018-03-15 20:51:58 +03:00
/*
 * DEFINE_AGF_EVENT(name) declares one tracepoint called "name" that reuses
 * the xfs_agf_class event class and takes the same arguments: the mount,
 * the AGF pointer, a flags word and the caller instruction pointer. The
 * two tracepoints built from it follow the macro.
 */
# define DEFINE_AGF_EVENT(name) \
DEFINE_EVENT ( xfs_agf_class , name , \
TP_PROTO ( struct xfs_mount * mp , struct xfs_agf * agf , int flags , \
unsigned long caller_ip ) , \
TP_ARGS ( mp , agf , flags , caller_ip ) )
DEFINE_AGF_EVENT ( xfs_agf ) ;
DEFINE_AGF_EVENT ( xfs_agfl_reset ) ;
2009-12-15 02:14:59 +03:00
/*
 * Tracepoint xfs_free_extent.
 *
 * Arguments: the mount, the AG number, the AG block number and length of
 * the extent, the reservation type (an enum xfs_ag_resv_type, stored in the
 * record as a plain int) and the two integer flags haveleft and haveright.
 * The record stores the device number taken from mp->m_super->s_dev plus
 * every argument as passed. On output the AG number, block number and
 * length are printed in hexadecimal, resv is printed as a decimal integer,
 * and the two flags are folded into a single word: "both", "left", "right"
 * or "none".
 */
TRACE_EVENT ( xfs_free_extent ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , xfs_agblock_t agbno ,
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
xfs_extlen_t len , enum xfs_ag_resv_type resv , int haveleft ,
int haveright ) ,
TP_ARGS ( mp , agno , agbno , len , resv , haveleft , haveright ) ,
2009-12-15 02:14:59 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , len )
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__field ( int , resv )
2009-12-15 02:14:59 +03:00
__field ( int , haveleft )
__field ( int , haveright )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > agbno = agbno ;
__entry - > len = len ;
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__entry - > resv = resv ;
2009-12-15 02:14:59 +03:00
__entry - > haveleft = haveleft ;
__entry - > haveright = haveright ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s " ,
2009-12-15 02:14:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__entry - > len ,
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__entry - > resv ,
2009-12-15 02:14:59 +03:00
__entry - > haveleft ?
( __entry - > haveright ? " both " : " left " ) :
( __entry - > haveright ? " right " : " none " ) )
) ;
2009-12-21 17:03:03 +03:00
/*
 * Event class xfs_alloc_class: snapshot of a struct xfs_alloc_arg.
 *
 * The only argument is the allocation argument structure. The record
 * stores the device number taken from args->mp->m_super->s_dev, a copy of
 * the args fields listed in TP_fast_assign (wasdel and wasfromfl are kept
 * in char-sized slots and printed with %d), and t_highest_agno read from
 * the transaction at args->tp. agno, agbno, datatype and highest_agno are
 * printed in hexadecimal; the remaining numeric fields are printed in
 * decimal.
 *
 * NOTE(review): args->tp is dereferenced unconditionally when the event
 * fires; confirm that every call site has a transaction attached to args.
 */
DECLARE_EVENT_CLASS ( xfs_alloc_class ,
TP_PROTO ( struct xfs_alloc_arg * args ) ,
TP_ARGS ( args ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , minlen )
__field ( xfs_extlen_t , maxlen )
__field ( xfs_extlen_t , mod )
__field ( xfs_extlen_t , prod )
__field ( xfs_extlen_t , minleft )
__field ( xfs_extlen_t , total )
__field ( xfs_extlen_t , alignment )
__field ( xfs_extlen_t , minalignslop )
__field ( xfs_extlen_t , len )
__field ( char , wasdel )
__field ( char , wasfromfl )
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__field ( int , resv )
xfs: remote attribute blocks aren't really userdata
When adding a new remote attribute, we write the attribute to the
new extent before the allocation transaction is committed. This
means we cannot reuse busy extents as that violates crash
consistency semantics. Hence we currently treat remote attribute
extent allocation like userdata because it has the same overwrite
ordering constraints as userdata.
Unfortunately, this also allows the allocator to incorrectly apply
extent size hints to the remote attribute extent allocation. This
results in interesting failures, such as transaction block
reservation overruns and in-memory inode attribute fork corruption.
To fix this, we need to separate the busy extent reuse configuration
from the userdata configuration. This changes the definition of
XFS_BMAPI_METADATA slightly - it now means that allocation is
metadata and reuse of busy extents is acceptible due to the metadata
ordering semantics of the journal. If this flag is not set, it
means the allocation is that has unordered data writeback, and hence
busy extent reuse is not allowed. It no longer implies the
allocation is for user data, just that the data write will not be
strictly ordered. This matches the semantics for both user data
and remote attribute block allocation.
As such, This patch changes the "userdata" field to a "datatype"
field, and adds a "no busy reuse" flag to the field.
When we detect an unordered data extent allocation, we immediately set
the no reuse flag. We then set the "user data" flags based on the
inode fork we are allocating the extent to. Hence we only set
userdata flags on data fork allocations now and consider attribute
fork remote extents to be an unordered metadata extent.
The result is that remote attribute extents now have the expected
allocation semantics, and the data fork allocation behaviour is
completely unchanged.
It should be noted that there may be other ways to fix this (e.g.
use ordered metadata buffers for the remote attribute extent data
write) but they are more invasive and difficult to validate both
from a design and implementation POV. Hence this patch takes the
simple, obvious route to fixing the problem...
Reported-and-tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-26 01:21:28 +03:00
__field ( int , datatype )
2023-02-10 20:11:06 +03:00
__field ( xfs_agnumber_t , highest_agno )
2009-12-21 17:03:03 +03:00
) ,
TP_fast_assign (
__entry - > dev = args - > mp - > m_super - > s_dev ;
__entry - > agno = args - > agno ;
__entry - > agbno = args - > agbno ;
__entry - > minlen = args - > minlen ;
__entry - > maxlen = args - > maxlen ;
__entry - > mod = args - > mod ;
__entry - > prod = args - > prod ;
__entry - > minleft = args - > minleft ;
__entry - > total = args - > total ;
__entry - > alignment = args - > alignment ;
__entry - > minalignslop = args - > minalignslop ;
__entry - > len = args - > len ;
__entry - > wasdel = args - > wasdel ;
__entry - > wasfromfl = args - > wasfromfl ;
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__entry - > resv = args - > resv ;
xfs: remote attribute blocks aren't really userdata
When adding a new remote attribute, we write the attribute to the
new extent before the allocation transaction is committed. This
means we cannot reuse busy extents as that violates crash
consistency semantics. Hence we currently treat remote attribute
extent allocation like userdata because it has the same overwrite
ordering constraints as userdata.
Unfortunately, this also allows the allocator to incorrectly apply
extent size hints to the remote attribute extent allocation. This
results in interesting failures, such as transaction block
reservation overruns and in-memory inode attribute fork corruption.
To fix this, we need to separate the busy extent reuse configuration
from the userdata configuration. This changes the definition of
XFS_BMAPI_METADATA slightly - it now means that allocation is
metadata and reuse of busy extents is acceptible due to the metadata
ordering semantics of the journal. If this flag is not set, it
means the allocation is that has unordered data writeback, and hence
busy extent reuse is not allowed. It no longer implies the
allocation is for user data, just that the data write will not be
strictly ordered. This matches the semantics for both user data
and remote attribute block allocation.
As such, This patch changes the "userdata" field to a "datatype"
field, and adds a "no busy reuse" flag to the field.
When we detect an unordered data extent allocation, we immediately set
the no reuse flag. We then set the "user data" flags based on the
inode fork we are allocating the extent to. Hence we only set
userdata flags on data fork allocations now and consider attribute
fork remote extents to be an unordered metadata extent.
The result is that remote attribute extents now have the expected
allocation semantics, and the data fork allocation behaviour is
completely unchanged.
It should be noted that there may be other ways to fix this (e.g.
use ordered metadata buffers for the remote attribute extent data
write) but they are more invasive and difficult to validate both
from a design and implementation POV. Hence this patch takes the
simple, obvious route to fixing the problem...
Reported-and-tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-26 01:21:28 +03:00
__entry - > datatype = args - > datatype ;
2023-02-10 20:11:06 +03:00
__entry - > highest_agno = args - > tp - > t_highest_agno ;
2009-12-21 17:03:03 +03:00
) ,
2021-08-17 19:28:53 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u "
2009-12-21 17:03:03 +03:00
" prod %u minleft %u total %u alignment %u minalignslop %u "
2023-02-13 01:14:54 +03:00
" len %u wasdel %d wasfromfl %d resv %d "
2023-02-10 20:11:06 +03:00
" datatype 0x%x highest_agno 0x%x " ,
2009-12-21 17:03:03 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__entry - > minlen ,
__entry - > maxlen ,
__entry - > mod ,
__entry - > prod ,
__entry - > minleft ,
__entry - > total ,
__entry - > alignment ,
__entry - > minalignslop ,
__entry - > len ,
__entry - > wasdel ,
__entry - > wasfromfl ,
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
__entry - > resv ,
xfs: remote attribute blocks aren't really userdata
When adding a new remote attribute, we write the attribute to the
new extent before the allocation transaction is committed. This
means we cannot reuse busy extents as that violates crash
consistency semantics. Hence we currently treat remote attribute
extent allocation like userdata because it has the same overwrite
ordering constraints as userdata.
Unfortunately, this also allows the allocator to incorrectly apply
extent size hints to the remote attribute extent allocation. This
results in interesting failures, such as transaction block
reservation overruns and in-memory inode attribute fork corruption.
To fix this, we need to separate the busy extent reuse configuration
from the userdata configuration. This changes the definition of
XFS_BMAPI_METADATA slightly - it now means that allocation is
metadata and reuse of busy extents is acceptible due to the metadata
ordering semantics of the journal. If this flag is not set, it
means the allocation is that has unordered data writeback, and hence
busy extent reuse is not allowed. It no longer implies the
allocation is for user data, just that the data write will not be
strictly ordered. This matches the semantics for both user data
and remote attribute block allocation.
As such, This patch changes the "userdata" field to a "datatype"
field, and adds a "no busy reuse" flag to the field.
When we detect an unordered data extent allocation, we immediately set
the no reuse flag. We then set the "user data" flags based on the
inode fork we are allocating the extent to. Hence we only set
userdata flags on data fork allocations now and consider attribute
fork remote extents to be an unordered metadata extent.
The result is that remote attribute extents now have the expected
allocation semantics, and the data fork allocation behaviour is
completely unchanged.
It should be noted that there may be other ways to fix this (e.g.
use ordered metadata buffers for the remote attribute extent data
write) but they are more invasive and difficult to validate both
from a design and implementation POV. Hence this patch takes the
simple, obvious route to fixing the problem...
Reported-and-tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-26 01:21:28 +03:00
__entry - > datatype ,
2023-02-10 20:11:06 +03:00
__entry - > highest_agno )
2009-12-15 02:14:59 +03:00
)
2009-12-21 17:03:03 +03:00
/*
 * DEFINE_ALLOC_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_alloc_class.  Every event declared this way takes a single
 * struct xfs_alloc_arg pointer and shares that class's record layout and
 * print format.
 */
# define DEFINE_ALLOC_EVENT(name) \
DEFINE_EVENT ( xfs_alloc_class , name , \
TP_PROTO ( struct xfs_alloc_arg * args ) , \
TP_ARGS ( args ) )
2009-12-15 02:14:59 +03:00
/* Tracepoints instantiated from xfs_alloc_class. */
DEFINE_ALLOC_EVENT ( xfs_alloc_exact_done ) ;
2010-12-10 18:03:57 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_exact_notfound ) ;
2009-12-15 02:14:59 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_exact_error ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_near_nominleft ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_near_first ) ;
2019-10-14 03:10:35 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_cur ) ;
2019-10-14 03:10:33 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_cur_right ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_cur_left ) ;
2019-10-14 03:10:36 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_cur_lookup ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_cur_lookup_done ) ;
2009-12-15 02:14:59 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_near_error ) ;
2011-04-24 23:06:15 +04:00
DEFINE_ALLOC_EVENT ( xfs_alloc_near_noentry ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_near_busy ) ;
2009-12-15 02:14:59 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_size_neither ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_size_noentry ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_size_nominleft ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_size_done ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_size_error ) ;
2011-04-24 23:06:15 +04:00
DEFINE_ALLOC_EVENT ( xfs_alloc_size_busy ) ;
2009-12-15 02:14:59 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_small_freelist ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_small_notenough ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_small_done ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_small_error ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_badargs ) ;
2023-02-10 20:07:06 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_skip_deadlock ) ;
2009-12-15 02:14:59 +03:00
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_nofix ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_noagbp ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_loopfailed ) ;
DEFINE_ALLOC_EVENT ( xfs_alloc_vextent_allfailed ) ;
2019-10-14 03:10:33 +03:00
/*
 * xfs_alloc_cur_check: standalone event that records, for the given mount,
 * a btree type, a per-AG block number, an extent length, a "diff" value and
 * a boolean "new" flag exactly as passed by the caller.
 *
 * The btree type is printed symbolically via XFS_BTNUM_STRINGS; the block
 * number, length and diff are printed in hexadecimal.
 * NOTE(review): the meaning of "diff" and "new" is defined by the caller and
 * is not visible from this definition.
 */
TRACE_EVENT ( xfs_alloc_cur_check ,
TP_PROTO ( struct xfs_mount * mp , xfs_btnum_t btnum , xfs_agblock_t bno ,
xfs_extlen_t len , xfs_extlen_t diff , bool new ) ,
TP_ARGS ( mp , btnum , bno , len , diff , new ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_btnum_t , btnum )
__field ( xfs_agblock_t , bno )
__field ( xfs_extlen_t , len )
__field ( xfs_extlen_t , diff )
__field ( bool , new )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > btnum = btnum ;
__entry - > bno = bno ;
__entry - > len = len ;
__entry - > diff = diff ;
__entry - > new = new ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d " ,
2019-10-14 03:10:33 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > btnum , XFS_BTNUM_STRINGS ) ,
__entry - > bno , __entry - > len , __entry - > diff , __entry - > new )
)
2012-03-22 09:15:13 +04:00
/*
 * xfs_da_class: event class taking a struct xfs_da_args.  It records the
 * device and inode number of args->dp, a copy of the name being operated
 * on, its length, the hash value, args->inumber and the op_flags bitmask.
 *
 * The name is stored in a dynamic array sized to exactly args->namelen
 * bytes, so the stored copy carries no NUL terminator; it is printed with
 * "%.*s" bounded by namelen.  op_flags is decoded with XFS_DA_OP_FLAGS.
 */
DECLARE_EVENT_CLASS ( xfs_da_class ,
2009-12-21 17:03:03 +03:00
TP_PROTO ( struct xfs_da_args * args ) ,
TP_ARGS ( args ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__dynamic_array ( char , name , args - > namelen )
__field ( int , namelen )
__field ( xfs_dahash_t , hashval )
__field ( xfs_ino_t , inumber )
2022-04-21 03:46:47 +03:00
__field ( uint32_t , op_flags )
2009-12-21 17:03:03 +03:00
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( args - > dp ) - > i_sb - > s_dev ;
__entry - > ino = args - > dp - > i_ino ;
/* Skip the copy entirely when there is no name. */
if ( args - > namelen )
memcpy ( __get_str ( name ) , args - > name , args - > namelen ) ;
__entry - > namelen = args - > namelen ;
__entry - > hashval = args - > hashval ;
__entry - > inumber = args - > inumber ;
__entry - > op_flags = args - > op_flags ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
" inumber 0x%llx op_flags %s " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
/* First namelen is the "%.*s" precision; NULL is passed for an empty name. */
__entry - > namelen ,
__entry - > namelen ? __get_str ( name ) : NULL ,
/* Second namelen feeds the "namelen %d" field. */
__entry - > namelen ,
__entry - > hashval ,
__entry - > inumber ,
__print_flags ( __entry - > op_flags , " | " , XFS_DA_OP_FLAGS ) )
)
/*
 * DEFINE_DIR2_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_da_class, taking a single struct xfs_da_args pointer.
 */
# define DEFINE_DIR2_EVENT(name) \
2012-03-22 09:15:13 +04:00
DEFINE_EVENT ( xfs_da_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_da_args * args ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( args ) )
/* xfs_dir2_* tracepoints instantiated from xfs_da_class. */
DEFINE_DIR2_EVENT ( xfs_dir2_sf_addname ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_create ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_lookup ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_replace ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_removename ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_toino4 ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_toino8 ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_sf_to_block ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_addname ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_lookup ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_replace ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_removename ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_to_sf ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_block_to_leaf ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_addname ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_lookup ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_replace ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_removename ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_to_block ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_leaf_to_node ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_node_addname ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_node_lookup ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_node_replace ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_node_removename ) ;
DEFINE_DIR2_EVENT ( xfs_dir2_node_to_leaf ) ;
2012-11-12 15:53:53 +04:00
/*
 * xfs_attr_class: event class taking a struct xfs_da_args.  It records the
 * device and inode number of args->dp, a copy of the name, the name and
 * value lengths, the hash value, and three bitmasks: attr_filter,
 * attr_flags and op_flags.
 *
 * The name is stored in a dynamic array of exactly args->namelen bytes (no
 * NUL terminator) and printed with "%.*s" bounded by namelen.  attr_filter
 * is decoded with XFS_ATTR_FILTER_FLAGS, attr_flags against XATTR_CREATE /
 * XATTR_REPLACE, and op_flags with XFS_DA_OP_FLAGS.
 */
DECLARE_EVENT_CLASS ( xfs_attr_class ,
TP_PROTO ( struct xfs_da_args * args ) ,
TP_ARGS ( args ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__dynamic_array ( char , name , args - > namelen )
__field ( int , namelen )
__field ( int , valuelen )
__field ( xfs_dahash_t , hashval )
2020-02-27 04:30:42 +03:00
__field ( unsigned int , attr_filter )
__field ( unsigned int , attr_flags )
2022-04-21 03:46:47 +03:00
__field ( uint32_t , op_flags )
2012-11-12 15:53:53 +04:00
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( args - > dp ) - > i_sb - > s_dev ;
__entry - > ino = args - > dp - > i_ino ;
/* Skip the copy entirely when there is no name. */
if ( args - > namelen )
memcpy ( __get_str ( name ) , args - > name , args - > namelen ) ;
__entry - > namelen = args - > namelen ;
__entry - > valuelen = args - > valuelen ;
__entry - > hashval = args - > hashval ;
2020-02-27 04:30:42 +03:00
__entry - > attr_filter = args - > attr_filter ;
__entry - > attr_flags = args - > attr_flags ;
2012-11-12 15:53:53 +04:00
__entry - > op_flags = args - > op_flags ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
2020-02-27 04:30:42 +03:00
" hashval 0x%x filter %s flags %s op_flags %s " ,
2012-11-12 15:53:53 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
/* First namelen is the "%.*s" precision; NULL is passed for an empty name. */
__entry - > namelen ,
__entry - > namelen ? __get_str ( name ) : NULL ,
/* Second namelen feeds the "namelen %d" field. */
__entry - > namelen ,
__entry - > valuelen ,
__entry - > hashval ,
2020-02-27 04:30:42 +03:00
__print_flags ( __entry - > attr_filter , " | " ,
XFS_ATTR_FILTER_FLAGS ) ,
__print_flags ( __entry - > attr_flags , " | " ,
{ XATTR_CREATE , " CREATE " } ,
{ XATTR_REPLACE , " REPLACE " } ) ,
2012-11-12 15:53:53 +04:00
__print_flags ( __entry - > op_flags , " | " , XFS_DA_OP_FLAGS ) )
)
2012-03-22 09:15:13 +04:00
/*
 * DEFINE_ATTR_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_attr_class, taking a single struct xfs_da_args pointer.
 */
# define DEFINE_ATTR_EVENT(name) \
2012-11-12 15:53:53 +04:00
DEFINE_EVENT ( xfs_attr_class , name , \
2012-03-22 09:15:13 +04:00
TP_PROTO ( struct xfs_da_args * args ) , \
TP_ARGS ( args ) )
/* xfs_attr_* tracepoints instantiated from xfs_attr_class. */
DEFINE_ATTR_EVENT ( xfs_attr_sf_add ) ;
DEFINE_ATTR_EVENT ( xfs_attr_sf_addname ) ;
DEFINE_ATTR_EVENT ( xfs_attr_sf_create ) ;
DEFINE_ATTR_EVENT ( xfs_attr_sf_lookup ) ;
DEFINE_ATTR_EVENT ( xfs_attr_sf_remove ) ;
DEFINE_ATTR_EVENT ( xfs_attr_sf_to_leaf ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_add ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_add_old ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_add_new ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_add_work ) ;
2012-03-22 09:15:13 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_create ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_compact ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_get ) ;
2012-03-22 09:15:13 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_lookup ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_replace ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_remove ) ;
2012-03-22 09:15:13 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_removename ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_split ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_split_before ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_split_after ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_clearflag ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_setflag ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_flipflags ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_to_sf ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_to_node ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_rebalance ) ;
DEFINE_ATTR_EVENT ( xfs_attr_leaf_unbalance ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_leaf_toosmall ) ;
2012-03-22 09:15:13 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_node_addname ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_node_get ) ;
2012-03-22 09:15:13 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_node_replace ) ;
DEFINE_ATTR_EVENT ( xfs_attr_node_removename ) ;
2012-11-12 15:53:53 +04:00
DEFINE_ATTR_EVENT ( xfs_attr_fillstate ) ;
DEFINE_ATTR_EVENT ( xfs_attr_refillstate ) ;
DEFINE_ATTR_EVENT ( xfs_attr_rmtval_get ) ;
DEFINE_ATTR_EVENT ( xfs_attr_rmtval_set ) ;
2012-03-22 09:15:13 +04:00
/*
 * DEFINE_DA_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_da_class, taking a single struct xfs_da_args pointer.
 */
# define DEFINE_DA_EVENT(name) \
DEFINE_EVENT ( xfs_da_class , name , \
TP_PROTO ( struct xfs_da_args * args ) , \
TP_ARGS ( args ) )
/* xfs_da_* tracepoints instantiated from xfs_da_class. */
DEFINE_DA_EVENT ( xfs_da_split ) ;
DEFINE_DA_EVENT ( xfs_da_join ) ;
DEFINE_DA_EVENT ( xfs_da_link_before ) ;
DEFINE_DA_EVENT ( xfs_da_link_after ) ;
DEFINE_DA_EVENT ( xfs_da_unlink_back ) ;
DEFINE_DA_EVENT ( xfs_da_unlink_forward ) ;
DEFINE_DA_EVENT ( xfs_da_root_split ) ;
DEFINE_DA_EVENT ( xfs_da_root_join ) ;
DEFINE_DA_EVENT ( xfs_da_node_add ) ;
DEFINE_DA_EVENT ( xfs_da_node_create ) ;
DEFINE_DA_EVENT ( xfs_da_node_split ) ;
DEFINE_DA_EVENT ( xfs_da_node_remove ) ;
DEFINE_DA_EVENT ( xfs_da_node_rebalance ) ;
DEFINE_DA_EVENT ( xfs_da_node_unbalance ) ;
2012-11-12 15:53:53 +04:00
DEFINE_DA_EVENT ( xfs_da_node_toosmall ) ;
2012-03-22 09:15:13 +04:00
DEFINE_DA_EVENT ( xfs_da_swap_lastblock ) ;
DEFINE_DA_EVENT ( xfs_da_grow_inode ) ;
DEFINE_DA_EVENT ( xfs_da_shrink_inode ) ;
2012-11-12 15:53:53 +04:00
DEFINE_DA_EVENT ( xfs_da_fixhashpath ) ;
DEFINE_DA_EVENT ( xfs_da_path_shift ) ;
2012-03-22 09:15:13 +04:00
2009-12-21 17:03:03 +03:00
/*
 * xfs_dir2_space_class: event class taking a struct xfs_da_args and an
 * integer index.  It records the device and inode number of args->dp, the
 * op_flags bitmask (decoded with XFS_DA_OP_FLAGS) and the index as passed.
 * NOTE(review): what "idx" indexes depends on the call site and is not
 * visible from this definition.
 */
DECLARE_EVENT_CLASS ( xfs_dir2_space_class ,
TP_PROTO ( struct xfs_da_args * args , int idx ) ,
TP_ARGS ( args , idx ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
2022-04-21 03:46:47 +03:00
__field ( uint32_t , op_flags )
2009-12-21 17:03:03 +03:00
__field ( int , idx )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( args - > dp ) - > i_sb - > s_dev ;
__entry - > ino = args - > dp - > i_ino ;
__entry - > op_flags = args - > op_flags ;
__entry - > idx = idx ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx op_flags %s index %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__print_flags ( __entry - > op_flags , " | " , XFS_DA_OP_FLAGS ) ,
__entry - > idx )
2009-12-15 02:14:59 +03:00
)
2009-12-21 17:03:03 +03:00
/*
 * DEFINE_DIR2_SPACE_EVENT(name) declares tracepoint "name" as an instance
 * of xfs_dir2_space_class, taking a struct xfs_da_args pointer and an
 * integer index.
 */
# define DEFINE_DIR2_SPACE_EVENT(name) \
DEFINE_EVENT ( xfs_dir2_space_class , name , \
2009-12-15 02:14:59 +03:00
TP_PROTO ( struct xfs_da_args * args , int idx ) , \
2009-12-21 17:03:03 +03:00
TP_ARGS ( args , idx ) )
DEFINE_DIR2_SPACE_EVENT ( xfs_dir2_leafn_add ) ;
DEFINE_DIR2_SPACE_EVENT ( xfs_dir2_leafn_remove ) ;
DEFINE_DIR2_SPACE_EVENT ( xfs_dir2_grow_inode ) ;
DEFINE_DIR2_SPACE_EVENT ( xfs_dir2_shrink_inode ) ;
2009-12-15 02:14:59 +03:00
/*
 * xfs_dir2_leafn_moveents: standalone event taking a struct xfs_da_args
 * plus a source index, a destination index and a count.  It records the
 * device and inode number of args->dp, the op_flags bitmask (decoded with
 * XFS_DA_OP_FLAGS) and the three integers as passed, printed in decimal.
 */
TRACE_EVENT ( xfs_dir2_leafn_moveents ,
TP_PROTO ( struct xfs_da_args * args , int src_idx , int dst_idx , int count ) ,
TP_ARGS ( args , src_idx , dst_idx , count ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
2022-04-21 03:46:47 +03:00
__field ( uint32_t , op_flags )
2009-12-15 02:14:59 +03:00
__field ( int , src_idx )
__field ( int , dst_idx )
__field ( int , count )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( args - > dp ) - > i_sb - > s_dev ;
__entry - > ino = args - > dp - > i_ino ;
__entry - > op_flags = args - > op_flags ;
__entry - > src_idx = src_idx ;
__entry - > dst_idx = dst_idx ;
__entry - > count = count ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx op_flags %s "
" src_idx %d dst_idx %d count %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__print_flags ( __entry - > op_flags , " | " , XFS_DA_OP_FLAGS ) ,
__entry - > src_idx ,
__entry - > dst_idx ,
__entry - > count )
) ;
2010-01-14 04:33:55 +03:00
/*
 * Value-to-name table for the "which" argument of the swap extent
 * tracepoints, used with __print_symbolic(): 0 prints as "target" and
 * 1 prints as "temp".
 */
# define XFS_SWAPEXT_INODES \
{ 0 , " target " } , \
{ 1 , " temp " }
2018-12-19 01:32:30 +03:00
/*
 * Register the XFS_DINODE_FMT_* enum values with the tracing core.
 * NOTE(review): presumably needed so that the symbolic inode format names
 * printed via XFS_INODE_FORMAT_STR can be resolved outside the kernel;
 * confirm against the TRACE_DEFINE_ENUM documentation.
 */
TRACE_DEFINE_ENUM ( XFS_DINODE_FMT_DEV ) ;
TRACE_DEFINE_ENUM ( XFS_DINODE_FMT_LOCAL ) ;
TRACE_DEFINE_ENUM ( XFS_DINODE_FMT_EXTENTS ) ;
TRACE_DEFINE_ENUM ( XFS_DINODE_FMT_BTREE ) ;
TRACE_DEFINE_ENUM ( XFS_DINODE_FMT_UUID ) ;
2010-01-14 04:33:55 +03:00
/*
 * xfs_swap_extent_class: event class taking an inode and a "which"
 * selector.  It records the device, the inode number, and a snapshot of
 * the inode's data fork (ip->i_df): its format, extent count and broot
 * byte count, plus the fork offset returned by xfs_inode_fork_boff().
 *
 * "which" is printed symbolically via XFS_SWAPEXT_INODES and the fork
 * format via XFS_INODE_FORMAT_STR.
 */
DECLARE_EVENT_CLASS ( xfs_swap_extent_class ,
TP_PROTO ( struct xfs_inode * ip , int which ) ,
TP_ARGS ( ip , which ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( int , which )
__field ( xfs_ino_t , ino )
__field ( int , format )
2021-02-26 08:54:31 +03:00
__field ( xfs_extnum_t , nex )
2010-01-14 04:33:55 +03:00
__field ( int , broot_size )
__field ( int , fork_off )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > which = which ;
__entry - > ino = ip - > i_ino ;
2020-05-18 20:28:05 +03:00
__entry - > format = ip - > i_df . if_format ;
2020-05-18 20:27:22 +03:00
__entry - > nex = ip - > i_df . if_nextents ;
2010-01-14 04:33:55 +03:00
__entry - > broot_size = ip - > i_df . if_broot_bytes ;
2022-07-09 20:56:07 +03:00
__entry - > fork_off = xfs_inode_fork_boff ( ip ) ;
2010-01-14 04:33:55 +03:00
) ,
2021-11-16 10:28:40 +03:00
TP_printk ( " dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
2021-08-17 20:09:12 +03:00
" broot size %d, forkoff 0x%x " ,
2010-01-14 04:33:55 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__print_symbolic ( __entry - > which , XFS_SWAPEXT_INODES ) ,
__print_symbolic ( __entry - > format , XFS_INODE_FORMAT_STR ) ,
__entry - > nex ,
__entry - > broot_size ,
__entry - > fork_off )
)
/*
 * DEFINE_SWAPEXT_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_swap_extent_class, taking an inode and a "which" selector.
 */
# define DEFINE_SWAPEXT_EVENT(name) \
DEFINE_EVENT ( xfs_swap_extent_class , name , \
TP_PROTO ( struct xfs_inode * ip , int which ) , \
TP_ARGS ( ip , which ) )
DEFINE_SWAPEXT_EVENT ( xfs_swap_extent_before ) ;
DEFINE_SWAPEXT_EVENT ( xfs_swap_extent_after ) ;
2017-08-09 04:21:53 +03:00
/*
 * xfs_log_recover: standalone event taking a log and two xfs_daddr_t
 * values, headblk and tailblk.  It records the device of the log's mount
 * and both block numbers, printed in hexadecimal.
 */
TRACE_EVENT ( xfs_log_recover ,
TP_PROTO ( struct xlog * log , xfs_daddr_t headblk , xfs_daddr_t tailblk ) ,
TP_ARGS ( log , headblk , tailblk ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_daddr_t , headblk )
__field ( xfs_daddr_t , tailblk )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > headblk = headblk ;
__entry - > tailblk = tailblk ;
) ,
TP_printk ( " dev %d:%d headblk 0x%llx tailblk 0x%llx " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) , __entry - > headblk ,
__entry - > tailblk )
)
2016-09-26 01:34:52 +03:00
/*
 * xfs_log_recover_record: standalone event taking a log, a log record
 * header and a pass number.  It records the device of the log's mount, the
 * header's h_lsn, h_len and h_num_logops fields, and the pass as passed.
 *
 * The header fields are converted from big-endian to CPU byte order at
 * record time, so the stored values can be printed directly.
 */
TRACE_EVENT ( xfs_log_recover_record ,
TP_PROTO ( struct xlog * log , struct xlog_rec_header * rhead , int pass ) ,
TP_ARGS ( log , rhead , pass ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_lsn_t , lsn )
__field ( int , len )
__field ( int , num_logops )
__field ( int , pass )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > lsn = be64_to_cpu ( rhead - > h_lsn ) ;
__entry - > len = be32_to_cpu ( rhead - > h_len ) ;
__entry - > num_logops = be32_to_cpu ( rhead - > h_num_logops ) ;
__entry - > pass = pass ;
) ,
TP_printk ( " dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > lsn , __entry - > len , __entry - > num_logops ,
__entry - > pass )
)
2010-04-13 09:06:46 +04:00
/*
 * xfs_log_recover_item_class: event class taking a log, a recovery
 * transaction, a recovery item and a pass number.  It records the device
 * of the log's mount, the transaction's r_log_tid and r_lsn, the item's
 * address, its type (ITEM_TYPE(item), printed via XFS_LI_TYPE_DESC), the
 * pass, and the item's ri_cnt / ri_total counters.
 *
 * Only the address of the item is stored (as an unsigned long) and it is
 * printed with "%p"; the item is not dereferenced at print time.
 */
DECLARE_EVENT_CLASS ( xfs_log_recover_item_class ,
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xlog_recover * trans ,
2010-04-13 09:06:46 +04:00
struct xlog_recover_item * item , int pass ) ,
TP_ARGS ( log , trans , item , pass ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( unsigned long , item )
__field ( xlog_tid_t , tid )
2016-09-26 01:34:52 +03:00
__field ( xfs_lsn_t , lsn )
2010-04-13 09:06:46 +04:00
__field ( int , type )
__field ( int , pass )
__field ( int , count )
__field ( int , total )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > item = ( unsigned long ) item ;
__entry - > tid = trans - > r_log_tid ;
2016-09-26 01:34:52 +03:00
__entry - > lsn = trans - > r_lsn ;
2010-04-13 09:06:46 +04:00
__entry - > type = ITEM_TYPE ( item ) ;
__entry - > pass = pass ;
__entry - > count = item - > ri_cnt ;
__entry - > total = item - > ri_total ;
) ,
2018-01-09 22:43:36 +03:00
TP_printk ( " dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item %p, "
2016-09-26 01:34:52 +03:00
" item type %s item region count/total %d/%d " ,
2010-04-13 09:06:46 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > tid ,
2016-09-26 01:34:52 +03:00
__entry - > lsn ,
2010-04-13 09:06:46 +04:00
__entry - > pass ,
( void * ) __entry - > item ,
__print_symbolic ( __entry - > type , XFS_LI_TYPE_DESC ) ,
__entry - > count ,
__entry - > total )
)
/*
 * DEFINE_LOG_RECOVER_ITEM(name) declares tracepoint "name" as an instance
 * of xfs_log_recover_item_class, taking a log, a recovery transaction, a
 * recovery item and a pass number.
 */
# define DEFINE_LOG_RECOVER_ITEM(name) \
DEFINE_EVENT ( xfs_log_recover_item_class , name , \
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xlog_recover * trans , \
2010-04-13 09:06:46 +04:00
struct xlog_recover_item * item , int pass ) , \
TP_ARGS ( log , trans , item , pass ) )
DEFINE_LOG_RECOVER_ITEM ( xfs_log_recover_item_add ) ;
DEFINE_LOG_RECOVER_ITEM ( xfs_log_recover_item_add_cont ) ;
DEFINE_LOG_RECOVER_ITEM ( xfs_log_recover_item_reorder_head ) ;
DEFINE_LOG_RECOVER_ITEM ( xfs_log_recover_item_reorder_tail ) ;
DEFINE_LOG_RECOVER_ITEM ( xfs_log_recover_item_recover ) ;
/*
 * xfs_log_recover_buf_item_class: event class taking a log and a buffer
 * log format structure.  It records the device of the log's mount and the
 * structure's blf_blkno, blf_len, blf_flags, blf_size and blf_map_size
 * fields unchanged.
 *
 * blf_blkno is printed as "daddr" and blf_len as "bbcount", both in hex.
 */
DECLARE_EVENT_CLASS ( xfs_log_recover_buf_item_class ,
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xfs_buf_log_format * buf_f ) ,
2010-04-13 09:06:46 +04:00
TP_ARGS ( log , buf_f ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
2017-06-16 21:00:05 +03:00
__field ( int64_t , blkno )
2010-04-13 09:06:46 +04:00
__field ( unsigned short , len )
__field ( unsigned short , flags )
__field ( unsigned short , size )
__field ( unsigned int , map_size )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > blkno = buf_f - > blf_blkno ;
__entry - > len = buf_f - > blf_len ;
__entry - > flags = buf_f - > blf_flags ;
__entry - > size = buf_f - > blf_size ;
__entry - > map_size = buf_f - > blf_map_size ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, "
2010-04-13 09:06:46 +04:00
" map_size %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > blkno ,
__entry - > len ,
__entry - > flags ,
__entry - > size ,
__entry - > map_size )
)
/*
 * DEFINE_LOG_RECOVER_BUF_ITEM(name) declares tracepoint "name" as an
 * instance of xfs_log_recover_buf_item_class, taking a log and a buffer
 * log format structure.
 */
# define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
DEFINE_EVENT ( xfs_log_recover_buf_item_class , name , \
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xfs_buf_log_format * buf_f ) , \
2010-04-13 09:06:46 +04:00
TP_ARGS ( log , buf_f ) )
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_not_cancel ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_cancel ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_cancel_add ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_cancel_ref_inc ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_recover ) ;
2016-09-26 01:34:52 +03:00
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_skip ) ;
2010-04-13 09:06:46 +04:00
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_inode_buf ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_reg_buf ) ;
DEFINE_LOG_RECOVER_BUF_ITEM ( xfs_log_recover_buf_dquot_buf ) ;
/*
 * xfs_log_recover_ino_item_class: event class taking a log and an inode
 * log format structure.  It records the device of the log's mount and the
 * structure's ilf_ino, ilf_size, ilf_fields, ilf_asize, ilf_dsize,
 * ilf_blkno, ilf_len and ilf_boffset fields unchanged.
 *
 * ilf_blkno is printed as "daddr" and ilf_len as "bbcount", both in hex.
 */
DECLARE_EVENT_CLASS ( xfs_log_recover_ino_item_class ,
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xfs_inode_log_format * in_f ) ,
2010-04-13 09:06:46 +04:00
TP_ARGS ( log , in_f ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( unsigned short , size )
__field ( int , fields )
__field ( unsigned short , asize )
__field ( unsigned short , dsize )
2017-06-16 21:00:05 +03:00
__field ( int64_t , blkno )
2010-04-13 09:06:46 +04:00
__field ( int , len )
__field ( int , boffset )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > ino = in_f - > ilf_ino ;
__entry - > size = in_f - > ilf_size ;
__entry - > fields = in_f - > ilf_fields ;
__entry - > asize = in_f - > ilf_asize ;
__entry - > dsize = in_f - > ilf_dsize ;
__entry - > blkno = in_f - > ilf_blkno ;
__entry - > len = in_f - > ilf_len ;
__entry - > boffset = in_f - > ilf_boffset ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
2021-08-17 22:45:59 +03:00
" dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d " ,
2010-04-13 09:06:46 +04:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > size ,
__entry - > fields ,
__entry - > asize ,
__entry - > dsize ,
__entry - > blkno ,
__entry - > len ,
__entry - > boffset )
)
/*
 * DEFINE_LOG_RECOVER_INO_ITEM(name) declares tracepoint "name" as an
 * instance of xfs_log_recover_ino_item_class, taking a log and an inode
 * log format structure.
 */
# define DEFINE_LOG_RECOVER_INO_ITEM(name) \
DEFINE_EVENT ( xfs_log_recover_ino_item_class , name , \
2012-06-14 18:22:15 +04:00
TP_PROTO ( struct xlog * log , struct xfs_inode_log_format * in_f ) , \
2010-04-13 09:06:46 +04:00
TP_ARGS ( log , in_f ) )
DEFINE_LOG_RECOVER_INO_ITEM ( xfs_log_recover_inode_recover ) ;
DEFINE_LOG_RECOVER_INO_ITEM ( xfs_log_recover_inode_cancel ) ;
DEFINE_LOG_RECOVER_INO_ITEM ( xfs_log_recover_inode_skip ) ;
2015-08-19 02:58:48 +03:00
/*
 * xfs_log_recover_icreate_item_class: event class taking a log and an
 * inode-create log structure.  It records the device of the log's mount
 * and the structure's icl_ag, icl_agbno, icl_count, icl_isize, icl_length
 * and icl_gen fields, each converted from big-endian to CPU byte order.
 *
 * Note the print order differs from the field order: icl_length is printed
 * as "fsbcount" before icl_count, which is printed as "ireccount".
 */
DECLARE_EVENT_CLASS ( xfs_log_recover_icreate_item_class ,
TP_PROTO ( struct xlog * log , struct xfs_icreate_log * in_f ) ,
TP_ARGS ( log , in_f ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( unsigned int , count )
__field ( unsigned int , isize )
__field ( xfs_agblock_t , length )
__field ( unsigned int , gen )
) ,
TP_fast_assign (
__entry - > dev = log - > l_mp - > m_super - > s_dev ;
__entry - > agno = be32_to_cpu ( in_f - > icl_ag ) ;
__entry - > agbno = be32_to_cpu ( in_f - > icl_agbno ) ;
__entry - > count = be32_to_cpu ( in_f - > icl_count ) ;
__entry - > isize = be32_to_cpu ( in_f - > icl_isize ) ;
__entry - > length = be32_to_cpu ( in_f - > icl_length ) ;
__entry - > gen = be32_to_cpu ( in_f - > icl_gen ) ;
) ,
2021-08-18 01:45:25 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x " ,
2021-08-17 22:45:59 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__entry - > length ,
__entry - > count ,
__entry - > isize ,
__entry - > gen )
2015-08-19 02:58:48 +03:00
)
/*
 * DEFINE_LOG_RECOVER_ICREATE_ITEM(name) declares tracepoint "name" as an
 * instance of xfs_log_recover_icreate_item_class, taking a log and an
 * inode-create log structure.
 */
# define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
DEFINE_EVENT ( xfs_log_recover_icreate_item_class , name , \
TP_PROTO ( struct xlog * log , struct xfs_icreate_log * in_f ) , \
TP_ARGS ( log , in_f ) )
DEFINE_LOG_RECOVER_ICREATE_ITEM ( xfs_log_recover_icreate_cancel ) ;
DEFINE_LOG_RECOVER_ICREATE_ITEM ( xfs_log_recover_icreate_recover ) ;
2011-01-07 16:02:04 +03:00
/*
 * xfs_discard_class: event class taking a mount, an allocation group
 * number, a per-AG block number and an extent length.  It records the
 * mount's device and the three values as passed, all printed in hex
 * (the length as "fsbcount").
 */
DECLARE_EVENT_CLASS ( xfs_discard_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
xfs_agblock_t agbno , xfs_extlen_t len ) ,
TP_ARGS ( mp , agno , agbno , len ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , len )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > agbno = agbno ;
__entry - > len = len ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x " ,
2011-01-07 16:02:04 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__entry - > len )
)
/*
 * DEFINE_DISCARD_EVENT(name) declares tracepoint "name" as an instance of
 * xfs_discard_class, taking a mount, an AG number, a per-AG block number
 * and an extent length.
 */
# define DEFINE_DISCARD_EVENT(name) \
DEFINE_EVENT ( xfs_discard_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
xfs_agblock_t agbno , xfs_extlen_t len ) , \
TP_ARGS ( mp , agno , agbno , len ) )
DEFINE_DISCARD_EVENT ( xfs_discard_extent ) ;
DEFINE_DISCARD_EVENT ( xfs_discard_toosmall ) ;
DEFINE_DISCARD_EVENT ( xfs_discard_exclude ) ;
DEFINE_DISCARD_EVENT ( xfs_discard_busy ) ;
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 04:08:36 +03:00
/* btree cursor events */
2018-12-19 01:32:31 +03:00
/*
 * Register the XFS_BTNUM_* enum values with the tracing core.
 * NOTE(review): presumably needed so that the btree type names printed via
 * XFS_BTNUM_STRINGS can be resolved outside the kernel; confirm against
 * the TRACE_DEFINE_ENUM documentation.
 */
TRACE_DEFINE_ENUM ( XFS_BTNUM_BNOi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_CNTi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_BMAPi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_INOi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_FINOi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_RMAPi ) ;
TRACE_DEFINE_ENUM ( XFS_BTNUM_REFCi ) ;
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 04:08:36 +03:00
/*
 * Event class for tracepoints that report where a btree cursor points at one
 * level of the tree, together with the disk address of the buffer passed in
 * for that level.
 */
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
	TP_ARGS(cur, level, bp),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_btnum_t, btnum)
		__field(int, level)
		__field(int, nlevels)
		__field(int, ptr)
		__field(xfs_daddr_t, daddr)
	),
	TP_fast_assign(
		__entry->dev = cur->bc_mp->m_super->s_dev;
		__entry->btnum = cur->bc_btnum;
		__entry->nlevels = cur->bc_nlevels;
		__entry->level = level;
		__entry->ptr = cur->bc_levels[level].ptr;
		/* A NULL buffer is recorded as daddr -1. */
		if (bp)
			__entry->daddr = xfs_buf_daddr(bp);
		else
			__entry->daddr = -1;
	),
	TP_printk(" dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
		  __entry->level,
		  __entry->nlevels,
		  __entry->ptr,
		  (unsigned long long)__entry->daddr)
)
/* Instantiate a tracepoint called @name from xfs_btree_cur_class. */
#define DEFINE_BTREE_CUR_EVENT(name)	\
DEFINE_EVENT(xfs_btree_cur_class, name,	\
	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \
	TP_ARGS(cur, level, bp))

DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 04:08:36 +03:00
2016-08-03 04:13:02 +03:00
/* deferred ops */
struct xfs_defer_pending ;
DECLARE_EVENT_CLASS ( xfs_defer_class ,
2018-08-01 17:20:35 +03:00
TP_PROTO ( struct xfs_trans * tp , unsigned long caller_ip ) ,
TP_ARGS ( tp , caller_ip ) ,
2016-08-03 04:13:02 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
2018-08-01 17:20:35 +03:00
__field ( struct xfs_trans * , tp )
2017-04-21 21:24:42 +03:00
__field ( char , committed )
2018-05-09 17:48:52 +03:00
__field ( unsigned long , caller_ip )
2016-08-03 04:13:02 +03:00
) ,
TP_fast_assign (
2018-08-01 17:20:35 +03:00
__entry - > dev = tp - > t_mountp - > m_super - > s_dev ;
__entry - > tp = tp ;
2018-05-09 17:48:52 +03:00
__entry - > caller_ip = caller_ip ;
2016-08-03 04:13:02 +03:00
) ,
2018-08-01 17:20:35 +03:00
TP_printk ( " dev %d:%d tp %p caller %pS " ,
2016-08-03 04:13:02 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
2018-08-01 17:20:35 +03:00
__entry - > tp ,
2018-05-09 17:48:52 +03:00
( char * ) __entry - > caller_ip )
2016-08-03 04:13:02 +03:00
)
# define DEFINE_DEFER_EVENT(name) \
DEFINE_EVENT ( xfs_defer_class , name , \
2018-08-01 17:20:35 +03:00
TP_PROTO ( struct xfs_trans * tp , unsigned long caller_ip ) , \
TP_ARGS ( tp , caller_ip ) )
2016-08-03 04:13:02 +03:00
DECLARE_EVENT_CLASS ( xfs_defer_error_class ,
2018-08-01 17:20:35 +03:00
TP_PROTO ( struct xfs_trans * tp , int error ) ,
TP_ARGS ( tp , error ) ,
2016-08-03 04:13:02 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
2018-08-01 17:20:35 +03:00
__field ( struct xfs_trans * , tp )
2017-04-21 21:24:42 +03:00
__field ( char , committed )
2016-08-03 04:13:02 +03:00
__field ( int , error )
) ,
TP_fast_assign (
2018-08-01 17:20:35 +03:00
__entry - > dev = tp - > t_mountp - > m_super - > s_dev ;
__entry - > tp = tp ;
2016-08-03 04:13:02 +03:00
__entry - > error = error ;
) ,
2018-08-01 17:20:35 +03:00
TP_printk ( " dev %d:%d tp %p err %d " ,
2016-08-03 04:13:02 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
2018-08-01 17:20:35 +03:00
__entry - > tp ,
2016-08-03 04:13:02 +03:00
__entry - > error )
)
# define DEFINE_DEFER_ERROR_EVENT(name) \
DEFINE_EVENT ( xfs_defer_error_class , name , \
2018-08-01 17:20:35 +03:00
TP_PROTO ( struct xfs_trans * tp , int error ) , \
TP_ARGS ( tp , error ) )
2016-08-03 04:13:02 +03:00
/*
 * Event class for tracepoints describing one pending deferred-work item:
 * its type, its intent pointer, whether a done item is attached, and its
 * work count.
 */
DECLARE_EVENT_CLASS(xfs_defer_pending_class,
	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp),
	TP_ARGS(mp, dfp),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(int, type)
		__field(void *, intent)
		__field(char, committed)
		__field(int, nr)
	),
	TP_fast_assign(
		/* @mp may be NULL; the device number is then recorded as 0. */
		if (mp)
			__entry->dev = mp->m_super->s_dev;
		else
			__entry->dev = 0;
		__entry->nr = dfp->dfp_count;
		__entry->type = dfp->dfp_type;
		__entry->intent = dfp->dfp_intent;
		/* "committed" is true once a done item has been attached. */
		__entry->committed = dfp->dfp_done != NULL;
	),
	TP_printk(" dev %d:%d optype %d intent %p committed %d nr %d ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->type,
		  __entry->intent,
		  __entry->committed,
		  __entry->nr)
)

/* Instantiate a tracepoint called @name from xfs_defer_pending_class. */
#define DEFINE_DEFER_PENDING_EVENT(name)	\
DEFINE_EVENT(xfs_defer_pending_class, name,	\
	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
	TP_ARGS(mp, dfp))
/*
 * Event class for deferred operations on a physical extent: records the
 * operation type plus the AG number, starting AG block and length of the
 * extent.
 */
DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
	TP_ARGS(mp, agno, type, agbno, len),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(int, type)
		__field(xfs_agblock_t, agbno)
		__field(xfs_extlen_t, len)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->type = type;
		__entry->agno = agno;
		__entry->agbno = agbno;
		__entry->len = len;
	),
	TP_printk(" dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->type,
		  __entry->agno,
		  __entry->agbno,
		  __entry->len)
);

/* Instantiate a tracepoint called @name from xfs_phys_extent_deferred_class. */
#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name)	\
DEFINE_EVENT(xfs_phys_extent_deferred_class, name,	\
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
		 int type, \
		 xfs_agblock_t agbno, \
		 xfs_extlen_t len), \
	TP_ARGS(mp, agno, type, agbno, len))
/*
 * Event class for deferred operations on a file mapping: records the
 * operation, the AG number and AG block, the inode number (printed as
 * "owner"), the fork, and the mapping's file offset, length and state.
 */
DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
		 int op,
		 xfs_agblock_t agbno,
		 xfs_ino_t ino,
		 int whichfork,
		 xfs_fileoff_t offset,
		 xfs_filblks_t len,
		 xfs_exntst_t state),
	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(xfs_ino_t, ino)
		__field(xfs_agblock_t, agbno)
		__field(int, whichfork)
		__field(xfs_fileoff_t, l_loff)
		__field(xfs_filblks_t, l_len)
		__field(xfs_exntst_t, l_state)
		__field(int, op)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->op = op;
		__entry->agno = agno;
		__entry->agbno = agbno;
		__entry->ino = ino;
		__entry->whichfork = whichfork;
		/* The l_* fields hold the file mapping itself. */
		__entry->l_loff = offset;
		__entry->l_len = len;
		__entry->l_state = state;
	),
	TP_printk(" dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->op,
		  __entry->agno,
		  __entry->agbno,
		  __entry->ino,
		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
		  __entry->l_loff,
		  __entry->l_len,
		  __entry->l_state)
);

/* Instantiate a tracepoint called @name from xfs_map_extent_deferred_class. */
#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name)	\
DEFINE_EVENT(xfs_map_extent_deferred_class, name,	\
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
		 int op, \
		 xfs_agblock_t agbno, \
		 xfs_ino_t ino, \
		 int whichfork, \
		 xfs_fileoff_t offset, \
		 xfs_filblks_t len, \
		 xfs_exntst_t state), \
	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
/* Deferred-ops tracepoints keyed on the transaction and caller. */
DEFINE_DEFER_EVENT(xfs_defer_cancel);
DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
DEFINE_DEFER_EVENT(xfs_defer_finish);
DEFINE_DEFER_EVENT(xfs_defer_finish_done);

/* Deferred-ops tracepoints that report an error code. */
DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);

/* Tracepoints describing a single pending deferred-work item. */
DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
xfs: periodically relog deferred intent items
There's a subtle design flaw in the deferred log item code that can lead
to pinning the log tail. Taking up the defer ops chain examples from
the previous commit, we can get trapped in sequences like this:
Caller hands us a transaction t0 with D0-D3 attached. The defer ops
chain will look like the following if the transaction rolls succeed:
t1: D0(t0), D1(t0), D2(t0), D3(t0)
t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0)
t3: d5(t1), D1(t0), D2(t0), D3(t0)
...
t9: d9(t7), D3(t0)
t10: D3(t0)
t11: d10(t10), d11(t10)
t12: d11(t10)
In transaction 9, we finish d9 and try to roll to t10 while holding onto
an intent item for D3 that we logged in t0.
The previous commit changed the order in which we place new defer ops in
the defer ops processing chain to reduce the maximum chain length. Now
make xfs_defer_finish_noroll capable of relogging the entire chain
periodically so that we can always move the log tail forward. Most
chains will never get relogged, except for operations that generate very
long chains (large extents containing many blocks with different sharing
levels) or are on filesystems with small logs and a lot of ongoing
metadata updates.
Callers are now required to ensure that the transaction reservation is
large enough to handle logging done items and new intent items for the
maximum possible chain length. Most callers are careful to keep the
chain lengths low, so the overhead should be minimal.
The decision to relog an intent item is made based on whether the intent
was logged in a previous checkpoint, since there's no point in relogging
an intent into the same checkpoint.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
2020-09-28 02:18:13 +03:00
DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);

/* Deferred extent-free tracepoints reuse the physical-extent event class. */
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
2016-08-03 04:26:33 +03:00
2016-08-03 04:33:43 +03:00
/* rmap tracepoints */

/*
 * Event class for reverse-mapping operations on an AG extent.  The owner,
 * offset and flags are taken from the supplied owner info; the unwritten
 * state is folded into the recorded flags.
 */
DECLARE_EVENT_CLASS(xfs_rmap_class,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
		 const struct xfs_owner_info *oinfo),
	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(xfs_agblock_t, agbno)
		__field(xfs_extlen_t, len)
		__field(uint64_t, owner)
		__field(uint64_t, offset)
		__field(unsigned long, flags)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->agno = agno;
		__entry->agbno = agbno;
		__entry->len = len;
		__entry->offset = oinfo->oi_offset;
		__entry->owner = oinfo->oi_owner;
		__entry->flags = oinfo->oi_flags;
		/* Report the unwritten state as an rmap flag bit. */
		if (unwritten)
			__entry->flags |= XFS_RMAP_UNWRITTEN;
	),
	TP_printk(" dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno,
		  __entry->agbno,
		  __entry->len,
		  __entry->owner,
		  __entry->offset,
		  __entry->flags)
);

/* Instantiate a tracepoint called @name from xfs_rmap_class. */
#define DEFINE_RMAP_EVENT(name)	\
DEFINE_EVENT(xfs_rmap_class, name,	\
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
		 xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
		 const struct xfs_owner_info *oinfo), \
	TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
/* simple AG-based error/%ip tracepoint class */
DECLARE_EVENT_CLASS ( xfs_ag_error_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , int error ,
unsigned long caller_ip ) ,
TP_ARGS ( mp , agno , error , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( int , error )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > error = error ;
__entry - > caller_ip = caller_ip ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x error %d caller %pS " ,
2016-08-03 04:33:43 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > error ,
( char * ) __entry - > caller_ip )
) ;
# define DEFINE_AG_ERROR_EVENT(name) \
DEFINE_EVENT ( xfs_ag_error_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , int error , \
unsigned long caller_ip ) , \
TP_ARGS ( mp , agno , error , caller_ip ) )
/* rmap unmap tracepoints */
DEFINE_RMAP_EVENT(xfs_rmap_unmap);
DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);

/* rmap map tracepoints */
DEFINE_RMAP_EVENT(xfs_rmap_map);
DEFINE_RMAP_EVENT(xfs_rmap_map_done);
DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);

/* rmap convert tracepoints; the state event uses the AG error class */
DEFINE_RMAP_EVENT(xfs_rmap_convert);
DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
2016-08-03 04:33:43 +03:00
2016-08-03 04:43:24 +03:00
/*
 * Event class for tracepoints that report a single reverse-mapping record:
 * AG number, AG block, length, owner, file offset and flags, all passed in
 * directly by the caller.
 */
DECLARE_EVENT_CLASS(xfs_rmapbt_class,
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
		 xfs_agblock_t agbno, xfs_extlen_t len,
		 uint64_t owner, uint64_t offset, unsigned int flags),
	TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(xfs_agblock_t, agbno)
		__field(xfs_extlen_t, len)
		__field(uint64_t, owner)
		__field(uint64_t, offset)
		__field(unsigned int, flags)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->flags = flags;
		__entry->owner = owner;
		__entry->offset = offset;
		__entry->agno = agno;
		__entry->agbno = agbno;
		__entry->len = len;
	),
	TP_printk(" dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno,
		  __entry->agbno,
		  __entry->len,
		  __entry->owner,
		  __entry->offset,
		  __entry->flags)
);

/* Instantiate a tracepoint called @name from xfs_rmapbt_class. */
#define DEFINE_RMAPBT_EVENT(name)	\
DEFINE_EVENT(xfs_rmapbt_class, name,	\
	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
		 xfs_agblock_t agbno, xfs_extlen_t len, \
		 uint64_t owner, uint64_t offset, unsigned int flags), \
	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
/* Deferred rmap updates reuse the map-extent deferred event class. */
#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);

/* rmap btree block allocation and freeing */
DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);

/* rmap btree record updates and their error counterparts */
DEFINE_RMAPBT_EVENT(xfs_rmap_update);
DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);

/* rmap btree lookups and neighbor searches */
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate);
DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);
DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
2016-08-03 04:39:05 +03:00
2016-10-03 19:11:28 +03:00
/*
 * deferred bmbt updates
 *
 * These share the event class used by the deferred rmap tracepoints.
 */
#define DEFINE_BMAP_DEFERRED_EVENT	DEFINE_RMAP_DEFERRED_EVENT
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
/* per-AG reservation */

/*
 * Event class for per-AG reservation tracepoints.  Records the reservation
 * type, the per-AG free block and free list counts, the reserved and asked
 * counts of the matching reservation, and a caller-supplied length.
 *
 * In the printed output "resv" is the reservation type and "reserved" is the
 * reserved block count; the two use distinct labels so that a line can be
 * read unambiguously by key.
 */
DECLARE_EVENT_CLASS(xfs_ag_resv_class,
	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
		 xfs_extlen_t len),
	TP_ARGS(pag, resv, len),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_agnumber_t, agno)
		__field(int, resv)
		__field(xfs_extlen_t, freeblks)
		__field(xfs_extlen_t, flcount)
		__field(xfs_extlen_t, reserved)
		__field(xfs_extlen_t, asked)
		__field(xfs_extlen_t, len)
	),
	TP_fast_assign(
		struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);

		__entry->dev = pag->pag_mount->m_super->s_dev;
		__entry->agno = pag->pag_agno;
		__entry->resv = resv;
		__entry->freeblks = pag->pagf_freeblks;
		__entry->flcount = pag->pagf_flcount;
		/* If the lookup returned NULL, report zero for both counts. */
		__entry->reserved = r ? r->ar_reserved : 0;
		__entry->asked = r ? r->ar_asked : 0;
		__entry->len = len;
	),
	TP_printk(" dev %d:%d agno 0x%x resv %d freeblks %u flcount %u "
		  " reserved %u ask %u len %u ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno,
		  __entry->resv,
		  __entry->freeblks,
		  __entry->flcount,
		  __entry->reserved,
		  __entry->asked,
		  __entry->len)
)

/* Instantiate a tracepoint called @name from xfs_ag_resv_class. */
#define DEFINE_AG_RESV_EVENT(name)	\
DEFINE_EVENT(xfs_ag_resv_class, name,	\
	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, \
		 xfs_extlen_t len), \
	TP_ARGS(pag, resv, len))
/* per-AG reservation tracepoints */
DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);

/* error reporting for reservation teardown and setup */
DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
2016-10-03 19:11:15 +03:00
/* refcount tracepoint classes */
/* reuse the discard trace class for agbno/aglen-based traces */
# define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
/* ag btree lookup tracepoint class */
2018-12-19 01:32:29 +03:00
TRACE_DEFINE_ENUM ( XFS_LOOKUP_EQi ) ;
TRACE_DEFINE_ENUM ( XFS_LOOKUP_LEi ) ;
TRACE_DEFINE_ENUM ( XFS_LOOKUP_GEi ) ;
2016-10-03 19:11:15 +03:00
DECLARE_EVENT_CLASS ( xfs_ag_btree_lookup_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
xfs_agblock_t agbno , xfs_lookup_t dir ) ,
TP_ARGS ( mp , agno , agbno , dir ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( xfs_lookup_t , dir )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > agbno = agbno ;
__entry - > dir = dir ;
) ,
2021-08-17 19:28:53 +03:00
TP_printk ( " dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d) " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agbno ,
__print_symbolic ( __entry - > dir , XFS_AG_BTREE_CMP_FORMAT_STR ) ,
__entry - > dir )
)
# define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
DEFINE_EVENT ( xfs_ag_btree_lookup_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
xfs_agblock_t agbno , xfs_lookup_t dir ) , \
TP_ARGS ( mp , agno , agbno , dir ) )
/* single-rcext tracepoint class */
/*
 * Event class describing a single refcount record.
 *
 * Snapshots the device of @mp, the AG number @agno and four members of
 * @irec: rc_domain, rc_startblock, rc_blockcount and rc_refcount.  In the
 * output rc_startblock is labelled "agbno", rc_blockcount is labelled
 * "fsbcount", and the domain is rendered by name through
 * XFS_REFC_DOMAIN_STRINGS.
 */
DECLARE_EVENT_CLASS ( xfs_refcount_extent_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_refcount_irec * irec ) ,
TP_ARGS ( mp , agno , irec ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , startblock )
__field ( xfs_extlen_t , blockcount )
__field ( xfs_nlink_t , refcount )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
2022-10-27 00:23:58 +03:00
__entry - > domain = irec - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > startblock = irec - > rc_startblock ;
__entry - > blockcount = irec - > rc_blockcount ;
__entry - > refcount = irec - > rc_refcount ;
) ,
2022-10-27 00:23:58 +03:00
TP_printk ( " dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > startblock ,
__entry - > blockcount ,
__entry - > refcount )
)
/*
 * Declare tracepoint @name as an instance of xfs_refcount_extent_class.
 * The prototype repeated here must stay identical to the class prototype.
 */
# define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
DEFINE_EVENT ( xfs_refcount_extent_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
struct xfs_refcount_irec * irec ) , \
TP_ARGS ( mp , agno , irec ) )
/* single-rcext and an agbno tracepoint class */
/*
 * Event class describing a single refcount record plus one extra AG block
 * number.
 *
 * Captures the same members of @irec as the single-record class
 * (rc_domain, rc_startblock, rc_blockcount, rc_refcount) and additionally
 * the caller-supplied @agbno, which is printed last after an "@" marker.
 * Note that the output therefore contains two "agbno" labels: the first is
 * the record's start block, the second is @agbno.
 */
DECLARE_EVENT_CLASS ( xfs_refcount_extent_at_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_refcount_irec * irec , xfs_agblock_t agbno ) ,
TP_ARGS ( mp , agno , irec , agbno ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , startblock )
__field ( xfs_extlen_t , blockcount )
__field ( xfs_nlink_t , refcount )
__field ( xfs_agblock_t , agbno )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
2022-10-27 00:23:58 +03:00
__entry - > domain = irec - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > startblock = irec - > rc_startblock ;
__entry - > blockcount = irec - > rc_blockcount ;
__entry - > refcount = irec - > rc_refcount ;
__entry - > agbno = agbno ;
) ,
2022-10-27 00:23:58 +03:00
TP_printk ( " dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > startblock ,
__entry - > blockcount ,
__entry - > refcount ,
__entry - > agbno )
)
/*
 * Declare tracepoint @name as an instance of xfs_refcount_extent_at_class.
 * The prototype repeated here must stay identical to the class prototype.
 */
# define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
DEFINE_EVENT ( xfs_refcount_extent_at_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
struct xfs_refcount_irec * irec , xfs_agblock_t agbno ) , \
TP_ARGS ( mp , agno , irec , agbno ) )
/* double-rcext tracepoint class */
/*
 * Event class describing two refcount records at once.
 *
 * For each of @i1 and @i2 the domain, start block, block count and
 * refcount are copied into i1_* / i2_* fields.  The two records are
 * printed in that order, separated by "--".
 */
DECLARE_EVENT_CLASS ( xfs_refcount_double_extent_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 ) ,
TP_ARGS ( mp , agno , i1 , i2 ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i1_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i1_startblock )
__field ( xfs_extlen_t , i1_blockcount )
__field ( xfs_nlink_t , i1_refcount )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i2_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i2_startblock )
__field ( xfs_extlen_t , i2_blockcount )
__field ( xfs_nlink_t , i2_refcount )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
2022-10-27 00:23:58 +03:00
__entry - > i1_domain = i1 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock = i1 - > rc_startblock ;
__entry - > i1_blockcount = i1 - > rc_blockcount ;
__entry - > i1_refcount = i1 - > rc_refcount ;
2022-10-27 00:23:58 +03:00
__entry - > i2_domain = i2 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock = i2 - > rc_startblock ;
__entry - > i2_blockcount = i2 - > rc_blockcount ;
__entry - > i2_refcount = i2 - > rc_refcount ;
) ,
2022-10-27 00:23:58 +03:00
TP_printk ( " dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
" dom %s agbno 0x%x fsbcount 0x%x refcount %u " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i1_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock ,
__entry - > i1_blockcount ,
__entry - > i1_refcount ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i2_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock ,
__entry - > i2_blockcount ,
__entry - > i2_refcount )
)
/*
 * Declare tracepoint @name as an instance of
 * xfs_refcount_double_extent_class.  The prototype repeated here must stay
 * identical to the class prototype.
 */
# define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
DEFINE_EVENT ( xfs_refcount_double_extent_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 ) , \
TP_ARGS ( mp , agno , i1 , i2 ) )
/* double-rcext and an agbno tracepoint class */
/*
 * Event class describing two refcount records plus one extra AG block
 * number.
 *
 * Same per-record fields as the double-record class (i1_* and i2_*), with
 * the caller-supplied @agbno stored in its own field and printed last
 * after an "@" marker.
 */
DECLARE_EVENT_CLASS ( xfs_refcount_double_extent_at_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 ,
xfs_agblock_t agbno ) ,
TP_ARGS ( mp , agno , i1 , i2 , agbno ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i1_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i1_startblock )
__field ( xfs_extlen_t , i1_blockcount )
__field ( xfs_nlink_t , i1_refcount )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i2_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i2_startblock )
__field ( xfs_extlen_t , i2_blockcount )
__field ( xfs_nlink_t , i2_refcount )
__field ( xfs_agblock_t , agbno )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
2022-10-27 00:23:58 +03:00
__entry - > i1_domain = i1 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock = i1 - > rc_startblock ;
__entry - > i1_blockcount = i1 - > rc_blockcount ;
__entry - > i1_refcount = i1 - > rc_refcount ;
2022-10-27 00:23:58 +03:00
__entry - > i2_domain = i2 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock = i2 - > rc_startblock ;
__entry - > i2_blockcount = i2 - > rc_blockcount ;
__entry - > i2_refcount = i2 - > rc_refcount ;
__entry - > agbno = agbno ;
) ,
2022-10-27 00:23:58 +03:00
TP_printk ( " dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
" dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i1_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock ,
__entry - > i1_blockcount ,
__entry - > i1_refcount ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i2_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock ,
__entry - > i2_blockcount ,
__entry - > i2_refcount ,
__entry - > agbno )
)
/*
 * Declare tracepoint @name as an instance of
 * xfs_refcount_double_extent_at_class.  The prototype repeated here must
 * stay identical to the class prototype.
 */
# define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
DEFINE_EVENT ( xfs_refcount_double_extent_at_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 , \
xfs_agblock_t agbno ) , \
TP_ARGS ( mp , agno , i1 , i2 , agbno ) )
/* triple-rcext tracepoint class */
/*
 * Event class describing three refcount records at once.
 *
 * For each of @i1, @i2 and @i3 the domain, start block, block count and
 * refcount are copied into i1_* / i2_* / i3_* fields.  The records are
 * printed in that order, separated by "--".
 */
DECLARE_EVENT_CLASS ( xfs_refcount_triple_extent_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 ,
struct xfs_refcount_irec * i3 ) ,
TP_ARGS ( mp , agno , i1 , i2 , i3 ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i1_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i1_startblock )
__field ( xfs_extlen_t , i1_blockcount )
__field ( xfs_nlink_t , i1_refcount )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i2_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i2_startblock )
__field ( xfs_extlen_t , i2_blockcount )
__field ( xfs_nlink_t , i2_refcount )
2022-10-27 00:23:58 +03:00
__field ( enum xfs_refc_domain , i3_domain )
2016-10-03 19:11:15 +03:00
__field ( xfs_agblock_t , i3_startblock )
__field ( xfs_extlen_t , i3_blockcount )
__field ( xfs_nlink_t , i3_refcount )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
2022-10-27 00:23:58 +03:00
__entry - > i1_domain = i1 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock = i1 - > rc_startblock ;
__entry - > i1_blockcount = i1 - > rc_blockcount ;
__entry - > i1_refcount = i1 - > rc_refcount ;
2022-10-27 00:23:58 +03:00
__entry - > i2_domain = i2 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock = i2 - > rc_startblock ;
__entry - > i2_blockcount = i2 - > rc_blockcount ;
__entry - > i2_refcount = i2 - > rc_refcount ;
2022-10-27 00:23:58 +03:00
__entry - > i3_domain = i3 - > rc_domain ;
2016-10-03 19:11:15 +03:00
__entry - > i3_startblock = i3 - > rc_startblock ;
__entry - > i3_blockcount = i3 - > rc_blockcount ;
__entry - > i3_refcount = i3 - > rc_refcount ;
) ,
2022-10-27 00:23:58 +03:00
TP_printk ( " dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
" dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
" dom %s agbno 0x%x fsbcount 0x%x refcount %u " ,
2016-10-03 19:11:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i1_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i1_startblock ,
__entry - > i1_blockcount ,
__entry - > i1_refcount ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i2_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i2_startblock ,
__entry - > i2_blockcount ,
__entry - > i2_refcount ,
2022-10-27 00:23:58 +03:00
__print_symbolic ( __entry - > i3_domain , XFS_REFC_DOMAIN_STRINGS ) ,
2016-10-03 19:11:15 +03:00
__entry - > i3_startblock ,
__entry - > i3_blockcount ,
__entry - > i3_refcount )
) ;
/*
 * Declare tracepoint @name as an instance of
 * xfs_refcount_triple_extent_class.  The prototype repeated here must stay
 * identical to the class prototype.
 */
# define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
DEFINE_EVENT ( xfs_refcount_triple_extent_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
struct xfs_refcount_irec * i1 , struct xfs_refcount_irec * i2 , \
struct xfs_refcount_irec * i3 ) , \
TP_ARGS ( mp , agno , i1 , i2 , i3 ) )
/* refcount btree tracepoints */
/*
 * Block alloc/free events are built with DEFINE_BUSY_EVENT and the
 * *_error events with DEFINE_AG_ERROR_EVENT; both helpers are defined
 * outside this part of the file.
 */
DEFINE_BUSY_EVENT ( xfs_refcountbt_alloc_block ) ;
DEFINE_BUSY_EVENT ( xfs_refcountbt_free_block ) ;
/* lookup plus the four single-record operations on the refcount btree */
DEFINE_AG_BTREE_LOOKUP_EVENT ( xfs_refcount_lookup ) ;
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_get ) ;
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_update ) ;
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_insert ) ;
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_delete ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_insert_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_delete_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_update_error ) ;
/* refcount adjustment tracepoints */
/*
 * Increase/decrease events take only an AG extent (via the
 * DEFINE_AG_EXTENT_EVENT alias); the merge/split/find events below carry
 * one, two or three full refcount records depending on the helper used.
 */
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_increase ) ;
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_decrease ) ;
2016-10-03 19:11:39 +03:00
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_cow_increase ) ;
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_cow_decrease ) ;
2016-10-03 19:11:15 +03:00
DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT ( xfs_refcount_merge_center_extents ) ;
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_modify_extent ) ;
2016-10-03 19:11:39 +03:00
DEFINE_REFCOUNT_EXTENT_EVENT ( xfs_refcount_recover_extent ) ;
2016-10-03 19:11:15 +03:00
DEFINE_REFCOUNT_EXTENT_AT_EVENT ( xfs_refcount_split_extent ) ;
DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT ( xfs_refcount_merge_left_extent ) ;
DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT ( xfs_refcount_merge_right_extent ) ;
DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT ( xfs_refcount_find_left_extent ) ;
DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT ( xfs_refcount_find_right_extent ) ;
/* one *_error event per operation above, all built with DEFINE_AG_ERROR_EVENT */
DEFINE_AG_ERROR_EVENT ( xfs_refcount_adjust_error ) ;
2016-10-03 19:11:39 +03:00
DEFINE_AG_ERROR_EVENT ( xfs_refcount_adjust_cow_error ) ;
2016-10-03 19:11:15 +03:00
DEFINE_AG_ERROR_EVENT ( xfs_refcount_merge_center_extents_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_modify_extent_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_split_extent_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_merge_left_extent_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_merge_right_extent_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_find_left_extent_error ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_find_right_extent_error ) ;
/* reflink helpers */
/* shared-extent search: the query, its result, and the failure case */
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_find_shared ) ;
DEFINE_AG_EXTENT_EVENT ( xfs_refcount_find_shared_result ) ;
DEFINE_AG_ERROR_EVENT ( xfs_refcount_find_shared_error ) ;
2016-10-03 19:11:22 +03:00
/*
 * DEFINE_REFCOUNT_DEFERRED_EVENT is an object-like alias for
 * DEFINE_PHYS_EXTENT_DEFERRED_EVENT (defined outside this part of the
 * file), so the two events below take whatever arguments that helper
 * declares.
 */
# define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_REFCOUNT_DEFERRED_EVENT ( xfs_refcount_defer ) ;
DEFINE_REFCOUNT_DEFERRED_EVENT ( xfs_refcount_deferred ) ;
2016-10-03 19:11:15 +03:00
2016-10-03 19:11:21 +03:00
/*
 * Standalone tracepoint recording an AG extent (@agbno, @len) together
 * with an integer @type.
 *
 * Note the output order differs from the argument order: "type" is
 * printed before "agno".  @type is stored and printed as a plain int; no
 * symbolic decoding is done here.
 */
TRACE_EVENT ( xfs_refcount_finish_one_leftover ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
2023-02-01 21:16:04 +03:00
int type , xfs_agblock_t agbno , xfs_extlen_t len ) ,
TP_ARGS ( mp , agno , type , agbno , len ) ,
2016-10-03 19:11:21 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( int , type )
__field ( xfs_agblock_t , agbno )
__field ( xfs_extlen_t , len )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > type = type ;
__entry - > agbno = agbno ;
__entry - > len = len ;
) ,
2023-02-01 21:16:04 +03:00
TP_printk ( " dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x " ,
2016-10-03 19:11:21 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > type ,
__entry - > agno ,
__entry - > agbno ,
2023-02-01 21:16:04 +03:00
__entry - > len )
2016-10-03 19:11:21 +03:00
) ;
2016-10-03 19:11:27 +03:00
/* simple inode-based error/%ip tracepoint class */
/*
 * Event class for an error observed while working on an inode.
 *
 * Records the device (taken from the VFS inode's superblock), the inode
 * number ip->i_ino, the integer @error and the address @caller_ip, which
 * is printed with %pS.
 */
DECLARE_EVENT_CLASS ( xfs_inode_error_class ,
TP_PROTO ( struct xfs_inode * ip , int error , unsigned long caller_ip ) ,
TP_ARGS ( ip , error , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( int , error )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > error = error ;
__entry - > caller_ip = caller_ip ;
) ,
2021-08-17 19:20:27 +03:00
TP_printk ( " dev %d:%d ino 0x%llx error %d caller %pS " ,
2016-10-03 19:11:27 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > error ,
( char * ) __entry - > caller_ip )
) ;
/*
 * Declare tracepoint @name as an instance of xfs_inode_error_class.
 * The prototype repeated here must stay identical to the class prototype.
 */
# define DEFINE_INODE_ERROR_EVENT(name) \
DEFINE_EVENT ( xfs_inode_error_class , name , \
TP_PROTO ( struct xfs_inode * ip , int error , \
unsigned long caller_ip ) , \
TP_ARGS ( ip , error , caller_ip ) )
2016-10-03 19:11:30 +03:00
/* reflink tracepoint classes */
/* two-file io tracepoint class */
/*
 * Event class for an operation spanning a byte range in two inodes.
 *
 * For both @src and @dest this records the inode number, the in-core size
 * (VFS inode i_size), the on-disk size (i_disk_size) and the byte offset
 * (@soffset / @doffset).  The device is taken from @src only.  @len is
 * shared by both sides and is printed first as "bytecount", followed by
 * the source fields, "->", and the destination fields.
 */
DECLARE_EVENT_CLASS ( xfs_double_io_class ,
TP_PROTO ( struct xfs_inode * src , xfs_off_t soffset , xfs_off_t len ,
struct xfs_inode * dest , xfs_off_t doffset ) ,
TP_ARGS ( src , soffset , len , dest , doffset ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , src_ino )
__field ( loff_t , src_isize )
__field ( loff_t , src_disize )
__field ( loff_t , src_offset )
2021-08-17 23:00:13 +03:00
__field ( long long , len )
2016-10-03 19:11:30 +03:00
__field ( xfs_ino_t , dest_ino )
__field ( loff_t , dest_isize )
__field ( loff_t , dest_disize )
__field ( loff_t , dest_offset )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( src ) - > i_sb - > s_dev ;
__entry - > src_ino = src - > i_ino ;
__entry - > src_isize = VFS_I ( src ) - > i_size ;
2021-03-29 21:11:40 +03:00
__entry - > src_disize = src - > i_disk_size ;
2016-10-03 19:11:30 +03:00
__entry - > src_offset = soffset ;
__entry - > len = len ;
__entry - > dest_ino = dest - > i_ino ;
__entry - > dest_isize = VFS_I ( dest ) - > i_size ;
2021-03-29 21:11:40 +03:00
__entry - > dest_disize = dest - > i_disk_size ;
2016-10-03 19:11:30 +03:00
__entry - > dest_offset = doffset ;
) ,
2021-08-17 23:00:13 +03:00
TP_printk ( " dev %d:%d bytecount 0x%llx "
2021-08-17 20:09:12 +03:00
" ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
" ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx " ,
2016-10-03 19:11:30 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > len ,
__entry - > src_ino ,
__entry - > src_isize ,
__entry - > src_disize ,
__entry - > src_offset ,
__entry - > dest_ino ,
__entry - > dest_isize ,
__entry - > dest_disize ,
__entry - > dest_offset )
)
/*
 * Declare tracepoint @name as an instance of xfs_double_io_class.
 * The prototype repeated here must stay identical to the class prototype.
 */
# define DEFINE_DOUBLE_IO_EVENT(name) \
DEFINE_EVENT ( xfs_double_io_class , name , \
TP_PROTO ( struct xfs_inode * src , xfs_off_t soffset , xfs_off_t len , \
struct xfs_inode * dest , xfs_off_t doffset ) , \
TP_ARGS ( src , soffset , len , dest , doffset ) )
/* inode/irec events */
/*
 * Event class pairing an inode with one block mapping record.
 *
 * Copies br_startoff, br_blockcount, br_startblock and br_state out of
 * @irec.  They are printed as "fileoff", "fsbcount", "startblock" and
 * "st" respectively; the state is shown as a raw integer.
 *
 * NOTE(review): br_blockcount is stored in an xfs_extlen_t field and
 * printed with 0x%x.  If br_blockcount is wider than xfs_extlen_t the
 * traced value is truncated -- confirm against struct xfs_bmbt_irec.
 */
DECLARE_EVENT_CLASS ( xfs_inode_irec_class ,
TP_PROTO ( struct xfs_inode * ip , struct xfs_bmbt_irec * irec ) ,
TP_ARGS ( ip , irec ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( xfs_fileoff_t , lblk )
__field ( xfs_extlen_t , len )
__field ( xfs_fsblock_t , pblk )
2017-02-03 02:14:02 +03:00
__field ( int , state )
2016-10-03 19:11:30 +03:00
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > lblk = irec - > br_startoff ;
__entry - > len = irec - > br_blockcount ;
__entry - > pblk = irec - > br_startblock ;
2017-02-03 02:14:02 +03:00
__entry - > state = irec - > br_state ;
2016-10-03 19:11:30 +03:00
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d " ,
2016-10-03 19:11:30 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > lblk ,
__entry - > len ,
2017-02-03 02:14:02 +03:00
__entry - > pblk ,
__entry - > state )
2016-10-03 19:11:30 +03:00
) ;
/*
 * Declare tracepoint @name as an instance of xfs_inode_irec_class.
 * The prototype repeated here must stay identical to the class prototype.
 */
# define DEFINE_INODE_IREC_EVENT(name) \
DEFINE_EVENT ( xfs_inode_irec_class , name , \
TP_PROTO ( struct xfs_inode * ip , struct xfs_bmbt_irec * irec ) , \
TP_ARGS ( ip , irec ) )
2022-11-29 04:24:35 +03:00
/* inode iomap invalidation events */
/*
 * Event class recording an iomap next to two sequence numbers.
 *
 * From @iomap it captures addr, offset (printed as "pos"), length
 * (printed as "bytecount"), type and flags.  @wpcseq is the sequence
 * value handed in by the caller; "forkseq" is the if_seq of the inode
 * fork selected by @whichfork, sampled with READ_ONCE() at trace time.
 * type and flags are stored in u16 fields.
 */
DECLARE_EVENT_CLASS ( xfs_wb_invalid_class ,
TP_PROTO ( struct xfs_inode * ip , const struct iomap * iomap , unsigned int wpcseq , int whichfork ) ,
TP_ARGS ( ip , iomap , wpcseq , whichfork ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( u64 , addr )
__field ( loff_t , pos )
__field ( u64 , len )
__field ( u16 , type )
__field ( u16 , flags )
__field ( u32 , wpcseq )
__field ( u32 , forkseq )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > addr = iomap - > addr ;
__entry - > pos = iomap - > offset ;
__entry - > len = iomap - > length ;
__entry - > type = iomap - > type ;
__entry - > flags = iomap - > flags ;
__entry - > wpcseq = wpcseq ;
__entry - > forkseq = READ_ONCE ( xfs_ifork_ptr ( ip , whichfork ) - > if_seq ) ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > pos ,
__entry - > addr ,
__entry - > len ,
__entry - > type ,
__entry - > flags ,
__entry - > wpcseq ,
__entry - > forkseq )
) ;
/*
 * Declare tracepoint @name as an instance of xfs_wb_invalid_class, then
 * instantiate one event for the CoW iomap and one for the data iomap.
 */
# define DEFINE_WB_INVALID_EVENT(name) \
DEFINE_EVENT ( xfs_wb_invalid_class , name , \
TP_PROTO ( struct xfs_inode * ip , const struct iomap * iomap , unsigned int wpcseq , int whichfork ) , \
TP_ARGS ( ip , iomap , wpcseq , whichfork ) )
DEFINE_WB_INVALID_EVENT ( xfs_wb_cow_iomap_invalid ) ;
DEFINE_WB_INVALID_EVENT ( xfs_wb_data_iomap_invalid ) ;
2022-11-29 04:24:36 +03:00
/*
 * Event class recording an iomap next to its validity cookie.
 *
 * From @iomap it captures addr, offset (printed as "pos"), length
 * (printed as "bytecount"), type, flags and validity_cookie.  "inodeseq"
 * is computed at trace time by xfs_iomap_inode_sequence(ip, iomap->flags),
 * so the output shows the stored cookie and the freshly computed sequence
 * side by side.
 */
DECLARE_EVENT_CLASS ( xfs_iomap_invalid_class ,
TP_PROTO ( struct xfs_inode * ip , const struct iomap * iomap ) ,
TP_ARGS ( ip , iomap ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( u64 , addr )
__field ( loff_t , pos )
__field ( u64 , len )
__field ( u64 , validity_cookie )
__field ( u64 , inodeseq )
__field ( u16 , type )
__field ( u16 , flags )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > addr = iomap - > addr ;
__entry - > pos = iomap - > offset ;
__entry - > len = iomap - > length ;
__entry - > validity_cookie = iomap - > validity_cookie ;
__entry - > type = iomap - > type ;
__entry - > flags = iomap - > flags ;
__entry - > inodeseq = xfs_iomap_inode_sequence ( ip , iomap - > flags ) ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino ,
__entry - > pos ,
__entry - > addr ,
__entry - > len ,
__entry - > type ,
__entry - > flags ,
__entry - > validity_cookie ,
__entry - > inodeseq )
) ;
/*
 * Declare tracepoint @name as an instance of xfs_iomap_invalid_class; the
 * class currently has a single instance, xfs_iomap_invalid.
 */
# define DEFINE_IOMAP_INVALID_EVENT(name) \
DEFINE_EVENT ( xfs_iomap_invalid_class , name , \
TP_PROTO ( struct xfs_inode * ip , const struct iomap * iomap ) , \
TP_ARGS ( ip , iomap ) )
DEFINE_IOMAP_INVALID_EVENT ( xfs_iomap_invalid ) ;
2016-10-03 19:11:30 +03:00
/* refcount/reflink tracepoint definitions */
/* reflink tracepoints */
/*
 * Inode flag set/unset and size update events.  DEFINE_INODE_EVENT and
 * DEFINE_ITRUNC_EVENT are defined outside this part of the file.
 */
DEFINE_INODE_EVENT ( xfs_reflink_set_inode_flag ) ;
DEFINE_INODE_EVENT ( xfs_reflink_unset_inode_flag ) ;
DEFINE_ITRUNC_EVENT ( xfs_reflink_update_inode_size ) ;
2020-06-30 00:47:18 +03:00
/*
 * Standalone tracepoint for remapping a block range from @src to @dest.
 *
 * Unlike the byte-based double-io class, the offsets here are
 * xfs_fileoff_t and the length is xfs_filblks_t; they are printed as
 * "fileoff" and "fsbcount".  The device is taken from @src only, and the
 * length is printed before the source and destination fields.
 */
TRACE_EVENT ( xfs_reflink_remap_blocks ,
2016-10-03 19:11:30 +03:00
TP_PROTO ( struct xfs_inode * src , xfs_fileoff_t soffset ,
xfs_filblks_t len , struct xfs_inode * dest ,
xfs_fileoff_t doffset ) ,
TP_ARGS ( src , soffset , len , dest , doffset ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , src_ino )
__field ( xfs_fileoff_t , src_lblk )
__field ( xfs_filblks_t , len )
__field ( xfs_ino_t , dest_ino )
__field ( xfs_fileoff_t , dest_lblk )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( src ) - > i_sb - > s_dev ;
__entry - > src_ino = src - > i_ino ;
__entry - > src_lblk = soffset ;
__entry - > len = len ;
__entry - > dest_ino = dest - > i_ino ;
__entry - > dest_lblk = doffset ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d fsbcount 0x%llx "
2021-08-17 20:09:12 +03:00
" ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx " ,
2016-10-03 19:11:30 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > len ,
__entry - > src_ino ,
__entry - > src_lblk ,
__entry - > dest_ino ,
__entry - > dest_lblk )
) ;
/* byte-range remap event plus the error events for each remap step */
DEFINE_DOUBLE_IO_EVENT ( xfs_reflink_remap_range ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_remap_range_error ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_set_inode_flag_error ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_update_inode_size_error ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_remap_blocks_error ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_remap_extent_error ) ;
2020-06-30 00:47:18 +03:00
/* per-extent view of a remap: one event for the source mapping, one for the destination */
DEFINE_INODE_IREC_EVENT ( xfs_reflink_remap_extent_src ) ;
DEFINE_INODE_IREC_EVENT ( xfs_reflink_remap_extent_dest ) ;
2016-10-03 19:11:30 +03:00
/* dedupe tracepoints */
DEFINE_DOUBLE_IO_EVENT ( xfs_reflink_compare_extents ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_compare_extents_error ) ;
/* ioctl tracepoints */
/*
 * Standalone tracepoint taking two VFS inodes rather than XFS inodes.
 *
 * Inode numbers come from inode->i_ino and are therefore stored as
 * unsigned long (printed with 0x%lx, not 0x%llx as in the XFS-inode based
 * events).  Sizes are sampled with i_size_read().  The device is taken
 * from @src only.
 */
TRACE_EVENT ( xfs_ioctl_clone ,
TP_PROTO ( struct inode * src , struct inode * dest ) ,
TP_ARGS ( src , dest ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( unsigned long , src_ino )
__field ( loff_t , src_isize )
__field ( unsigned long , dest_ino )
__field ( loff_t , dest_isize )
) ,
TP_fast_assign (
__entry - > dev = src - > i_sb - > s_dev ;
__entry - > src_ino = src - > i_ino ;
__entry - > src_isize = i_size_read ( src ) ;
__entry - > dest_ino = dest - > i_ino ;
__entry - > dest_isize = i_size_read ( dest ) ;
) ,
2021-08-17 23:03:19 +03:00
TP_printk ( " dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx " ,
2016-10-03 19:11:30 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > src_ino ,
__entry - > src_isize ,
__entry - > dest_ino ,
__entry - > dest_isize )
) ;
/* unshare tracepoints */
/*
 * DEFINE_SIMPLE_IO_EVENT is defined outside this part of the file; the
 * irec and error helpers used below are the inode/irec and inode error
 * classes declared earlier in this file.
 */
DEFINE_SIMPLE_IO_EVENT ( xfs_reflink_unshare ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_unshare_error ) ;
/* copy on write */
DEFINE_INODE_IREC_EVENT ( xfs_reflink_trim_around_shared ) ;
2016-10-03 19:11:32 +03:00
DEFINE_INODE_IREC_EVENT ( xfs_reflink_cow_found ) ;
DEFINE_INODE_IREC_EVENT ( xfs_reflink_cow_enospc ) ;
2017-02-03 02:14:02 +03:00
DEFINE_INODE_IREC_EVENT ( xfs_reflink_convert_cow ) ;
2016-10-03 19:11:30 +03:00
DEFINE_SIMPLE_IO_EVENT ( xfs_reflink_cancel_cow_range ) ;
DEFINE_SIMPLE_IO_EVENT ( xfs_reflink_end_cow ) ;
2022-04-26 04:38:15 +03:00
DEFINE_INODE_IREC_EVENT ( xfs_reflink_cow_remap_from ) ;
DEFINE_INODE_IREC_EVENT ( xfs_reflink_cow_remap_to ) ;
2016-10-03 19:11:30 +03:00
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_cancel_cow_range_error ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_reflink_end_cow_error ) ;
DEFINE_INODE_IREC_EVENT ( xfs_reflink_cancel_cow ) ;
2016-10-03 19:11:53 +03:00
/* rmap swapext tracepoints */
DEFINE_INODE_IREC_EVENT ( xfs_swap_extent_rmap_remap ) ;
DEFINE_INODE_IREC_EVENT ( xfs_swap_extent_rmap_remap_piece ) ;
DEFINE_INODE_ERROR_EVENT ( xfs_swap_extent_rmap_error ) ;
2017-03-29 00:56:37 +03:00
/* fsmap traces */
/*
 * Event class recording a reverse-mapping record used as an fsmap key or
 * result.
 *
 * Two devices are recorded: "dev" is the device of @mp, while "keydev" is
 * the caller-supplied @keydev after conversion with new_decode_dev().
 * From @rmap it copies rm_startblock, rm_blockcount, rm_owner, rm_offset
 * and rm_flags, printed as "startblock", "fsbcount", "owner", "fileoff"
 * and "flags".
 */
DECLARE_EVENT_CLASS ( xfs_fsmap_class ,
TP_PROTO ( struct xfs_mount * mp , u32 keydev , xfs_agnumber_t agno ,
2021-08-11 03:02:16 +03:00
const struct xfs_rmap_irec * rmap ) ,
2017-03-29 00:56:37 +03:00
TP_ARGS ( mp , keydev , agno , rmap ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( dev_t , keydev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_fsblock_t , bno )
__field ( xfs_filblks_t , len )
2017-06-16 21:00:05 +03:00
__field ( uint64_t , owner )
__field ( uint64_t , offset )
2017-03-29 00:56:37 +03:00
__field ( unsigned int , flags )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > keydev = new_decode_dev ( keydev ) ;
__entry - > agno = agno ;
__entry - > bno = rmap - > rm_startblock ;
__entry - > len = rmap - > rm_blockcount ;
__entry - > owner = rmap - > rm_owner ;
__entry - > offset = rmap - > rm_offset ;
__entry - > flags = rmap - > rm_flags ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x " ,
2017-03-29 00:56:37 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
MAJOR ( __entry - > keydev ) , MINOR ( __entry - > keydev ) ,
__entry - > agno ,
__entry - > bno ,
__entry - > len ,
__entry - > owner ,
__entry - > offset ,
__entry - > flags )
)
/*
 * Declare tracepoint @name as an instance of xfs_fsmap_class, then
 * instantiate the low key, high key and mapping events.
 */
# define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT ( xfs_fsmap_class , name , \
TP_PROTO ( struct xfs_mount * mp , u32 keydev , xfs_agnumber_t agno , \
2021-08-11 03:02:16 +03:00
const struct xfs_rmap_irec * rmap ) , \
2017-03-29 00:56:37 +03:00
TP_ARGS ( mp , keydev , agno , rmap ) )
DEFINE_FSMAP_EVENT ( xfs_fsmap_low_key ) ;
DEFINE_FSMAP_EVENT ( xfs_fsmap_high_key ) ;
DEFINE_FSMAP_EVENT ( xfs_fsmap_mapping ) ;
/*
 * Event class recording a struct xfs_fsmap.
 *
 * "keydev" is fsmap->fmr_device after conversion with new_decode_dev().
 * fmr_physical and fmr_length are stored in xfs_daddr_t fields and printed
 * as "daddr" and "bbcount"; fmr_offset is printed as "fileoff_daddr".
 * Unlike the rmap-based fsmap class, flags here are 64 bits wide.
 */
DECLARE_EVENT_CLASS ( xfs_getfsmap_class ,
TP_PROTO ( struct xfs_mount * mp , struct xfs_fsmap * fsmap ) ,
TP_ARGS ( mp , fsmap ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( dev_t , keydev )
__field ( xfs_daddr_t , block )
__field ( xfs_daddr_t , len )
2017-06-16 21:00:05 +03:00
__field ( uint64_t , owner )
__field ( uint64_t , offset )
__field ( uint64_t , flags )
2017-03-29 00:56:37 +03:00
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > keydev = new_decode_dev ( fsmap - > fmr_device ) ;
__entry - > block = fsmap - > fmr_physical ;
__entry - > len = fsmap - > fmr_length ;
__entry - > owner = fsmap - > fmr_owner ;
__entry - > offset = fsmap - > fmr_offset ;
__entry - > flags = fsmap - > fmr_flags ;
) ,
2021-08-17 22:45:59 +03:00
TP_printk ( " dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx " ,
2017-03-29 00:56:37 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
MAJOR ( __entry - > keydev ) , MINOR ( __entry - > keydev ) ,
__entry - > block ,
__entry - > len ,
__entry - > owner ,
__entry - > offset ,
__entry - > flags )
)
/*
 * Declare tracepoint @name as an instance of xfs_getfsmap_class, then
 * instantiate the low key, high key and mapping events.
 */
# define DEFINE_GETFSMAP_EVENT(name) \
DEFINE_EVENT ( xfs_getfsmap_class , name , \
TP_PROTO ( struct xfs_mount * mp , struct xfs_fsmap * fsmap ) , \
TP_ARGS ( mp , fsmap ) )
DEFINE_GETFSMAP_EVENT ( xfs_getfsmap_low_key ) ;
DEFINE_GETFSMAP_EVENT ( xfs_getfsmap_high_key ) ;
DEFINE_GETFSMAP_EVENT ( xfs_getfsmap_mapping ) ;
2022-04-26 04:38:13 +03:00
/*
 * Event class recording one transaction reservation.
 *
 * Stores the caller-supplied @type together with tr_logres, tr_logcount
 * and tr_logflags from @res.  @type arrives as unsigned int but is stored
 * in an int field and printed with %d.
 */
DECLARE_EVENT_CLASS ( xfs_trans_resv_class ,
2018-01-08 21:51:26 +03:00
TP_PROTO ( struct xfs_mount * mp , unsigned int type ,
struct xfs_trans_res * res ) ,
TP_ARGS ( mp , type , res ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( int , type )
__field ( uint , logres )
__field ( int , logcount )
__field ( int , logflags )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > type = type ;
__entry - > logres = res - > tr_logres ;
__entry - > logcount = res - > tr_logcount ;
__entry - > logflags = res - > tr_logflags ;
) ,
TP_printk ( " dev %d:%d type %d logres %u logcount %d flags 0x%x " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > type ,
__entry - > logres ,
__entry - > logcount ,
__entry - > logflags )
2022-04-26 04:38:13 +03:00
)
/*
 * Declare tracepoint @name as an instance of xfs_trans_resv_class, then
 * instantiate the two reservation calculation events.
 */
# define DEFINE_TRANS_RESV_EVENT(name) \
DEFINE_EVENT ( xfs_trans_resv_class , name , \
TP_PROTO ( struct xfs_mount * mp , unsigned int type , \
struct xfs_trans_res * res ) , \
TP_ARGS ( mp , type , res ) )
DEFINE_TRANS_RESV_EVENT ( xfs_trans_resv_calc ) ;
DEFINE_TRANS_RESV_EVENT ( xfs_trans_resv_calc_minlogsize ) ;
2018-01-08 21:51:26 +03:00
2022-04-26 04:38:13 +03:00
/*
 * Tracepoint xfs_log_get_max_trans_res: records the device plus the
 * tr_logres and tr_logcount members of the reservation passed in.  Unlike
 * xfs_trans_resv_class, no type value or tr_logflags is captured.
 */
TRACE_EVENT ( xfs_log_get_max_trans_res ,
TP_PROTO ( struct xfs_mount * mp , const struct xfs_trans_res * res ) ,
TP_ARGS ( mp , res ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( uint , logres )
__field ( int , logcount )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > logres = res - > tr_logres ;
__entry - > logcount = res - > tr_logcount ;
) ,
TP_printk ( " dev %d:%d logres %u logcount %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > logres ,
__entry - > logcount )
2018-01-08 21:51:26 +03:00
) ;
2018-05-09 17:47:57 +03:00
/*
 * Event class for transaction lifecycle tracing.  Records the device of the
 * transaction's mount, the ticket's transaction id, the transaction flags
 * and the caller's instruction pointer (printed symbolically with %pS).
 */
DECLARE_EVENT_CLASS ( xfs_trans_class ,
TP_PROTO ( struct xfs_trans * tp , unsigned long caller_ip ) ,
TP_ARGS ( tp , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( uint32_t , tid )
__field ( uint32_t , flags )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = tp - > t_mountp - > m_super - > s_dev ;
/* tid is reported as 0 when the transaction has no ticket attached. */
__entry - > tid = 0 ;
if ( tp - > t_ticket )
__entry - > tid = tp - > t_ticket - > t_tid ;
__entry - > flags = tp - > t_flags ;
__entry - > caller_ip = caller_ip ;
) ,
TP_printk ( " dev %d:%d trans %x flags 0x%x caller %pS " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > tid ,
__entry - > flags ,
( char * ) __entry - > caller_ip )
)
/*
 * Stamp out individual tracepoints from xfs_trans_class.  Each event takes
 * the transaction and the caller's instruction pointer.
 */
# define DEFINE_TRANS_EVENT(name) \
DEFINE_EVENT ( xfs_trans_class , name , \
TP_PROTO ( struct xfs_trans * tp , unsigned long caller_ip ) , \
TP_ARGS ( tp , caller_ip ) )
DEFINE_TRANS_EVENT ( xfs_trans_alloc ) ;
DEFINE_TRANS_EVENT ( xfs_trans_cancel ) ;
DEFINE_TRANS_EVENT ( xfs_trans_commit ) ;
DEFINE_TRANS_EVENT ( xfs_trans_dup ) ;
DEFINE_TRANS_EVENT ( xfs_trans_free ) ;
DEFINE_TRANS_EVENT ( xfs_trans_roll ) ;
DEFINE_TRANS_EVENT ( xfs_trans_add_item ) ;
2019-06-29 05:27:31 +03:00
DEFINE_TRANS_EVENT ( xfs_trans_commit_items ) ;
2018-05-09 17:47:57 +03:00
DEFINE_TRANS_EVENT ( xfs_trans_free_items ) ;
2019-02-07 21:37:14 +03:00
/*
 * Tracepoint xfs_iunlink_update_bucket: records the device, AG number,
 * bucket number and the old and new per-AG inode numbers passed by the
 * caller.  The bucket is printed in decimal; AG and inode numbers in hex.
 */
TRACE_EVENT ( xfs_iunlink_update_bucket ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , unsigned int bucket ,
xfs_agino_t old_ptr , xfs_agino_t new_ptr ) ,
TP_ARGS ( mp , agno , bucket , old_ptr , new_ptr ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( unsigned int , bucket )
__field ( xfs_agino_t , old_ptr )
__field ( xfs_agino_t , new_ptr )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > bucket = bucket ;
__entry - > old_ptr = old_ptr ;
__entry - > new_ptr = new_ptr ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x " ,
2019-02-07 21:37:14 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > bucket ,
__entry - > old_ptr ,
__entry - > new_ptr )
) ;
2019-02-07 21:37:15 +03:00
/*
 * Tracepoint xfs_iunlink_update_dinode: records the device, AG number, the
 * per-AG inode number being updated, and the old and new per-AG inode
 * numbers passed by the caller.  All numbers are printed in hex.
 */
TRACE_EVENT ( xfs_iunlink_update_dinode ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , xfs_agino_t agino ,
xfs_agino_t old_ptr , xfs_agino_t new_ptr ) ,
TP_ARGS ( mp , agno , agino , old_ptr , new_ptr ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agino_t , agino )
__field ( xfs_agino_t , old_ptr )
__field ( xfs_agino_t , new_ptr )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > agino = agino ;
__entry - > old_ptr = old_ptr ;
__entry - > new_ptr = new_ptr ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x " ,
2019-02-07 21:37:15 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno ,
__entry - > agino ,
__entry - > old_ptr ,
__entry - > new_ptr )
) ;
2019-02-07 21:37:16 +03:00
/*
 * Event class that identifies an inode by its AG-relative location.  The
 * inode's i_ino is split into an AG number and a per-AG inode number with
 * XFS_INO_TO_AGNO() and XFS_INO_TO_AGINO(); the device comes from the VFS
 * inode's superblock.
 */
DECLARE_EVENT_CLASS ( xfs_ag_inode_class ,
TP_PROTO ( struct xfs_inode * ip ) ,
TP_ARGS ( ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agino_t , agino )
) ,
TP_fast_assign (
__entry - > dev = VFS_I ( ip ) - > i_sb - > s_dev ;
__entry - > agno = XFS_INO_TO_AGNO ( ip - > i_mount , ip - > i_ino ) ;
__entry - > agino = XFS_INO_TO_AGINO ( ip - > i_mount , ip - > i_ino ) ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x agino 0x%x " ,
2019-02-07 21:37:16 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno , __entry - > agino )
)
/*
 * Stamp out individual tracepoints from xfs_ag_inode_class.  Each event
 * takes only the inode being traced.
 */
# define DEFINE_AGINODE_EVENT(name) \
DEFINE_EVENT ( xfs_ag_inode_class , name , \
TP_PROTO ( struct xfs_inode * ip ) , \
TP_ARGS ( ip ) )
DEFINE_AGINODE_EVENT ( xfs_iunlink ) ;
DEFINE_AGINODE_EVENT ( xfs_iunlink_remove ) ;
2019-04-12 17:40:25 +03:00
/*
 * Event class for filesystem-wide health state changes.  Records the device
 * and the caller-supplied flags word, which is printed in hex.
 */
DECLARE_EVENT_CLASS ( xfs_fs_corrupt_class ,
TP_PROTO ( struct xfs_mount * mp , unsigned int flags ) ,
TP_ARGS ( mp , flags ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( unsigned int , flags )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > flags = flags ;
) ,
TP_printk ( " dev %d:%d flags 0x%x " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > flags )
) ;
/*
 * Stamp out individual tracepoints from xfs_fs_corrupt_class.  The same
 * class is instantiated for both the xfs_fs_* and xfs_rt_* event names.
 */
# define DEFINE_FS_CORRUPT_EVENT(name) \
DEFINE_EVENT ( xfs_fs_corrupt_class , name , \
TP_PROTO ( struct xfs_mount * mp , unsigned int flags ) , \
TP_ARGS ( mp , flags ) )
DEFINE_FS_CORRUPT_EVENT ( xfs_fs_mark_sick ) ;
DEFINE_FS_CORRUPT_EVENT ( xfs_fs_mark_healthy ) ;
2019-04-12 17:41:16 +03:00
DEFINE_FS_CORRUPT_EVENT ( xfs_fs_unfixed_corruption ) ;
2019-04-12 17:40:25 +03:00
DEFINE_FS_CORRUPT_EVENT ( xfs_rt_mark_sick ) ;
DEFINE_FS_CORRUPT_EVENT ( xfs_rt_mark_healthy ) ;
2019-04-12 17:41:16 +03:00
DEFINE_FS_CORRUPT_EVENT ( xfs_rt_unfixed_corruption ) ;
2019-04-12 17:40:25 +03:00
/*
 * Event class for per-AG health state changes.  Records the device, the AG
 * number and the caller-supplied flags word; both are printed in hex.
 */
DECLARE_EVENT_CLASS ( xfs_ag_corrupt_class ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , unsigned int flags ) ,
TP_ARGS ( mp , agno , flags ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( unsigned int , flags )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > flags = flags ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x flags 0x%x " ,
2019-04-12 17:40:25 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > agno , __entry - > flags )
) ;
/*
 * Stamp out individual tracepoints from xfs_ag_corrupt_class.  Each event
 * takes the mount, the AG number and a flags word.
 */
# define DEFINE_AG_CORRUPT_EVENT(name) \
DEFINE_EVENT ( xfs_ag_corrupt_class , name , \
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno , \
unsigned int flags ) , \
TP_ARGS ( mp , agno , flags ) )
DEFINE_AG_CORRUPT_EVENT ( xfs_ag_mark_sick ) ;
DEFINE_AG_CORRUPT_EVENT ( xfs_ag_mark_healthy ) ;
2019-04-12 17:41:16 +03:00
DEFINE_AG_CORRUPT_EVENT ( xfs_ag_unfixed_corruption ) ;
2019-04-12 17:40:25 +03:00
/*
 * Event class for per-inode health state changes.  Records the device of
 * the inode's mount, the filesystem inode number and the caller-supplied
 * flags word; inode number and flags are printed in hex.
 */
DECLARE_EVENT_CLASS ( xfs_inode_corrupt_class ,
TP_PROTO ( struct xfs_inode * ip , unsigned int flags ) ,
TP_ARGS ( ip , flags ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_ino_t , ino )
__field ( unsigned int , flags )
) ,
TP_fast_assign (
__entry - > dev = ip - > i_mount - > m_super - > s_dev ;
__entry - > ino = ip - > i_ino ;
__entry - > flags = flags ;
) ,
TP_printk ( " dev %d:%d ino 0x%llx flags 0x%x " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > ino , __entry - > flags )
) ;
/*
 * Stamp out individual tracepoints from xfs_inode_corrupt_class.  Each
 * event takes the inode and a flags word.
 */
# define DEFINE_INODE_CORRUPT_EVENT(name) \
DEFINE_EVENT ( xfs_inode_corrupt_class , name , \
TP_PROTO ( struct xfs_inode * ip , unsigned int flags ) , \
TP_ARGS ( ip , flags ) )
DEFINE_INODE_CORRUPT_EVENT ( xfs_inode_mark_sick ) ;
DEFINE_INODE_CORRUPT_EVENT ( xfs_inode_mark_healthy ) ;
2019-07-02 19:39:38 +03:00
/*
 * Tracepoint xfs_iwalk_ag: records the device, the AG number and the
 * per-AG starting inode number passed by the caller, printed in hex.
 */
TRACE_EVENT ( xfs_iwalk_ag ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
xfs_agino_t startino ) ,
TP_ARGS ( mp , agno , startino ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agino_t , startino )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > startino = startino ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x startino 0x%x " ,
2019-07-02 19:39:38 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) , __entry - > agno ,
__entry - > startino )
)
/*
 * Tracepoint xfs_iwalk_ag_rec: records the device, the AG number, and the
 * ir_startino and ir_free members of the in-core inode btree record passed
 * in.  ir_free is stored as a 64-bit value and printed in hex as "freemask".
 */
TRACE_EVENT ( xfs_iwalk_ag_rec ,
TP_PROTO ( struct xfs_mount * mp , xfs_agnumber_t agno ,
struct xfs_inobt_rec_incore * irec ) ,
TP_ARGS ( mp , agno , irec ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agino_t , startino )
__field ( uint64_t , freemask )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > agno = agno ;
__entry - > startino = irec - > ir_startino ;
__entry - > freemask = irec - > ir_free ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx " ,
2019-07-02 19:39:38 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) , __entry - > agno ,
__entry - > startino , __entry - > freemask )
)
2019-07-03 17:33:26 +03:00
/*
 * Tracepoint xfs_pwork_init: records the device of @mp together with the
 * caller-supplied nr_threads and pid.
 *
 * pid_t is a signed integer type, so the pid is printed with %d rather than
 * %u to keep the format specifier matched to the argument type.
 */
TRACE_EVENT ( xfs_pwork_init ,
TP_PROTO ( struct xfs_mount * mp , unsigned int nr_threads , pid_t pid ) ,
TP_ARGS ( mp , nr_threads , pid ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( unsigned int , nr_threads )
__field ( pid_t , pid )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > nr_threads = nr_threads ;
__entry - > pid = pid ;
) ,
TP_printk ( " dev %d:%d nr_threads %u pid %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > nr_threads , __entry - > pid )
)
2019-08-26 22:08:10 +03:00
/*
 * Event class for memory allocation tracing.  Records the requested size,
 * the allocation flags and the caller's instruction pointer (printed
 * symbolically with %pS).  No device is recorded for this class.
 */
DECLARE_EVENT_CLASS ( xfs_kmem_class ,
TP_PROTO ( ssize_t size , int flags , unsigned long caller_ip ) ,
TP_ARGS ( size , flags , caller_ip ) ,
TP_STRUCT__entry (
__field ( ssize_t , size )
__field ( int , flags )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > size = size ;
__entry - > flags = flags ;
__entry - > caller_ip = caller_ip ;
) ,
TP_printk ( " size %zd flags 0x%x caller %pS " ,
__entry - > size ,
__entry - > flags ,
( char * ) __entry - > caller_ip )
)
/*
 * Stamp out individual tracepoints from xfs_kmem_class.  Note that the one
 * event defined here, kmem_alloc, carries no xfs_ prefix.
 */
# define DEFINE_KMEM_EVENT(name) \
DEFINE_EVENT ( xfs_kmem_class , name , \
TP_PROTO ( ssize_t size , int flags , unsigned long caller_ip ) , \
TP_ARGS ( size , flags , caller_ip ) )
DEFINE_KMEM_EVENT ( kmem_alloc ) ;
2019-12-12 00:19:06 +03:00
/*
 * Tracepoint xfs_check_new_dalign: records the device, the caller-supplied
 * new_dalign value, the root inode number currently stored in the mount's
 * superblock (mp->m_sb.sb_rootino) and the caller-supplied calc_rootino.
 * Both inode numbers are printed in hex.
 */
TRACE_EVENT ( xfs_check_new_dalign ,
TP_PROTO ( struct xfs_mount * mp , int new_dalign , xfs_ino_t calc_rootino ) ,
TP_ARGS ( mp , new_dalign , calc_rootino ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( int , new_dalign )
__field ( xfs_ino_t , sb_rootino )
__field ( xfs_ino_t , calc_rootino )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > new_dalign = new_dalign ;
__entry - > sb_rootino = mp - > m_sb . sb_rootino ;
__entry - > calc_rootino = calc_rootino ;
) ,
2021-08-17 19:20:27 +03:00
TP_printk ( " dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx " ,
2019-12-12 00:19:06 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > new_dalign , __entry - > sb_rootino ,
__entry - > calc_rootino )
)
2020-03-11 20:40:26 +03:00
/*
 * Tracepoint xfs_btree_commit_afakeroot: records the device, the btree type,
 * the AG number from the cursor's perag, and the af_root, af_levels and
 * af_blocks members of the cursor's fake AG root (cur->bc_ag.afake).
 *
 * The root is a per-AG block number, so it is printed in hexadecimal in line
 * with this file's convention that numbers describing space allocations are
 * formatted in hex.
 */
TRACE_EVENT ( xfs_btree_commit_afakeroot ,
TP_PROTO ( struct xfs_btree_cur * cur ) ,
TP_ARGS ( cur ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_btnum_t , btnum )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( unsigned int , levels )
__field ( unsigned int , blocks )
) ,
TP_fast_assign (
__entry - > dev = cur - > bc_mp - > m_super - > s_dev ;
__entry - > btnum = cur - > bc_btnum ;
2021-06-02 03:48:24 +03:00
__entry - > agno = cur - > bc_ag . pag - > pag_agno ;
2020-03-11 20:40:26 +03:00
__entry - > agbno = cur - > bc_ag . afake - > af_root ;
__entry - > levels = cur - > bc_ag . afake - > af_levels ;
__entry - > blocks = cur - > bc_ag . afake - > af_blocks ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d btree %s agno 0x%x levels %u blocks %u root 0x%x " ,
2020-03-11 20:40:26 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > btnum , XFS_BTNUM_STRINGS ) ,
__entry - > agno ,
__entry - > levels ,
__entry - > blocks ,
__entry - > agbno )
)
2020-03-11 20:42:34 +03:00
/*
 * Tracepoint xfs_btree_commit_ifakeroot: records the device, the btree type,
 * the owning inode (split into AG number and per-AG inode number), the
 * if_levels and if_blocks members of the cursor's fake inode root
 * (cur->bc_ino.ifake) and the fork the cursor refers to.  The btree type and
 * fork are printed symbolically.
 */
TRACE_EVENT ( xfs_btree_commit_ifakeroot ,
TP_PROTO ( struct xfs_btree_cur * cur ) ,
TP_ARGS ( cur ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_btnum_t , btnum )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agino_t , agino )
__field ( unsigned int , levels )
__field ( unsigned int , blocks )
__field ( int , whichfork )
) ,
TP_fast_assign (
__entry - > dev = cur - > bc_mp - > m_super - > s_dev ;
__entry - > btnum = cur - > bc_btnum ;
__entry - > agno = XFS_INO_TO_AGNO ( cur - > bc_mp ,
cur - > bc_ino . ip - > i_ino ) ;
__entry - > agino = XFS_INO_TO_AGINO ( cur - > bc_mp ,
cur - > bc_ino . ip - > i_ino ) ;
__entry - > levels = cur - > bc_ino . ifake - > if_levels ;
__entry - > blocks = cur - > bc_ino . ifake - > if_blocks ;
__entry - > whichfork = cur - > bc_ino . whichfork ;
) ,
2021-08-17 19:24:26 +03:00
TP_printk ( " dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u " ,
2020-03-11 20:42:34 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > btnum , XFS_BTNUM_STRINGS ) ,
__entry - > agno ,
__entry - > agino ,
2021-08-17 23:09:26 +03:00
__print_symbolic ( __entry - > whichfork , XFS_WHICHFORK_STRINGS ) ,
2020-03-11 20:42:34 +03:00
__entry - > levels ,
__entry - > blocks )
)
2020-03-11 20:51:50 +03:00
/*
 * Tracepoint xfs_btree_bload_level_geometry: records the per-level geometry
 * values passed by the caller during a btree bulk load.  The "level %u/%u"
 * pair in the output is the caller-supplied level followed by the cursor's
 * bc_nlevels.  All counts are printed in decimal.
 */
TRACE_EVENT ( xfs_btree_bload_level_geometry ,
TP_PROTO ( struct xfs_btree_cur * cur , unsigned int level ,
uint64_t nr_this_level , unsigned int nr_per_block ,
unsigned int desired_npb , uint64_t blocks ,
uint64_t blocks_with_extra ) ,
TP_ARGS ( cur , level , nr_this_level , nr_per_block , desired_npb , blocks ,
blocks_with_extra ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_btnum_t , btnum )
__field ( unsigned int , level )
__field ( unsigned int , nlevels )
__field ( uint64_t , nr_this_level )
__field ( unsigned int , nr_per_block )
__field ( unsigned int , desired_npb )
__field ( unsigned long long , blocks )
__field ( unsigned long long , blocks_with_extra )
) ,
TP_fast_assign (
__entry - > dev = cur - > bc_mp - > m_super - > s_dev ;
__entry - > btnum = cur - > bc_btnum ;
__entry - > level = level ;
__entry - > nlevels = cur - > bc_nlevels ;
__entry - > nr_this_level = nr_this_level ;
__entry - > nr_per_block = nr_per_block ;
__entry - > desired_npb = desired_npb ;
__entry - > blocks = blocks ;
__entry - > blocks_with_extra = blocks_with_extra ;
) ,
TP_printk ( " dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > btnum , XFS_BTNUM_STRINGS ) ,
__entry - > level ,
__entry - > nlevels ,
__entry - > nr_this_level ,
__entry - > nr_per_block ,
__entry - > desired_npb ,
__entry - > blocks ,
__entry - > blocks_with_extra )
)
/*
 * Tracepoint xfs_btree_bload_block: records one block being written during
 * a btree bulk load: the level, the block index out of nr_blocks, the
 * block's location as an AG number / AG block number pair, and the number
 * of records placed in it.
 */
TRACE_EVENT ( xfs_btree_bload_block ,
TP_PROTO ( struct xfs_btree_cur * cur , unsigned int level ,
uint64_t block_idx , uint64_t nr_blocks ,
union xfs_btree_ptr * ptr , unsigned int nr_records ) ,
TP_ARGS ( cur , level , block_idx , nr_blocks , ptr , nr_records ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( xfs_btnum_t , btnum )
__field ( unsigned int , level )
__field ( unsigned long long , block_idx )
__field ( unsigned long long , nr_blocks )
__field ( xfs_agnumber_t , agno )
__field ( xfs_agblock_t , agbno )
__field ( unsigned int , nr_records )
) ,
TP_fast_assign (
__entry - > dev = cur - > bc_mp - > m_super - > s_dev ;
__entry - > btnum = cur - > bc_btnum ;
__entry - > level = level ;
__entry - > block_idx = block_idx ;
__entry - > nr_blocks = nr_blocks ;
/*
 * Long-pointer btrees store a big-endian 64-bit fsblock that is split
 * into agno/agbno here; otherwise the pointer is a big-endian 32-bit
 * AG block number and the AG number comes from the cursor's perag.
 */
if ( cur - > bc_flags & XFS_BTREE_LONG_PTRS ) {
xfs_fsblock_t fsb = be64_to_cpu ( ptr - > l ) ;
__entry - > agno = XFS_FSB_TO_AGNO ( cur - > bc_mp , fsb ) ;
__entry - > agbno = XFS_FSB_TO_AGBNO ( cur - > bc_mp , fsb ) ;
} else {
2021-06-02 03:48:24 +03:00
__entry - > agno = cur - > bc_ag . pag - > pag_agno ;
2020-03-11 20:51:50 +03:00
__entry - > agbno = be32_to_cpu ( ptr - > s ) ;
}
__entry - > nr_records = nr_records ;
) ,
2021-08-17 19:28:53 +03:00
TP_printk ( " dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u " ,
2020-03-11 20:51:50 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > btnum , XFS_BTNUM_STRINGS ) ,
__entry - > level ,
__entry - > block_idx ,
__entry - > nr_blocks ,
__entry - > agno ,
__entry - > agbno ,
__entry - > nr_records )
)
2020-08-24 21:58:01 +03:00
/*
 * Event class for reporting a supported timestamp range.  Records the device
 * and the caller-supplied min and max values, which arrive as time64_t and
 * are stored and printed as signed long long.
 */
DECLARE_EVENT_CLASS ( xfs_timestamp_range_class ,
TP_PROTO ( struct xfs_mount * mp , time64_t min , time64_t max ) ,
TP_ARGS ( mp , min , max ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( long long , min )
__field ( long long , max )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > min = min ;
__entry - > max = max ;
) ,
TP_printk ( " dev %d:%d min %lld max %lld " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > min ,
__entry - > max )
)
/*
 * Stamp out individual tracepoints from xfs_timestamp_range_class.  The
 * prototype spells min/max as time64_t so that it reads identically to the
 * prototype declared by the event class.
 */
# define DEFINE_TIMESTAMP_RANGE_EVENT(name) \
DEFINE_EVENT ( xfs_timestamp_range_class , name , \
TP_PROTO ( struct xfs_mount * mp , time64_t min , time64_t max ) , \
TP_ARGS ( mp , min , max ) )
DEFINE_TIMESTAMP_RANGE_EVENT ( xfs_inode_timestamp_range ) ;
DEFINE_TIMESTAMP_RANGE_EVENT ( xfs_quota_expiry_range ) ;
2021-06-07 19:34:51 +03:00
/*
 * Event class for inode cache walk requests.  Records the device, the
 * filter fields of the struct xfs_icwalk passed in and the caller's
 * instruction pointer.
 *
 * @icw may be NULL, in which case every filter field is recorded as 0.  The
 * uid and gid are converted with from_kuid()/from_kgid() using the
 * superblock's s_user_ns before being stored.
 */
DECLARE_EVENT_CLASS ( xfs_icwalk_class ,
TP_PROTO ( struct xfs_mount * mp , struct xfs_icwalk * icw ,
2021-01-23 03:48:38 +03:00
unsigned long caller_ip ) ,
2021-06-07 19:34:51 +03:00
TP_ARGS ( mp , icw , caller_ip ) ,
2021-01-23 03:48:38 +03:00
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( __u32 , flags )
__field ( uint32_t , uid )
__field ( uint32_t , gid )
__field ( prid_t , prid )
__field ( __u64 , min_file_size )
2021-06-18 21:57:06 +03:00
__field ( long , scan_limit )
2021-01-23 03:48:38 +03:00
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
2021-06-07 19:34:51 +03:00
__entry - > flags = icw ? icw - > icw_flags : 0 ;
__entry - > uid = icw ? from_kuid ( mp - > m_super - > s_user_ns ,
icw - > icw_uid ) : 0 ;
__entry - > gid = icw ? from_kgid ( mp - > m_super - > s_user_ns ,
icw - > icw_gid ) : 0 ;
__entry - > prid = icw ? icw - > icw_prid : 0 ;
__entry - > min_file_size = icw ? icw - > icw_min_file_size : 0 ;
__entry - > scan_limit = icw ? icw - > icw_scan_limit : 0 ;
2021-01-23 03:48:38 +03:00
__entry - > caller_ip = caller_ip ;
) ,
2021-06-18 21:57:06 +03:00
TP_printk ( " dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS " ,
2021-01-23 03:48:38 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__entry - > flags ,
__entry - > uid ,
__entry - > gid ,
__entry - > prid ,
__entry - > min_file_size ,
2021-05-31 21:32:02 +03:00
__entry - > scan_limit ,
2021-01-23 03:48:38 +03:00
( char * ) __entry - > caller_ip )
) ;
2021-06-07 19:34:51 +03:00
/*
 * Stamp out individual tracepoints from xfs_icwalk_class.  Each event takes
 * the mount, the walk control structure and the caller's instruction
 * pointer.
 */
# define DEFINE_ICWALK_EVENT(name) \
DEFINE_EVENT ( xfs_icwalk_class , name , \
TP_PROTO ( struct xfs_mount * mp , struct xfs_icwalk * icw , \
2021-01-23 03:48:38 +03:00
unsigned long caller_ip ) , \
2021-06-07 19:34:51 +03:00
TP_ARGS ( mp , icw , caller_ip ) )
DEFINE_ICWALK_EVENT ( xfs_ioc_free_eofblocks ) ;
DEFINE_ICWALK_EVENT ( xfs_blockgc_free_space ) ;
2021-01-23 03:48:38 +03:00
2021-06-18 21:57:05 +03:00
/*
 * Register the XLOG_STATE_* enum values with the trace infrastructure.
 * NOTE(review): presumably needed so the symbolic state names printed via
 * XLOG_STATE_STRINGS can be resolved by trace tooling - confirm against the
 * TRACE_DEFINE_ENUM documentation.
 */
TRACE_DEFINE_ENUM ( XLOG_STATE_ACTIVE ) ;
TRACE_DEFINE_ENUM ( XLOG_STATE_WANT_SYNC ) ;
TRACE_DEFINE_ENUM ( XLOG_STATE_SYNCING ) ;
TRACE_DEFINE_ENUM ( XLOG_STATE_DONE_SYNC ) ;
TRACE_DEFINE_ENUM ( XLOG_STATE_CALLBACK ) ;
TRACE_DEFINE_ENUM ( XLOG_STATE_DIRTY ) ;
/*
 * Event class for in-core log buffer (iclog) tracing.  Records the device of
 * the log's mount, the iclog state, reference count, offset, flags, the LSN
 * from the iclog header and the caller's instruction pointer.
 *
 * The state is printed symbolically via XLOG_STATE_STRINGS and the flags as
 * a "|"-separated list via XLOG_ICL_STRINGS.
 */
DECLARE_EVENT_CLASS ( xlog_iclog_class ,
TP_PROTO ( struct xlog_in_core * iclog , unsigned long caller_ip ) ,
TP_ARGS ( iclog , caller_ip ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( uint32_t , state )
__field ( int32_t , refcount )
__field ( uint32_t , offset )
2021-07-28 02:23:50 +03:00
__field ( uint32_t , flags )
2021-06-18 21:57:05 +03:00
__field ( unsigned long long , lsn )
__field ( unsigned long , caller_ip )
) ,
TP_fast_assign (
__entry - > dev = iclog - > ic_log - > l_mp - > m_super - > s_dev ;
__entry - > state = iclog - > ic_state ;
/* Sampled with atomic_read(); the value may change after the event. */
__entry - > refcount = atomic_read ( & iclog - > ic_refcnt ) ;
__entry - > offset = iclog - > ic_offset ;
2021-07-28 02:23:50 +03:00
__entry - > flags = iclog - > ic_flags ;
2021-06-18 21:57:05 +03:00
/* The header stores h_lsn big-endian; convert to CPU order for output. */
__entry - > lsn = be64_to_cpu ( iclog - > ic_header . h_lsn ) ;
__entry - > caller_ip = caller_ip ;
) ,
2021-07-28 02:23:50 +03:00
TP_printk ( " dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS " ,
2021-06-18 21:57:05 +03:00
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_symbolic ( __entry - > state , XLOG_STATE_STRINGS ) ,
__entry - > refcount ,
__entry - > offset ,
__entry - > lsn ,
2021-07-28 02:23:50 +03:00
__print_flags ( __entry - > flags , " | " , XLOG_ICL_STRINGS ) ,
2021-06-18 21:57:05 +03:00
( char * ) __entry - > caller_ip )
) ;
/*
 * Stamp out individual tracepoints from xlog_iclog_class.  Each event takes
 * the iclog and the caller's instruction pointer.
 */
# define DEFINE_ICLOG_EVENT(name) \
DEFINE_EVENT ( xlog_iclog_class , name , \
TP_PROTO ( struct xlog_in_core * iclog , unsigned long caller_ip ) , \
TP_ARGS ( iclog , caller_ip ) )
DEFINE_ICLOG_EVENT ( xlog_iclog_activate ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_clean ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_callback ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_callbacks_start ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_callbacks_done ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_force ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_force_lsn ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_get_space ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_release ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_switch ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_sync ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_syncing ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_sync_done ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_want_sync ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_wait_on ) ;
DEFINE_ICLOG_EVENT ( xlog_iclog_write ) ;
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 08:12:52 +03:00
/* Register the first XFS_DAS_* attr state values with the trace infrastructure. */
TRACE_DEFINE_ENUM ( XFS_DAS_UNINIT ) ;
TRACE_DEFINE_ENUM ( XFS_DAS_SF_ADD ) ;
2022-05-12 08:12:56 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_SF_REMOVE ) ;
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 08:12:56 +03:00
/* Register the XFS_DAS_LEAF_ADD/REMOVE attr state values with the trace infrastructure. */
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_ADD ) ;
2022-05-12 08:12:56 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_REMOVE ) ;
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 08:12:56 +03:00
/*
 * Register XFS_DAS_* state values with TRACE_DEFINE_ENUM(), one invocation
 * per state, ending with XFS_DAS_DONE.
 *
 * NOTE(review): presumably these are the values that xfs_das_state_class
 * decodes by name via __print_symbolic(..., XFS_DAS_STRINGS), so this list
 * should be kept in step with XFS_DAS_STRINGS -- confirm against its
 * definition.
 */
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_ADD ) ;
2022-05-12 08:12:56 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_REMOVE ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_SET_RMT ) ;
2022-05-12 08:12:54 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_ALLOC_RMT ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_REPLACE ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_REMOVE_OLD ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_REMOVE_RMT ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_LEAF_REMOVE_ATTR ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_SET_RMT ) ;
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_ALLOC_RMT ) ;
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_REPLACE ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_REMOVE_OLD ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_REMOVE_RMT ) ;
2022-05-12 08:12:55 +03:00
TRACE_DEFINE_ENUM ( XFS_DAS_NODE_REMOVE_ATTR ) ;
TRACE_DEFINE_ENUM ( XFS_DAS_DONE ) ;
xfs: separate out initial attr_set states
We currently use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and node add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 08:12:52 +03:00
2021-08-08 18:27:13 +03:00
/*
 * Event class shared by the DAS state tracepoints.
 *
 * @das: state value; stored as an int and printed by name through
 *       __print_symbolic() using the XFS_DAS_STRINGS table.
 * @ip:  inode the event refers to; only ip->i_ino is captured, so @ip must
 *       be a valid pointer when the tracepoint fires.
 *
 * The inode number is printed in hexadecimal, following the number
 * formatting conventions listed at the top of this file.
 */
DECLARE_EVENT_CLASS ( xfs_das_state_class ,
TP_PROTO ( int das , struct xfs_inode * ip ) ,
TP_ARGS ( das , ip ) ,
TP_STRUCT__entry (
__field ( int , das )
__field ( xfs_ino_t , ino )
) ,
TP_fast_assign (
__entry - > das = das ;
__entry - > ino = ip - > i_ino ;
) ,
xfs: separate out initial attr_set states
We current use XFS_DAS_UNINIT for several steps in the attr_set
state machine. We use it for setting shortform xattrs, converting
from shortform to leaf, leaf add, leaf-to-node and leaf add. All of
these things are essentially known before we start the state machine
iterating, so we really should separate them out:
XFS_DAS_SF_ADD:
- tries to do a shortform add
- on success -> done
- on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD
- on error, dies.
XFS_DAS_LEAF_ADD:
- tries to do leaf add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK
- on ENOSPC converts to node, -> XFS_DAS_NODE_ADD
- on error, dies
XFS_DAS_NODE_ADD:
- tries to do node add
- on success:
- inline attr -> done
- remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK
- on error, dies
This makes it easier to understand how the state machine starts
up and sets us up on the path to further state machine
simplifications.
This also converts the DAS state tracepoints to use strings rather
than numbers, as converting between enums and numbers requires
manual counting rather than just reading the name.
This also introduces a XFS_DAS_DONE state so that we can trace
successful operation completions easily.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson<allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 08:12:52 +03:00
TP_printk ( " state change %s ino 0x%llx " ,
__print_symbolic ( __entry - > das , XFS_DAS_STRINGS ) ,
__entry - > ino )
2021-08-08 18:27:13 +03:00
)
/*
 * Define a tracepoint called @name as an instance of xfs_das_state_class.
 * Every event created this way takes the same arguments as the class:
 * the state value (int das) and the inode (struct xfs_inode *ip).
 */
# define DEFINE_DAS_STATE_EVENT(name) \
DEFINE_EVENT ( xfs_das_state_class , name , \
TP_PROTO ( int das , struct xfs_inode * ip ) , \
TP_ARGS ( das , ip ) )
/*
 * Tracepoints built from xfs_das_state_class via DEFINE_DAS_STATE_EVENT().
 * Each one records a DAS state value and an inode number.
 * NOTE(review): going by their names these mark return points of the attr
 * set/remove helpers and the deferred attr add/replace/remove operations --
 * confirm against the call sites.
 */
DEFINE_DAS_STATE_EVENT ( xfs_attr_sf_addname_return ) ;
DEFINE_DAS_STATE_EVENT ( xfs_attr_set_iter_return ) ;
2022-05-11 10:01:22 +03:00
DEFINE_DAS_STATE_EVENT ( xfs_attr_leaf_addname_return ) ;
2021-08-08 18:27:13 +03:00
DEFINE_DAS_STATE_EVENT ( xfs_attr_node_addname_return ) ;
DEFINE_DAS_STATE_EVENT ( xfs_attr_remove_iter_return ) ;
2022-05-12 08:12:55 +03:00
DEFINE_DAS_STATE_EVENT ( xfs_attr_rmtval_alloc ) ;
2021-08-08 18:27:13 +03:00
DEFINE_DAS_STATE_EVENT ( xfs_attr_rmtval_remove_return ) ;
2022-05-11 10:05:23 +03:00
DEFINE_DAS_STATE_EVENT ( xfs_attr_defer_add ) ;
DEFINE_DAS_STATE_EVENT ( xfs_attr_defer_replace ) ;
DEFINE_DAS_STATE_EVENT ( xfs_attr_defer_remove ) ;
2021-08-11 03:00:54 +03:00
/*
 * Tracepoint for a forced filesystem shutdown.
 *
 * @mp:       mount being shut down; the device number is taken from
 *            mp->m_super->s_dev and printed as major:minor.
 * @ptag:     tag value, printed as flag names from XFS_PTAG_STRINGS.
 * @flags:    shutdown flags, printed as flag names from
 *            XFS_SHUTDOWN_STRINGS.
 * @fname:    string copied into the trace record with __assign_str().
 * @line_num: integer recorded alongside @fname.
 *
 * NOTE(review): @fname and @line_num presumably identify the source
 * location that requested the shutdown -- confirm against the caller.
 */
TRACE_EVENT ( xfs_force_shutdown ,
TP_PROTO ( struct xfs_mount * mp , int ptag , int flags , const char * fname ,
int line_num ) ,
TP_ARGS ( mp , ptag , flags , fname , line_num ) ,
TP_STRUCT__entry (
__field ( dev_t , dev )
__field ( int , ptag )
__field ( int , flags )
__string ( fname , fname )
__field ( int , line_num )
) ,
TP_fast_assign (
__entry - > dev = mp - > m_super - > s_dev ;
__entry - > ptag = ptag ;
__entry - > flags = flags ;
__assign_str ( fname , fname ) ;
__entry - > line_num = line_num ;
) ,
TP_printk ( " dev %d:%d tag %s flags %s file %s line_num %d " ,
MAJOR ( __entry - > dev ) , MINOR ( __entry - > dev ) ,
__print_flags ( __entry - > ptag , " | " , XFS_PTAG_STRINGS ) ,
__print_flags ( __entry - > flags , " | " , XFS_SHUTDOWN_STRINGS ) ,
__get_str ( fname ) ,
__entry - > line_num )
) ;
2009-12-15 02:14:59 +03:00
# endif /* _TRACE_XFS_H */
# undef TRACE_INCLUDE_PATH
# define TRACE_INCLUDE_PATH .
# define TRACE_INCLUDE_FILE xfs_trace
# include <trace/define_trace.h>