2018-06-06 05:42:14 +03:00
// SPDX-License-Identifier: GPL-2.0
2005-04-17 02:20:36 +04:00
/*
2005-11-02 06:58:39 +03:00
* Copyright ( c ) 2000 - 2003 , 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-17 02:20:36 +04:00
*/
# ifndef __XFS_LOG_H__
# define __XFS_LOG_H__
2019-06-29 05:27:34 +03:00
struct xfs_cil_ctx ;
2013-08-12 14:49:22 +04:00
/*
 * Log vector: bundles a log item's iovec array and formatted buffer together
 * with the size accounting for that buffer and the list linkage used to chain
 * log vectors in the CIL.
 */
struct xfs_log_vec {
2022-07-07 11:55:59 +03:00
struct list_head lv_list ; /* CIL lv chain ptrs */
2022-07-07 11:56:08 +03:00
uint32_t lv_order_id ; /* chain ordering info */
2013-08-12 14:49:22 +04:00
int lv_niovecs ; /* number of iovecs in lv */
struct xfs_log_iovec * lv_iovecp ; /* iovec array */
struct xfs_log_item * lv_item ; /* owner */
char * lv_buf ; /* formatted buffer */
2014-05-20 02:18:09 +04:00
int lv_bytes ; /* accounted space in buffer */
int lv_buf_len ; /* aligned size of buffer */
2013-08-12 14:50:05 +04:00
int lv_size ; /* size of allocated lv */
2013-08-12 14:49:22 +04:00
} ;
# define XFS_LOG_VEC_ORDERED (-1)
2022-05-04 04:45:50 +03:00
/*
 * Return the log iovec length needed to hold @len bytes of user data, i.e.
 * @len rounded up to the next multiple of sizeof(uint32_t).  ->iop_size
 * implementations use this when sizing buffers of arbitrary alignment.
 */
static inline int
xlog_calc_iovec_len(int len)
{
	int	aligned_len = roundup(len, sizeof(uint32_t));

	return aligned_len;
}
2022-04-21 03:34:59 +03:00
void * xlog_prepare_iovec ( struct xfs_log_vec * lv , struct xfs_log_iovec * * vecp ,
uint type ) ;
2013-12-13 04:00:43 +04:00
2013-12-13 04:34:02 +04:00
static inline void
2022-05-04 04:45:50 +03:00
xlog_finish_iovec ( struct xfs_log_vec * lv , struct xfs_log_iovec * vec ,
int data_len )
2013-12-13 04:34:02 +04:00
{
2022-04-21 03:34:59 +03:00
struct xlog_op_header * oph = vec - > i_addr ;
2022-05-04 04:45:50 +03:00
int len ;
2022-04-21 03:34:59 +03:00
2022-05-04 04:45:50 +03:00
/*
* Always round up the length to the correct alignment so callers don ' t
* need to know anything about this log vec layout requirement . This
* means we have to zero the area the data to be written does not cover .
* This is complicated by fact the payload region is offset into the
* logvec region by the opheader that tracks the payload .
*/
len = xlog_calc_iovec_len ( data_len ) ;
if ( len - data_len ! = 0 ) {
char * buf = vec - > i_addr + sizeof ( struct xlog_op_header ) ;
memset ( buf + data_len , 0 , len - data_len ) ;
}
/*
* The opheader tracks aligned payload length , whilst the logvec tracks
* the overall region length .
*/
2022-04-21 03:34:59 +03:00
oph - > oh_len = cpu_to_be32 ( len ) ;
len + = sizeof ( struct xlog_op_header ) ;
2022-04-21 03:34:49 +03:00
lv - > lv_buf_len + = len ;
2014-05-20 02:18:09 +04:00
lv - > lv_bytes + = len ;
2013-12-13 04:34:02 +04:00
vec - > i_len = len ;
2022-05-04 04:45:50 +03:00
/* Catch buffer overruns */
ASSERT ( ( void * ) lv - > lv_buf + lv - > lv_bytes < = ( void * ) lv + lv - > lv_size ) ;
2013-12-13 04:34:02 +04:00
}
2022-05-04 04:45:50 +03:00
/*
* Copy the amount of data requested by the caller into a new log iovec .
*/
2013-12-13 04:34:02 +04:00
static inline void *
xlog_copy_iovec ( struct xfs_log_vec * lv , struct xfs_log_iovec * * vecp ,
uint type , void * data , int len )
{
void * buf ;
buf = xlog_prepare_iovec ( lv , vecp , type ) ;
memcpy ( buf , data , len ) ;
xlog_finish_iovec ( lv , * vecp , len ) ;
return buf ;
}
xfs: share xattr name and value buffers when logging xattr updates
While running xfs/297 and generic/642, I noticed a crash in
xfs_attri_item_relog when it tries to copy the attr name to the new
xattri log item. I think what happened here was that we called
->iop_commit on the old attri item (which nulls out the pointers) as
part of a log force at the same time that a chained attr operation was
ongoing. The system was busy enough that at some later point, the defer
ops operation decided it was necessary to relog the attri log item, but
as we've detached the name buffer from the old attri log item, we can't
copy it to the new one, and kaboom.
I think there's a broader refcounting problem with LARP mode -- the
setxattr code can return to userspace before the CIL actually formats
and commits the log item, which results in a UAF bug. Therefore, the
xattr log item needs to be able to retain a reference to the name and
value buffers until the log items have completely cleared the log.
Furthermore, each time we create an intent log item, we allocate new
memory and (re)copy the contents; sharing here would be very useful.
Solve the UAF and the unnecessary memory allocations by having the log
code create a single refcounted buffer to contain the name and value
contents. This buffer can be passed from old to new during a relog
operation, and the logging code can (optionally) attach it to the
xfs_attr_item for reuse when LARP mode is enabled.
This also fixes a problem where the xfs_attri_log_item objects weren't
being freed back to the same cache where they came from.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-23 01:43:46 +03:00
static inline void *
xlog_copy_from_iovec ( struct xfs_log_vec * lv , struct xfs_log_iovec * * vecp ,
const struct xfs_log_iovec * src )
{
return xlog_copy_iovec ( lv , vecp , src - > i_type , src - > i_addr , src - > i_len ) ;
}
2005-04-17 02:20:36 +04:00
/*
2006-03-29 02:55:14 +04:00
* By comparing each component , we don ' t have to worry about extra
2005-04-17 02:20:36 +04:00
* endian issues in treating two 32 bit numbers as one 64 bit number
*/
2006-01-08 12:04:09 +03:00
static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
{
	/* Cycle numbers are compared first. */
	if (CYCLE_LSN(lsn1) < CYCLE_LSN(lsn2))
		return -999;
	if (CYCLE_LSN(lsn1) > CYCLE_LSN(lsn2))
		return 999;

	/* Same cycle: the block numbers break the tie. */
	if (BLOCK_LSN(lsn1) < BLOCK_LSN(lsn2))
		return -999;
	if (BLOCK_LSN(lsn1) > BLOCK_LSN(lsn2))
		return 999;

	/* Both components match. */
	return 0;
}
# define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
/*
* Flags to xfs_log_force ( )
*
* XFS_LOG_SYNC : Synchronous force in - core log to disk
*/
# define XFS_LOG_SYNC 0x1
/* Log manager interfaces */
struct xfs_mount ;
2010-02-16 02:34:54 +03:00
struct xlog_in_core ;
2008-11-17 09:37:10 +03:00
struct xlog_ticket ;
2010-03-23 02:10:00 +03:00
struct xfs_log_item ;
struct xfs_item_ops ;
2010-05-14 15:41:46 +04:00
struct xfs_trans ;
xfs: AIL needs asynchronous CIL forcing
The AIL pushing is stalling on log forces when it comes across
pinned items. This is happening on removal workloads where the AIL
is dominated by stale items that are removed from AIL when the
checkpoint that marks the items stale is committed to the journal.
This results in relatively few items in the AIL, but those that are
present are often pinned, as the directories that items are being removed
from are still being logged.
As a result, many push cycles through the CIL will first issue a
blocking log force to unpin the items. This can take some time to
complete, with tracing regularly showing push delays of half a
second and sometimes up into the range of several seconds. Sequences
like this aren't uncommon:
....
399.829437: xfsaild: last lsn 0x11002dd000 count 101 stuck 101 flushing 0 tout 20
<wanted 20ms, got 270ms delay>
400.099622: xfsaild: target 0x11002f3600, prev 0x11002f3600, last lsn 0x0
400.099623: xfsaild: first lsn 0x11002f3600
400.099679: xfsaild: last lsn 0x1100305000 count 16 stuck 11 flushing 0 tout 50
<wanted 50ms, got 500ms delay>
400.589348: xfsaild: target 0x110032e600, prev 0x11002f3600, last lsn 0x0
400.589349: xfsaild: first lsn 0x1100305000
400.589595: xfsaild: last lsn 0x110032e600 count 156 stuck 101 flushing 30 tout 50
<wanted 50ms, got 460ms delay>
400.950341: xfsaild: target 0x1100353000, prev 0x110032e600, last lsn 0x0
400.950343: xfsaild: first lsn 0x1100317c00
400.950436: xfsaild: last lsn 0x110033d200 count 105 stuck 101 flushing 0 tout 20
<wanted 20ms, got 200ms delay>
401.142333: xfsaild: target 0x1100361600, prev 0x1100353000, last lsn 0x0
401.142334: xfsaild: first lsn 0x110032e600
401.142535: xfsaild: last lsn 0x1100353000 count 122 stuck 101 flushing 8 tout 10
<wanted 10ms, got 10ms delay>
401.154323: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x1100353000
401.154328: xfsaild: first lsn 0x1100353000
401.154389: xfsaild: last lsn 0x1100353000 count 101 stuck 101 flushing 0 tout 20
<wanted 20ms, got 300ms delay>
401.451525: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0
401.451526: xfsaild: first lsn 0x1100353000
401.451804: xfsaild: last lsn 0x1100377200 count 170 stuck 22 flushing 122 tout 50
<wanted 50ms, got 500ms delay>
401.933581: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0
....
In each of these cases, every AIL pass saw 101 log items stuck on
the AIL (pinned) with very few other items being found. Each pass, a
log force was issued, and delay between last/first is the sleep time
+ the sync log force time.
Some of these 101 items pinned the tail of the log. The tail of the
log does slowly creep forward (first lsn), but the problem is that
the log is actually out of reservation space because it's been
running so many transactions that stale items that never reach the
AIL but consume log space. Hence we have a largely empty AIL, with
long term pins on items that pin the tail of the log that don't get
pushed frequently enough to keep log space available.
The problem is the hundreds of milliseconds that we block in the log
force pushing the CIL out to disk. The AIL should not be stalled
like this - it needs to run and flush items that are at the tail of
the log with minimal latency. What we really need to do is trigger a
log flush, but then not wait for it at all - we've already done our
waiting for stuff to complete when we backed off prior to the log
force being issued.
Even if we remove the XFS_LOG_SYNC from the xfs_log_force() call, we
still do a blocking flush of the CIL and that is what is causing the
issue. Hence we need a new interface for the CIL to trigger an
immediate background push of the CIL to get it moving faster but not
to wait on that to occur. While the CIL is pushing, the AIL can also
be pushing.
We already have an internal interface to do this -
xlog_cil_push_now() - but we need a wrapper for it to be used
externally. xlog_cil_force_seq() can easily be extended to do what
we need as it already implements the synchronous CIL push via
xlog_cil_push_now(). Add the necessary flags and "push current
sequence" semantics to xlog_cil_force_seq() and convert the AIL
pushing to use it.
One of the complexities here is that the CIL push does not guarantee
that the commit record for the CIL checkpoint is written to disk.
The current log force ensures this by submitting the current ACTIVE
iclog that the commit record was written to. We need the CIL to
actually write this commit record to disk for an async push to
ensure that the checkpoint actually makes it to disk and unpins the
pinned items in the checkpoint on completion. Hence we need to pass
down to the CIL push that we are doing an async flush so that it can
switch out the commit_iclog if necessary to get written to disk when
the commit iclog is finally released.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-11 04:00:44 +03:00
struct xlog ;
2010-02-16 02:34:54 +03:00
2018-03-14 09:15:28 +03:00
int xfs_log_force ( struct xfs_mount * mp , uint flags ) ;
2021-06-18 18:21:52 +03:00
int xfs_log_force_seq ( struct xfs_mount * mp , xfs_csn_t seq , uint flags ,
2018-03-14 09:15:28 +03:00
int * log_forced ) ;
2005-04-17 02:20:36 +04:00
int xfs_log_mount ( struct xfs_mount * mp ,
struct xfs_buftarg * log_target ,
xfs_daddr_t start_block ,
int num_bblocks ) ;
2008-08-13 10:49:32 +04:00
int xfs_log_mount_finish ( struct xfs_mount * mp ) ;
2019-07-03 17:34:18 +03:00
void xfs_log_mount_cancel ( struct xfs_mount * ) ;
2012-02-20 06:31:20 +04:00
xfs_lsn_t xlog_assign_tail_lsn ( struct xfs_mount * mp ) ;
2012-04-23 09:58:33 +04:00
xfs_lsn_t xlog_assign_tail_lsn_locked ( struct xfs_mount * mp ) ;
2022-04-21 03:34:33 +03:00
void xfs_log_space_wake ( struct xfs_mount * mp ) ;
int xfs_log_reserve ( struct xfs_mount * mp , int length , int count ,
struct xlog_ticket * * ticket , bool permanent ) ;
int xfs_log_regrant ( struct xfs_mount * mp , struct xlog_ticket * tic ) ;
void xfs_log_unmount ( struct xfs_mount * mp ) ;
2021-01-23 03:48:20 +03:00
bool xfs_log_writable ( struct xfs_mount * mp ) ;
2005-04-17 02:20:36 +04:00
xfs: Introduce delayed logging core code
The delayed logging code only changes in-memory structures and as
such can be enabled and disabled with a mount option. Add the mount
option and emit a warning that this is an experimental feature that
should not be used in production yet.
We also need infrastructure to track committed items that have not
yet been written to the log. This is what the Committed Item List
(CIL) is for.
The log item also needs to be extended to track the current log
vector, the associated memory buffer and its location in the Committed
Item List. Extend the log item and log vector structures to enable
this tracking.
To maintain the current log format for transactions with delayed
logging, we need to introduce a checkpoint transaction and a context
for tracking each checkpoint from initiation to transaction
completion. This includes adding a log ticket for tracking the log
space required/used by the context checkpoint.
To track all the changes we need an io vector array per log item,
rather than a single array for the entire transaction. Using the new
log vector structure for this requires two passes - the first to
allocate the log vector structures and chain them together, and the
second to fill them out. This log vector chain can then be passed
to the CIL for formatting, pinning and insertion into the CIL.
Formatting of the log vector chain is relatively simple - it's just
a loop over the iovecs on each log vector, but it is made slightly
more complex because we re-write the iovec after the copy to point
back at the memory buffer we just copied into.
This code also needs to pin log items. If the log item is not
already tracked in this checkpoint context, then it needs to be
pinned. Otherwise it is already pinned and we don't need to pin it
again.
The only other complexity is calculating the amount of new log space
the formatting has consumed. This needs to be accounted to the
transaction in progress, and the accounting is made more complex
because we also need to steal space from it for log metadata in the
checkpoint transaction. Calculate all this at insert time and update
all the tickets, counters, etc correctly.
Once we've formatted all the log items in the transaction, attach
the busy extents to the checkpoint context so the busy extents live
until checkpoint completion and can be processed at that point in
time. Transactions can then be freed at this point in time.
Now we need to issue checkpoints - we are tracking the amount of log space
used by the items in the CIL, so we can trigger background checkpoints when the
space usage gets to a certain threshold. Otherwise, checkpoints need to be
triggered when a log synchronisation point is reached - a log force event.
Because the log write code already handles chained log vectors, writing the
transaction is trivial, too. Construct a transaction header, add it
to the head of the chain and write it into the log, then issue a
commit record write. Then we can release the checkpoint log ticket
and attach the context to the log buffer so it can be called during
I/O completion to complete the checkpoint.
We also need to allow for synchronising multiple in-flight
checkpoints. This is needed for two things - the first is to ensure
that checkpoint commit records appear in the log in the correct
sequence order (so they are replayed in the correct order). The
second is so that xfs_log_force_lsn() operates correctly and only
flushes and/or waits for the specific sequence it was provided with.
To do this we need a wait variable and a list tracking the
checkpoint commits in progress. We can walk this list and wait for
the checkpoints to change state or complete easily, and this provides
the necessary synchronisation for correct operation in both cases.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 08:37:18 +04:00
struct xlog_ticket * xfs_log_ticket_get ( struct xlog_ticket * ticket ) ;
2008-11-17 09:37:10 +03:00
void xfs_log_ticket_put ( struct xlog_ticket * ticket ) ;
2020-03-20 18:49:20 +03:00
void xlog_cil_process_committed ( struct list_head * list ) ;
2010-05-20 17:19:42 +04:00
bool xfs_log_item_in_current_chkpt ( struct xfs_log_item * lip ) ;
xfs: Introduce delayed logging core code
The delayed logging code only changes in-memory structures and as
such can be enabled and disabled with a mount option. Add the mount
option and emit a warning that this is an experimental feature that
should not be used in production yet.
We also need infrastructure to track committed items that have not
yet been written to the log. This is what the Committed Item List
(CIL) is for.
The log item also needs to be extended to track the current log
vector, the associated memory buffer and its location in the Committed
Item List. Extend the log item and log vector structures to enable
this tracking.
To maintain the current log format for transactions with delayed
logging, we need to introduce a checkpoint transaction and a context
for tracking each checkpoint from initiation to transaction
completion. This includes adding a log ticket for tracking the log
space required/used by the context checkpoint.
To track all the changes we need an io vector array per log item,
rather than a single array for the entire transaction. Using the new
log vector structure for this requires two passes - the first to
allocate the log vector structures and chain them together, and the
second to fill them out. This log vector chain can then be passed
to the CIL for formatting, pinning and insertion into the CIL.
Formatting of the log vector chain is relatively simple - it's just
a loop over the iovecs on each log vector, but it is made slightly
more complex because we re-write the iovec after the copy to point
back at the memory buffer we just copied into.
This code also needs to pin log items. If the log item is not
already tracked in this checkpoint context, then it needs to be
pinned. Otherwise it is already pinned and we don't need to pin it
again.
The only other complexity is calculating the amount of new log space
the formatting has consumed. This needs to be accounted to the
transaction in progress, and the accounting is made more complex
because we also need to steal space from it for log metadata in the
checkpoint transaction. Calculate all this at insert time and update
all the tickets, counters, etc correctly.
Once we've formatted all the log items in the transaction, attach
the busy extents to the checkpoint context so the busy extents live
until checkpoint completion and can be processed at that point in
time. Transactions can then be freed at this point in time.
Now we need to issue checkpoints - we are tracking the amount of log space
used by the items in the CIL, so we can trigger background checkpoints when the
space usage gets to a certain threshold. Otherwise, checkpoints need to be
triggered when a log synchronisation point is reached - a log force event.
Because the log write code already handles chained log vectors, writing the
transaction is trivial, too. Construct a transaction header, add it
to the head of the chain and write it into the log, then issue a
commit record write. Then we can release the checkpoint log ticket
and attach the context to the log buffer so it can be called during
I/O completion to complete the checkpoint.
We also need to allow for synchronising multiple in-flight
checkpoints. This is needed for two things - the first is to ensure
that checkpoint commit records appear in the log in the correct
sequence order (so they are replayed in the correct order). The
second is so that xfs_log_force_lsn() operates correctly and only
flushes and/or waits for the specific sequence it was provided with.
To do this we need a wait variable and a list tracking the
checkpoint commits in progress. We can walk this list and wait for
the checkpoints to change state or complete easily, and this provides
the necessary synchronisation for correct operation in both cases.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 08:37:18 +04:00
2012-10-08 14:56:02 +04:00
void xfs_log_work_queue ( struct xfs_mount * mp ) ;
xfs: cover the log during log quiesce
The log quiesce mechanism historically terminates by marking the log
clean with an unmount record. The primary objective is to indicate
that log recovery is no longer required after the quiesce has
flushed all in-core changes and written back filesystem metadata.
While this is perfectly fine, it is somewhat hacky as currently used
in certain contexts. For example, filesystem freeze quiesces (i.e.
cleans) the log and immediately redirties it with a dummy superblock
transaction to ensure that log recovery runs in the event of a
crash.
While this functions correctly, cleaning the log from freeze context
is clearly superfluous given the current redirtying behavior.
Instead, the desired behavior can be achieved by simply covering the
log. This effectively retires all on-disk log items from the active
range of the log by issuing two synchronous and sequential dummy
superblock update transactions that serve to update the on-disk log
head and tail. The subtle difference is that the log technically
remains dirty due to the lack of an unmount record, though recovery
is effectively a no-op due to the content of the checkpoints being
clean (i.e. the unmodified on-disk superblock).
Log covering currently runs in the background and only triggers once
the filesystem and log have idled. The purpose of the background
mechanism is to prevent log recovery from replaying the most
recently logged items long after those items may have been written
back. In the quiesce path, the log has been deliberately idled by
forcing the log and pushing the AIL until empty in a context where
no further mutable filesystem operations are allowed. Therefore, we
can cover the log as the final step in the log quiesce codepath to
reflect that all previously active items have been successfully
written back.
This facilitates selective log covering from certain contexts (i.e.
freeze) that only seek to quiesce, but not necessarily clean the
log. Note that as a side effect of this change, log covering now
occurs when cleaning the log as well. This is harmless, facilitates
subsequent cleanups, and is mostly temporary as various operations
switch to use explicit log covering.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
2021-01-23 03:48:22 +03:00
int xfs_log_quiesce ( struct xfs_mount * mp ) ;
2021-01-23 03:48:21 +03:00
void xfs_log_clean ( struct xfs_mount * mp ) ;
2015-10-12 07:59:25 +03:00
bool xfs_log_check_lsn ( struct xfs_mount * , xfs_lsn_t ) ;
2012-10-08 14:56:02 +04:00
2020-09-26 03:39:51 +03:00
xfs_lsn_t xlog_grant_push_threshold ( struct xlog * log , int need_bytes ) ;
2022-04-21 03:47:38 +03:00
bool xlog_force_shutdown ( struct xlog * log , uint32_t shutdown_flags ) ;
2020-09-26 03:39:51 +03:00
2021-08-08 18:27:12 +03:00
void xlog_use_incompat_feat ( struct xlog * log ) ;
void xlog_drop_incompat_feat ( struct xlog * log ) ;
2022-05-11 10:01:13 +03:00
int xfs_attr_use_log_assist ( struct xfs_mount * mp ) ;
2021-08-08 18:27:12 +03:00
2005-04-17 02:20:36 +04:00
# endif /* __XFS_LOG_H__ */