2018-06-06 05:42:14 +03:00
// SPDX-License-Identifier: GPL-2.0
2005-04-17 02:20:36 +04:00
/*
2005-11-02 06:58:39 +03:00
* Copyright ( c ) 2000 - 2002 , 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-17 02:20:36 +04:00
*/
# ifndef __XFS_TRANS_H__
# define __XFS_TRANS_H__
2013-08-12 14:49:32 +04:00
/* kernel only transaction subsystem defines */
2008-10-30 09:05:38 +03:00
2022-03-17 19:09:12 +03:00
struct xlog ;
2008-10-30 09:05:38 +03:00
struct xfs_buf ;
struct xfs_buftarg ;
struct xfs_efd_log_item ;
struct xfs_efi_log_item ;
struct xfs_inode ;
struct xfs_item_ops ;
struct xfs_log_iovec ;
struct xfs_mount ;
struct xfs_trans ;
2013-08-12 14:49:59 +04:00
struct xfs_trans_res ;
2008-10-30 09:05:38 +03:00
struct xfs_dquot_acct ;
2016-08-03 05:11:01 +03:00
struct xfs_rud_log_item ;
struct xfs_rui_log_item ;
xfs: propagate bmap updates to rmapbt
When we map, unmap, or convert an extent in a file's data or attr
fork, schedule a respective update in the rmapbt. Previous versions
of this patch required a 1:1 correspondence between bmap and rmap,
but this is no longer true as we now have ability to make interval
queries against the rmapbt.
We use the deferred operations code to handle redo operations
atomically and deadlock free. This plumbs in all five rmap actions
(map, unmap, convert extent, alloc, free); we'll use the first three
now for file data, and reflink will want the last two. We also add
an error injection site to test log recovery.
Finally, we need to fix the bmap shift extent code to adjust the
rmaps correctly.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 05:16:05 +03:00
struct xfs_btree_cur ;
2016-10-03 19:11:21 +03:00
struct xfs_cui_log_item ;
2016-10-03 19:11:22 +03:00
struct xfs_cud_log_item ;
2016-10-03 19:11:26 +03:00
struct xfs_bui_log_item ;
2016-10-03 19:11:28 +03:00
struct xfs_bud_log_item ;
2008-10-30 09:05:38 +03:00
2019-06-29 05:27:33 +03:00
struct xfs_log_item {
2008-10-30 09:05:38 +03:00
struct list_head li_ail ; /* AIL pointers */
2018-05-09 17:49:37 +03:00
struct list_head li_trans ; /* transaction list */
2008-10-30 09:05:38 +03:00
xfs_lsn_t li_lsn ; /* last on-disk lsn */
2022-03-17 19:09:12 +03:00
struct xlog * li_log ;
2008-10-30 09:39:46 +03:00
struct xfs_ail * li_ailp ; /* ptr to AIL */
2008-10-30 09:05:38 +03:00
uint li_type ; /* item type */
2018-05-09 17:47:34 +03:00
unsigned long li_flags ; /* misc flags */
xfs: Properly retry failed inode items in case of error during buffer writeback
When a buffer has been failed during writeback, the inode items into it
are kept flush locked, and are never resubmitted due to the flush lock, so,
if any buffer fails to be written, the items in AIL are never written to
disk and never unlocked.
This causes unmount operation to hang due to these items being flush locked in AIL,
but this also causes the items in AIL to never be written back, even when
the IO device comes back to normal.
I've been testing this patch with a DM-thin device, creating a
filesystem larger than the real device.
When writing enough data to fill the DM-thin device, XFS receives ENOSPC
errors from the device, and keeps spinning on xfsaild (when 'retry
forever' configuration is set).
At this point, the filesystem can not be unmounted because of the flush locked
items in AIL, but worse, the items in AIL are never retried at all
(once xfs_inode_item_push() will skip the items that are flush locked),
even if the underlying DM-thin device is expanded to the proper size.
This patch fixes both cases, retrying any item that has been failed
previously, using the infra-structure provided by the previous patch.
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-08-09 04:21:50 +03:00
struct xfs_buf * li_buf ; /* real buffer pointer */
2018-01-25 00:38:49 +03:00
struct list_head li_bio_list ; /* buffer item list */
2011-10-28 13:54:24 +04:00
const struct xfs_item_ops * li_ops ; /* function list */
xfs: Introduce delayed logging core code
The delayed logging code only changes in-memory structures and as
such can be enabled and disabled with a mount option. Add the mount
option and emit a warning that this is an experimental feature that
should not be used in production yet.
We also need infrastructure to track committed items that have not
yet been written to the log. This is what the Committed Item List
(CIL) is for.
The log item also needs to be extended to track the current log
vector, the associated memory buffer and its location in the Commit
Item List. Extend the log item and log vector structures to enable
this tracking.
To maintain the current log format for transactions with delayed
logging, we need to introduce a checkpoint transaction and a context
for tracking each checkpoint from initiation to transaction
completion. This includes adding a log ticket for tracking the log
space required/used by the context checkpoint.
To track all the changes we need an io vector array per log item,
rather than a single array for the entire transaction. Using the new
log vector structure for this requires two passes - the first to
allocate the log vector structures and chain them together, and the
second to fill them out. This log vector chain can then be passed
to the CIL for formatting, pinning and insertion into the CIL.
Formatting of the log vector chain is relatively simple - it's just
a loop over the iovecs on each log vector, but it is made slightly
more complex because we re-write the iovec after the copy to point
back at the memory buffer we just copied into.
This code also needs to pin log items. If the log item is not
already tracked in this checkpoint context, then it needs to be
pinned. Otherwise it is already pinned and we don't need to pin it
again.
The only other complexity is calculating the amount of new log space
the formatting has consumed. This needs to be accounted to the
transaction in progress, and the accounting is made more complex
because we also need to steal space from it for log metadata in the
checkpoint transaction. Calculate all this at insert time and update
all the tickets, counters, etc correctly.
Once we've formatted all the log items in the transaction, attach
the busy extents to the checkpoint context so the busy extents live
until checkpoint completion and can be processed at that point in
time. Transactions can then be freed at this point in time.
Now we need to issue checkpoints - we are tracking the amount of log space
used by the items in the CIL, so we can trigger background checkpoints when the
space usage gets to a certain threshold. Otherwise, checkpoints need to be
triggered when a log synchronisation point is reached - a log force event.
Because the log write code already handles chained log vectors, writing the
transaction is trivial, too. Construct a transaction header, add it
to the head of the chain and write it into the log, then issue a
commit record write. Then we can release the checkpoint log ticket
and attach the context to the log buffer so it can be called during
IO completion to complete the checkpoint.
We also need to allow for synchronising multiple in-flight
checkpoints. This is needed for two things - the first is to ensure
that checkpoint commit records appear in the log in the correct
sequence order (so they are replayed in the correct order). The
second is so that xfs_log_force_lsn() operates correctly and only
flushes and/or waits for the specific sequence it was provided with.
To do this we need a wait variable and a list tracking the
checkpoint commits in progress. We can walk this list and wait for
the checkpoints to change state or complete easily, and this provides
the necessary synchronisation for correct operation in both cases.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 08:37:18 +04:00
/* delayed logging */
struct list_head li_cil ; /* CIL pointers */
struct xfs_log_vec * li_lv ; /* active log vector */
xfs: allocate log vector buffers outside CIL context lock
One of the problems we currently have with delayed logging is that
under serious memory pressure we can deadlock memory reclaim. This
occurs when memory reclaim (such as run by kswapd) is reclaiming XFS
inodes and issues a log force to unpin inodes that are dirty in the
CIL.
The CIL is pushed, but this will only occur once it gets the CIL
context lock to ensure that all committing transactions are complete
and no new transactions start being committed to the CIL while the
push switches to a new context.
The deadlock occurs when the CIL context lock is held by a
committing process that is doing memory allocation for log vector
buffers, and that allocation is then blocked on memory reclaim
making progress. Memory reclaim, however, is blocked waiting for
a log force to make progress, and so we effectively deadlock at this
point.
To solve this problem, we have to move the CIL log vector buffer
allocation outside of the context lock so that memory reclaim can
always make progress when it needs to force the log. The problem
with doing this is that a CIL push can take place while we are
determining if we need to allocate a new log vector buffer for
an item and hence the current log vector may go away without
warning. That means we cannot rely on the existing log vector being
present when we finally grab the context lock and so we must have a
replacement buffer ready to go at all times.
To ensure this, introduce a "shadow log vector" buffer that is
always guaranteed to be present when we gain the CIL context lock
and format the item. This shadow buffer may or may not be used
during the formatting, but if the log item does not have an existing
log vector buffer or that buffer is too small for the new
modifications, we swap it for the new shadow buffer and format
the modifications into that new log vector buffer.
The result of this is that for any object we modify more than once
in a given CIL checkpoint, we double the memory required
to track dirty regions in the log. For single modifications then
we consume the shadow log vector we allocate on commit, and that gets
consumed by the checkpoint. However, if we make multiple
modifications, then the second transaction commit will allocate a
shadow log vector and hence we will end up with double the memory
usage as only one of the log vectors is consumed by the CIL
checkpoint. The remaining shadow vector will be freed when the log
item is freed.
This can probably be optimised in future - access to the shadow log
vector is serialised by the object lock (as opposed to the active
log vector, which is controlled by the CIL context lock) and so we
can probably free shadow log vector from some objects when the log
item is marked clean on removal from the AIL.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-07-22 02:52:35 +03:00
struct xfs_log_vec * li_lv_shadow ; /* standby vector */
2021-06-18 18:21:52 +03:00
xfs_csn_t li_seq ; /* CIL commit seq */
2022-07-07 11:53:59 +03:00
uint32_t li_order_id ; /* CIL commit order */
2019-06-29 05:27:33 +03:00
} ;
2008-10-30 09:05:38 +03:00
2018-05-09 17:47:34 +03:00
/*
* li_flags use the (set/test/clear)_bit atomic interfaces because updates can
* race with each other and we don't want to have to use the AIL lock to
* serialise all updates.
*/
# define XFS_LI_IN_AIL 0
# define XFS_LI_ABORTED 1
# define XFS_LI_FAILED 2
xfs: intent item whiteouts
When we log modifications based on intents, we add both intent
and intent done items to the modification being made. These get
written to the log to ensure that the operation is re-run if the
intent done is not found in the log.
However, for operations that complete wholly within a single
checkpoint, the change in the checkpoint is atomic and will never
need replay. In this case, we don't need to actually write the
intent and intent done items to the journal because log recovery
will never need to manually restart this modification.
Log recovery currently handles intent/intent done matching by
inserting the intent into the AIL, then removing it when a matching
intent done item is found. Hence for all the intent-based operations
that complete within a checkpoint, we spend all that time parsing
the intent/intent done items just to cancel them and do nothing with
them.
Hence it follows that the only time we actually need intents in the
log is when the modification crosses checkpoint boundaries in the
log and so may only be partially complete in the journal. Hence if
we commit an intent done item to the CIL and the intent item is in
the same checkpoint, we don't actually have to write them to the
journal because log recovery will always cancel the intents.
We've never really worried about the overhead of logging intents
unnecessarily like this because the intents we log are generally
very much smaller than the change being made. e.g. freeing an extent
involves modifying at least two freespace btree blocks and the AGF,
so the EFI/EFD overhead is only a small increase in space and
processing time compared to the overall cost of freeing an extent.
However, delayed attributes change this cost equation dramatically,
especially for inline attributes. In the case of adding an inline
attribute, we only log the inode core and attribute fork at present.
With delayed attributes, we now log the attr intent which includes
the name and value, the inode core and attr fork, and finally the
attr intent done item. We increase the number of items we log from 1
to 3, and the number of log vectors (regions) goes up from 3 to 7.
Hence we triple the number of objects that the CIL has to process,
and more than double the number of log vectors that need to be
written to the journal.
At scale, this means delayed attributes cause a non-pipelined CIL to
become CPU bound processing all the extra items, resulting in a > 40%
performance degradation on 16-way file+xattr create workloads.
Pipelining the CIL (as per 5.15) reduces the performance degradation
to 20%, but now the limitation is the rate at which the log items
can be written to the iclogs and iclogs be dispatched for IO and
completed.
Even log IO completion is slowed down by these intents, because it
now has to process 3x the number of items in the checkpoint.
Processing completed intents is especially inefficient here, because
we first insert the intent into the AIL, then remove it from the AIL
when the intent done is processed. IOWs, we are also doing expensive
operations in log IO completion we could completely avoid if we
didn't log completed intent/intent done pairs.
Enter log item whiteouts.
When an intent done is committed, we can check to see if the
associated intent is in the same checkpoint as we are currently
committing the intent done to. If so, we can mark the intent log
item with a whiteout and immediately free the intent done item
rather than committing it to the CIL. We can basically skip the
entire formatting and CIL insertion steps for the intent done item.
However, we cannot remove the intent item from the CIL at this point
because the unlocked per-cpu CIL item lists do not permit removal
without holding the CIL context lock exclusively. Transaction commit
only holds the context lock shared, hence the best we can do is mark
the intent item with a whiteout so that the CIL push can release it
rather than writing it to the log.
This means we never write the intent to the log if the intent done
has also been committed to the same checkpoint, but we'll always
write the intent if the intent done has not been committed or has
been committed to a different checkpoint. This will result in
correct log recovery behaviour in all cases, without the overhead of
logging unnecessary intents.
This intent whiteout concept is generic - we can apply it to all
intent/intent done pairs that have a direct 1:1 relationship. The
way deferred ops iterate and relog intents mean that all intents
currently have a 1:1 relationship with their done intent, and hence
we can apply this cancellation to all existing intent/intent done
implementations.
For delayed attributes with a 16-way 64kB xattr create workload,
whiteouts reduce the amount of journalled metadata from ~2.5GB/s
down to ~600MB/s and improve the creation rate from 9000/s to
14000/s.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-04 04:50:29 +03:00
# define XFS_LI_DIRTY 3
# define XFS_LI_WHITEOUT 4
2008-10-30 09:05:38 +03:00
2009-12-15 02:14:59 +03:00
# define XFS_LI_FLAGS \
2022-04-21 03:47:07 +03:00
{ ( 1u < < XFS_LI_IN_AIL ) , " IN_AIL " } , \
{ ( 1u < < XFS_LI_ABORTED ) , " ABORTED " } , \
{ ( 1u < < XFS_LI_FAILED ) , " FAILED " } , \
xfs: intent item whiteouts
When we log modifications based on intents, we add both intent
and intent done items to the modification being made. These get
written to the log to ensure that the operation is re-run if the
intent done is not found in the log.
However, for operations that complete wholly within a single
checkpoint, the change in the checkpoint is atomic and will never
need replay. In this case, we don't need to actually write the
intent and intent done items to the journal because log recovery
will never need to manually restart this modification.
Log recovery currently handles intent/intent done matching by
inserting the intent into the AIL, then removing it when a matching
intent done item is found. Hence for all the intent-based operations
that complete within a checkpoint, we spend all that time parsing
the intent/intent done items just to cancel them and do nothing with
them.
Hence it follows that the only time we actually need intents in the
log is when the modification crosses checkpoint boundaries in the
log and so may only be partially complete in the journal. Hence if
we commit an intent done item to the CIL and the intent item is in
the same checkpoint, we don't actually have to write them to the
journal because log recovery will always cancel the intents.
We've never really worried about the overhead of logging intents
unnecessarily like this because the intents we log are generally
very much smaller than the change being made. e.g. freeing an extent
involves modifying at least two freespace btree blocks and the AGF,
so the EFI/EFD overhead is only a small increase in space and
processing time compared to the overall cost of freeing an extent.
However, delayed attributes change this cost equation dramatically,
especially for inline attributes. In the case of adding an inline
attribute, we only log the inode core and attribute fork at present.
With delayed attributes, we now log the attr intent which includes
the name and value, the inode core and attr fork, and finally the
attr intent done item. We increase the number of items we log from 1
to 3, and the number of log vectors (regions) goes up from 3 to 7.
Hence we triple the number of objects that the CIL has to process,
and more than double the number of log vectors that need to be
written to the journal.
At scale, this means delayed attributes cause a non-pipelined CIL to
become CPU bound processing all the extra items, resulting in a > 40%
performance degradation on 16-way file+xattr create workloads.
Pipelining the CIL (as per 5.15) reduces the performance degradation
to 20%, but now the limitation is the rate at which the log items
can be written to the iclogs and iclogs be dispatched for IO and
completed.
Even log IO completion is slowed down by these intents, because it
now has to process 3x the number of items in the checkpoint.
Processing completed intents is especially inefficient here, because
we first insert the intent into the AIL, then remove it from the AIL
when the intent done is processed. IOWs, we are also doing expensive
operations in log IO completion we could completely avoid if we
didn't log completed intent/intent done pairs.
Enter log item whiteouts.
When an intent done is committed, we can check to see if the
associated intent is in the same checkpoint as we are currently
committing the intent done to. If so, we can mark the intent log
item with a whiteout and immediately free the intent done item
rather than committing it to the CIL. We can basically skip the
entire formatting and CIL insertion steps for the intent done item.
However, we cannot remove the intent item from the CIL at this point
because the unlocked per-cpu CIL item lists do not permit removal
without holding the CIL context lock exclusively. Transaction commit
only holds the context lock shared, hence the best we can do is mark
the intent item with a whiteout so that the CIL push can release it
rather than writing it to the log.
This means we never write the intent to the log if the intent done
has also been committed to the same checkpoint, but we'll always
write the intent if the intent done has not been committed or has
been committed to a different checkpoint. This will result in
correct log recovery behaviour in all cases, without the overhead of
logging unnecessary intents.
This intent whiteout concept is generic - we can apply it to all
intent/intent done pairs that have a direct 1:1 relationship. The
way deferred ops iterate and relog intents mean that all intents
currently have a 1:1 relationship with their done intent, and hence
we can apply this cancellation to all existing intent/intent done
implementations.
For delayed attributes with a 16-way 64kB xattr create workload,
whiteouts reduce the amount of journalled metadata from ~2.5GB/s
down to ~600MB/s and improve the creation rate from 9000/s to
14000/s.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-04 04:50:29 +03:00
{ ( 1u < < XFS_LI_DIRTY ) , " DIRTY " } , \
{ ( 1u < < XFS_LI_WHITEOUT ) , " WHITEOUT " }
2009-12-15 02:14:59 +03:00
2011-10-28 13:54:24 +04:00
/*
 * Table of per-item-type operations. Each struct xfs_log_item refers to one
 * of these tables through its li_ops pointer.
 */
struct xfs_item_ops {
2019-06-29 05:27:32 +03:00
/* bitmask of the XFS_ITEM_* log item ops flags (tested by xlog_item_is_intent() and friends) */
unsigned flags ;
2019-06-29 05:27:33 +03:00
void ( * iop_size ) ( struct xfs_log_item * , int * , int * ) ;
void ( * iop_format ) ( struct xfs_log_item * , struct xfs_log_vec * ) ;
void ( * iop_pin ) ( struct xfs_log_item * ) ;
void ( * iop_unpin ) ( struct xfs_log_item * , int remove ) ;
xfs: add log item precommit operation
For inodes that are dirty, we have an attached cluster buffer that
we want to use to track the dirty inode through the AIL.
Unfortunately, locking the cluster buffer and adding it to the
transaction when the inode is first logged in a transaction leads to
buffer lock ordering inversions.
The specific problem is ordering against the AGI buffer. When
modifying unlinked lists, the buffer lock order is AGI -> inode
cluster buffer as the AGI buffer lock serialises all access to the
unlinked lists. Unfortunately, functionality like xfs_droplink()
logs the inode before calling xfs_iunlink(), as do various directory
manipulation functions. The inode can be logged way down in the
stack as far as the bmapi routines and hence, without a major
rewrite of lots of APIs there's no way we can avoid the inode being
logged by something until after the AGI has been logged.
As we are going to be using ordered buffers for inode AIL tracking,
there isn't a need to actually lock that buffer against modification
as all the modifications are captured by logging the inode item
itself. Hence we don't actually need to join the cluster buffer into
the transaction until just before it is committed. This means we do
not perturb any of the existing buffer lock orders in transactions,
and the inode cluster buffer is always locked last in a transaction
that doesn't otherwise touch inode cluster buffers.
We do this by introducing a precommit log item method. This commit
just introduces the mechanism; the inode item implementation is in
followup commits.
The precommit items need to be sorted into consistent order as we
may be locking multiple items here. Hence if we have two dirty
inodes in cluster buffers A and B, and some other transaction has
two separate dirty inodes in the same cluster buffers, locking them
in different orders opens us up to ABBA deadlocks. Hence we sort the
items on the transaction based on the presence of a sort log item
method.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2022-07-14 04:47:26 +03:00
/* sort key for the item; items providing this method are sorted into a consistent order before precommit */
uint64_t ( * iop_sort ) ( struct xfs_log_item * lip ) ;
/* precommit hook for the item; NOTE(review): presumably a non-zero return is an error - confirm against callers */
int ( * iop_precommit ) ( struct xfs_trans * tp , struct xfs_log_item * lip ) ;
2021-06-18 18:21:52 +03:00
/* receives a CIL commit sequence number (xfs_csn_t) for the item */
void ( * iop_committing ) ( struct xfs_log_item * lip , xfs_csn_t seq ) ;
2019-06-29 05:27:33 +03:00
xfs_lsn_t ( * iop_committed ) ( struct xfs_log_item * , xfs_lsn_t ) ;
xfs: add log item precommit operation
For inodes that are dirty, we have an attached cluster buffer that
we want to use to track the dirty inode through the AIL.
Unfortunately, locking the cluster buffer and adding it to the
transaction when the inode is first logged in a transaction leads to
buffer lock ordering inversions.
The specific problem is ordering against the AGI buffer. When
modifying unlinked lists, the buffer lock order is AGI -> inode
cluster buffer as the AGI buffer lock serialises all access to the
unlinked lists. Unfortunately, functionality like xfs_droplink()
logs the inode before calling xfs_iunlink(), as do various directory
manipulation functions. The inode can be logged way down in the
stack as far as the bmapi routines and hence, without a major
rewrite of lots of APIs there's no way we can avoid the inode being
logged by something until after the AGI has been logged.
As we are going to be using ordered buffers for inode AIL tracking,
there isn't a need to actually lock that buffer against modification
as all the modifications are captured by logging the inode item
itself. Hence we don't actually need to join the cluster buffer into
the transaction until just before it is committed. This means we do
not perturb any of the existing buffer lock orders in transactions,
and the inode cluster buffer is always locked last in a transaction
that doesn't otherwise touch inode cluster buffers.
We do this by introducing a precommit log item method. This commit
just introduces the mechanism; the inode item implementation is in
followup commits.
The precommit items need to be sorted into consistent order as we
may be locking multiple items here. Hence if we have two dirty
inodes in cluster buffers A and B, and some other transaction has
two separate dirty inodes in the same cluster buffers, locking them
in different orders opens us up to ABBA deadlocks. Hence we sort the
items on the transaction based on the presence of a sort log item
method.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2022-07-14 04:47:26 +03:00
/* push the item for writeback; returns one of the XFS_ITEM_SUCCESS/PINNED/LOCKED/FLUSHING codes, list_head is the buffer list to add to */
uint ( * iop_push ) ( struct xfs_log_item * , struct list_head * ) ;
void ( * iop_release ) ( struct xfs_log_item * ) ;
2020-05-02 02:00:54 +03:00
bool ( * iop_match ) ( struct xfs_log_item * item , uint64_t id ) ;
2022-05-04 04:46:39 +03:00
/* takes an intent-done item; NOTE(review): presumably returns the intent item it completes - confirm */
struct xfs_log_item * ( * iop_intent ) ( struct xfs_log_item * intent_done ) ;
2011-10-28 13:54:24 +04:00
} ;
2008-10-30 09:05:38 +03:00
2022-05-04 04:46:09 +03:00
/*
* Log item ops flags
*/
/*
* Release the log item when the journal commits instead of inserting into the
* AIL for writeback tracking and/or log tail pinning.
*/
# define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
# define XFS_ITEM_INTENT (1 << 1)
# define XFS_ITEM_INTENT_DONE (1 << 2)
2020-09-23 19:13:28 +03:00
/*
 * Report whether a log item is an intent item, i.e. whether the ops table it
 * points at has XFS_ITEM_INTENT set in flags.
 */
static inline bool
xlog_item_is_intent ( struct xfs_log_item * lip )
{
2022-05-04 04:46:09 +03:00
return lip - > li_ops - > flags & XFS_ITEM_INTENT ;
2020-09-23 19:13:28 +03:00
}
/*
 * Report whether a log item is an intent-done item, i.e. whether the ops
 * table it points at has XFS_ITEM_INTENT_DONE set in flags.
 */
static inline bool
xlog_item_is_intent_done ( struct xfs_log_item * lip )
{
2022-05-04 04:46:09 +03:00
return lip - > li_ops - > flags & XFS_ITEM_INTENT_DONE ;
2020-09-23 19:13:28 +03:00
}
2013-10-23 03:50:10 +04:00
void xfs_log_item_init ( struct xfs_mount * mp , struct xfs_log_item * item ,
int type , const struct xfs_item_ops * ops ) ;
2008-10-30 09:05:38 +03:00
/*
2013-08-28 15:12:03 +04:00
* Return values for the iop_push ( ) routines .
2008-10-30 09:05:38 +03:00
*/
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.
This is now easily doable given that we have very few places left that write
delwri buffers:
- log recovery:
Only done at mount time, and already forcing out the buffers
synchronously using xfs_flush_buftarg
- quotacheck:
Same story.
- dquot reclaim:
Writes out dirty dquots on the LRU under memory pressure. We might
want to look into doing more of this via xfsaild, but it's already
more optimal than the synchronous inode reclaim that writes each
buffer synchronously.
- xfsaild:
This is the main beneficiary of the change. By keeping a local list
of buffers to write we reduce latency of writing out buffers, and
more importantly we can remove all the delwri list promotions which
were hitting the buffer cache hard under sustained metadata loads.
The implementation is very straight forward - xfs_buf_delwri_queue now gets
a new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwri_submit or
xfs_buf_delwri_submit_nowait. Buffers that already are on a delwri list are
skipped in xfs_buf_delwri_queue, assuming they already are on another delwri
list. The biggest change to pass down the buffer list was done to the AIL
pushing. Now that we operate on buffers the trylock, push and pushbuf log
item methods are merged into a single push routine, which tries to lock the
item, and if possible add the buffer that needs writeback to the buffer list.
This leads to much simpler code than the previous split but requires the
individual IOP_PUSH instances to unlock and reacquire the AIL around calls
to blocking routines.
Given that xfsailds now also handle writing out buffers, the conditions for
log forcing and the sleep times needed some small changes. The most
important one is that we consider an AIL busy as long we still have buffers
to push, and the other one is that we do increment the pushed LSN for
buffers that are under flushing at this moment, but still count them towards
the stuck items for restart purposes. Without this we could hammer on stuck
items without ever forcing the log and not make progress under heavy random
delete workloads on fast flash storage devices.
[ Dave Chinner:
- rebase on previous patches.
- improved comments for XBF_DELWRI_Q handling
- fix XBF_ASYNC handling in queue submission (test 106 failure)
- rename delwri submit function buffer list parameters for clarity
- xfs_efd_item_push() should return XFS_ITEM_PINNED ]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2012-04-23 09:58:39 +04:00
# define XFS_ITEM_SUCCESS 0
# define XFS_ITEM_PINNED 1
# define XFS_ITEM_LOCKED 2
# define XFS_ITEM_FLUSHING 3
2008-10-30 09:05:38 +03:00
/*
* This is the structure maintained for every active transaction .
*/
typedef struct xfs_trans {
unsigned int t_magic ; /* magic number */
unsigned int t_log_res ; /* amt of log space resvd */
unsigned int t_log_count ; /* count for perm log res */
unsigned int t_blk_res ; /* # of blocks resvd */
unsigned int t_blk_res_used ; /* # of resvd blocks used */
unsigned int t_rtx_res ; /* # of rt extents resvd */
unsigned int t_rtx_res_used ; /* # of resvd rt extents used */
2018-07-24 23:43:11 +03:00
unsigned int t_flags ; /* misc flags */
2023-02-10 20:11:06 +03:00
xfs_agnumber_t t_highest_agno ; /* highest AGF locked */
2010-02-16 02:34:54 +03:00
struct xlog_ticket * t_ticket ; /* log mgr ticket */
2008-10-30 09:05:38 +03:00
struct xfs_mount * t_mountp ; /* ptr to fs mount struct */
struct xfs_dquot_acct * t_dqinfo ; /* acctg info for dquots */
int64_t t_icount_delta ; /* superblock icount change */
int64_t t_ifree_delta ; /* superblock ifree change */
int64_t t_fdblocks_delta ; /* superblock fdblocks chg */
int64_t t_res_fdblocks_delta ; /* on-disk only chg */
int64_t t_frextents_delta ; /* superblock freextents chg*/
int64_t t_res_frextents_delta ; /* on-disk only chg */
int64_t t_dblocks_delta ; /* superblock dblocks change */
int64_t t_agcount_delta ; /* superblock agcount change */
int64_t t_imaxpct_delta ; /* superblock imaxpct change */
int64_t t_rextsize_delta ; /* superblock rextsize chg */
int64_t t_rbmblocks_delta ; /* superblock rbmblocks chg */
int64_t t_rblocks_delta ; /* superblock rblocks change */
int64_t t_rextents_delta ; /* superblocks rextents chg */
int64_t t_rextslog_delta ; /* superblocks rextslog chg */
2010-06-23 12:11:15 +04:00
struct list_head t_items ; /* log item descriptors */
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy
extent array so that it is not re-used before the freeing
transaction hits the disk. This array is fixed size, so when it
overflows we make further allocation transactions synchronous
because we cannot track more freed extents until those transactions
hit the disk and are completed. Under heavy mixed allocation and
freeing workloads with large log buffers, we can overflow this array
quite easily.
Further, the array is sparsely populated, which means that inserts
need to search for a free slot, and array searches often have to
search many more slots than are actually used to check all the
busy extents. Quite inefficient, really.
To enable this aspect of extent freeing to scale better, we need
a structure that can grow dynamically. While in other areas of
XFS we have used radix trees, the extents being freed are at random
locations on disk so are better suited to being indexed by an rbtree.
So, use a per-AG rbtree indexed by block number to track busy
extents. This incurs a memory allocation when marking an extent
busy, but should not occur too often in low memory situations. This
should scale to an arbitrary number of extents so should not be a
limitation for features such as in-memory aggregation of
transactions.
However, there are still situations where we can't avoid allocating
busy extents (such as allocation from the AGFL). To minimise the
overhead of such occurrences, we need to avoid doing a synchronous
log force while holding the AGF locked to ensure that the previous
transactions are safely on disk before we use the extent. We can do
this by marking the transaction doing the allocation as synchronous
rather than issuing a log force.
Because of the locking involved and the ordering of transactions,
the synchronous transaction provides the same guarantees as a
synchronous log force because it ensures that all the prior
transactions are already on disk when the synchronous transaction
hits the disk. i.e. it preserves the free->allocate order of the
extent correctly in recovery.
By doing this, we avoid holding the AGF locked while log writes are
in progress, hence reducing the length of time the lock is held and
therefore we increase the rate at which we can allocate and free
from the allocation group, thereby increasing overall throughput.
The only problem with this approach is that when a metadata buffer is
marked stale (e.g. a directory block is removed), then the buffer remains
pinned and locked until the log goes to disk. The issue here is that
if that stale buffer is reallocated in a subsequent transaction, the
attempt to lock that buffer in the transaction will hang waiting for
the log to go to disk to unlock and unpin the buffer. Hence if
someone tries to lock a pinned, stale, locked buffer we need to
push on the log to get it unlocked ASAP. Effectively we are trading
off a guaranteed log force for a much less common trigger for log
force to occur.
Ideally we should not reallocate busy extents. That is a much more
complex fix to the problem as it involves direct intervention in the
allocation btree searches in many places. This is left to a future
set of modifications.
Finally, now that we track busy extents in allocated memory, we
don't need the descriptors in the transaction structure to point to
them. We can replace the complex busy chunk infrastructure with a
simple linked list of busy extents. This allows us to remove a large
chunk of code, making the overall change a net reduction in code
size.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
2010-05-21 06:07:08 +04:00
struct list_head t_busy ; /* list of busy extents */
2018-08-01 17:20:35 +03:00
struct list_head t_dfops ; /* deferred operations */
2008-10-30 09:05:38 +03:00
unsigned long t_pflags ; /* saved process flags state */
} xfs_trans_t ;
2005-04-17 02:20:36 +04:00
/*
* XFS transaction mechanism exported interfaces that are
* actually macros .
*/
/*
 * xfs_trans_set_sync(tp): OR XFS_TRANS_SYNC into tp->t_flags. The macro is an
 * assignment expression, so it evaluates to the updated flag word; the
 * argument is parenthesized and evaluated exactly once.
 */
# define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
/*
* XFS transaction mechanism exported interfaces .
*/
2016-04-06 02:19:55 +03:00
int xfs_trans_alloc ( struct xfs_mount * mp , struct xfs_trans_res * resp ,
uint blocks , uint rtextents , uint flags ,
struct xfs_trans * * tpp ) ;
2023-12-15 21:03:39 +03:00
int xfs_trans_reserve_more ( struct xfs_trans * tp ,
unsigned int blocks , unsigned int rtextents ) ;
2017-03-29 00:56:37 +03:00
int xfs_trans_alloc_empty ( struct xfs_mount * mp ,
struct xfs_trans * * tpp ) ;
2007-02-10 10:36:10 +03:00
void xfs_trans_mod_sb ( xfs_trans_t * , uint , int64_t ) ;
2012-06-22 12:50:11 +04:00
2020-01-24 04:01:18 +03:00
int xfs_trans_get_buf_map ( struct xfs_trans * tp , struct xfs_buftarg * target ,
struct xfs_buf_map * map , int nmaps , xfs_buf_flags_t flags ,
struct xfs_buf * * bpp ) ;
2012-06-22 12:50:11 +04:00
2020-01-24 04:01:18 +03:00
/*
 * Single-range convenience wrapper around xfs_trans_get_buf_map().
 *
 * Uses DEFINE_SINGLE_BUF_MAP() to set up a local "map" from blkno/numblks
 * (presumably a one-entry buffer map -- confirm against the macro), then
 * forwards tp, target, flags and bpp unchanged together with &map and a map
 * count of 1.
 *
 * Returns whatever xfs_trans_get_buf_map() returns; the buffer pointer is
 * delivered through bpp by that callee.
 */
static inline int
2012-06-22 12:50:11 +04:00
xfs_trans_get_buf (
struct xfs_trans * tp ,
struct xfs_buftarg * target ,
xfs_daddr_t blkno ,
int numblks ,
2022-04-21 01:44:59 +03:00
xfs_buf_flags_t flags ,
2020-01-24 04:01:18 +03:00
struct xfs_buf * * bpp )
2012-06-22 12:50:11 +04:00
{
2012-11-12 15:54:01 +04:00
DEFINE_SINGLE_BUF_MAP ( map , blkno , numblks ) ;
2020-01-24 04:01:18 +03:00
return xfs_trans_get_buf_map ( tp , target , & map , 1 , flags , bpp ) ;
2012-06-22 12:50:11 +04:00
}
int xfs_trans_read_buf_map ( struct xfs_mount * mp ,
struct xfs_trans * tp ,
struct xfs_buftarg * target ,
struct xfs_buf_map * map , int nmaps ,
xfs_buf_flags_t flags ,
2012-11-12 15:54:01 +04:00
struct xfs_buf * * bpp ,
2012-11-14 10:54:40 +04:00
const struct xfs_buf_ops * ops ) ;
2012-06-22 12:50:11 +04:00
/*
 * Single-range convenience wrapper around xfs_trans_read_buf_map().
 *
 * Uses DEFINE_SINGLE_BUF_MAP() to set up a local "map" from blkno/numblks
 * (presumably a one-entry buffer map -- confirm against the macro), then
 * forwards mp, tp, target, flags, bpp and ops unchanged together with &map
 * and a map count of 1.
 *
 * Returns whatever xfs_trans_read_buf_map() returns; the buffer pointer is
 * delivered through bpp by that callee.
 */
static inline int
xfs_trans_read_buf (
struct xfs_mount * mp ,
struct xfs_trans * tp ,
struct xfs_buftarg * target ,
xfs_daddr_t blkno ,
int numblks ,
xfs_buf_flags_t flags ,
2012-11-12 15:54:01 +04:00
struct xfs_buf * * bpp ,
2012-11-14 10:54:40 +04:00
const struct xfs_buf_ops * ops )
2012-06-22 12:50:11 +04:00
{
2012-11-12 15:54:01 +04:00
DEFINE_SINGLE_BUF_MAP ( map , blkno , numblks ) ;
return xfs_trans_read_buf_map ( mp , tp , target , & map , 1 ,
2012-11-14 10:54:40 +04:00
flags , bpp , ops ) ;
2012-06-22 12:50:11 +04:00
}
2020-09-01 20:55:47 +03:00
struct xfs_buf * xfs_trans_getsb ( struct xfs_trans * ) ;
2005-04-17 02:20:36 +04:00
void xfs_trans_brelse ( xfs_trans_t * , struct xfs_buf * ) ;
void xfs_trans_bjoin ( xfs_trans_t * , struct xfs_buf * ) ;
xfs: launder in-memory btree buffers before transaction commit
As we've noted in various places, all current users of in-memory btrees
are online fsck. Online fsck only stages a btree long enough to rebuild
an ondisk data structure, which means that the in-memory btree is
ephemeral. Furthermore, if we encounter /any/ errors while updating an
in-memory btree, all we do is tear down all the staged data and return
an errno to userspace. In-memory btrees need not be transactional, so
their buffers should not be committed to the ondisk log, nor should they
be checkpointed by the AIL. That's just as well since the ephemeral
nature of the btree means that the buftarg and the buffers may disappear
quickly anyway.
Therefore, we need a way to launder the btree buffers that get attached
to the transaction by the generic btree code. Because the buffers are
directly mapped to backing file pages, there's no need to bwrite them
back to the tmpfs file. All we need to do is clean enough of the buffer
log item state so that the bli can be detached from the buffer, remove
the bli from the transaction's log item list, and reset the transaction
dirty state as if the laundered items had never been there.
For simplicity, create xfbtree transaction commit and cancel helpers
that launder the in-memory btree buffers for callers. Once laundered,
call the write verifier on non-stale buffers to avoid integrity issues,
or punch a hole in the backing file for stale buffers.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2024-02-22 23:43:36 +03:00
void xfs_trans_bdetach ( struct xfs_trans * tp , struct xfs_buf * bp ) ;
2005-04-17 02:20:36 +04:00
void xfs_trans_bhold ( xfs_trans_t * , struct xfs_buf * ) ;
2005-09-05 02:29:01 +04:00
void xfs_trans_bhold_release ( xfs_trans_t * , struct xfs_buf * ) ;
2005-04-17 02:20:36 +04:00
void xfs_trans_binval ( xfs_trans_t * , struct xfs_buf * ) ;
void xfs_trans_inode_buf ( xfs_trans_t * , struct xfs_buf * ) ;
void xfs_trans_stale_inode_buf ( xfs_trans_t * , struct xfs_buf * ) ;
2017-08-29 20:08:40 +03:00
bool xfs_trans_ordered_buf ( xfs_trans_t * , struct xfs_buf * ) ;
2005-04-17 02:20:36 +04:00
void xfs_trans_dquot_buf ( xfs_trans_t * , struct xfs_buf * , uint ) ;
void xfs_trans_inode_alloc_buf ( xfs_trans_t * , struct xfs_buf * ) ;
2010-09-28 06:27:25 +04:00
void xfs_trans_ichgtime ( struct xfs_trans * , struct xfs_inode * , int ) ;
2011-09-19 19:00:54 +04:00
void xfs_trans_ijoin ( struct xfs_trans * , struct xfs_inode * , uint ) ;
2017-08-29 20:08:38 +03:00
void xfs_trans_log_buf ( struct xfs_trans * , struct xfs_buf * , uint ,
uint ) ;
void xfs_trans_dirty_buf ( struct xfs_trans * , struct xfs_buf * ) ;
2018-10-18 09:20:35 +03:00
bool xfs_trans_buf_is_dirty ( struct xfs_buf * bp ) ;
2005-04-17 02:20:36 +04:00
void xfs_trans_log_inode ( xfs_trans_t * , struct xfs_inode * , uint ) ;
2016-08-03 04:14:35 +03:00
2015-06-04 06:48:08 +03:00
int xfs_trans_commit ( struct xfs_trans * ) ;
2017-08-28 20:21:03 +03:00
int xfs_trans_roll ( struct xfs_trans * * ) ;
int xfs_trans_roll_inode ( struct xfs_trans * * , struct xfs_inode * ) ;
2015-06-04 06:47:56 +03:00
void xfs_trans_cancel ( xfs_trans_t * ) ;
[XFS] Move AIL pushing into its own thread
When many hundreds to thousands of threads all try to do simultaneous
transactions and the log is in a tail-pushing situation (i.e. full), we
can get multiple threads walking the AIL list and contending on the AIL
lock.
The AIL push is, in effect, a simple I/O dispatch algorithm complicated by
the ordering constraints placed on it by the transaction subsystem. It
really does not need multiple threads to push on it - even when only a
single CPU is pushing the AIL, it can push the I/O out far faster than
pretty much any disk subsystem can handle.
So, to avoid contention problems stemming from multiple list walkers, move
the list walk off into another thread and simply provide a "target" to
push to. When a thread requires a push, it sets the target and wakes the
push thread, then goes to sleep waiting for the required amount of space
to become available in the log.
This mechanism should also be a lot fairer under heavy load as the waiters
will queue in arrival order, rather than queuing in "who completed a push
first" order.
Also, by moving the pushing to a separate thread we can do more
effective overload detection and prevention as we can keep context from
loop iteration to loop iteration. That is, we can push only part of the
list each loop and not have to loop back to the start of the list every
time we run. This should also help by reducing the number of items we try
to lock and/or push items that we cannot move.
Note that this patch is not intended to solve the inefficiencies in the
AIL structure and the associated issues with extremely large list
contents. That needs to be addressed separately; parallel access would
cause problems to any new structure as well, so I'm only aiming to isolate
the structure from unbounded parallelism here.
SGI-PV: 972759
SGI-Modid: xfs-linux-melb:xfs-kern:30371a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
2008-02-05 04:13:32 +03:00
int xfs_trans_ail_init ( struct xfs_mount * ) ;
void xfs_trans_ail_destroy ( struct xfs_mount * ) ;
2005-04-17 02:20:36 +04:00
2013-10-23 03:51:50 +04:00
void xfs_trans_buf_set_type ( struct xfs_trans * , struct xfs_buf * ,
enum xfs_blft ) ;
void xfs_trans_buf_copy_type ( struct xfs_buf * dst_bp ,
struct xfs_buf * src_bp ) ;
2021-10-12 21:09:23 +03:00
extern struct kmem_cache * xfs_trans_cache ;
2007-11-23 08:28:09 +03:00
2021-01-27 23:07:57 +03:00
struct xfs_dquot ;
2021-01-27 03:33:29 +03:00
int xfs_trans_alloc_inode ( struct xfs_inode * ip , struct xfs_trans_res * resv ,
2021-01-27 03:44:07 +03:00
unsigned int dblocks , unsigned int rblocks , bool force ,
struct xfs_trans * * tpp ) ;
2023-12-15 21:03:39 +03:00
int xfs_trans_reserve_more_inode ( struct xfs_trans * tp , struct xfs_inode * ip ,
unsigned int dblocks , unsigned int rblocks , bool force_quota ) ;
2021-01-27 23:07:57 +03:00
int xfs_trans_alloc_icreate ( struct xfs_mount * mp , struct xfs_trans_res * resv ,
struct xfs_dquot * udqp , struct xfs_dquot * gdqp ,
struct xfs_dquot * pdqp , unsigned int dblocks ,
struct xfs_trans * * tpp ) ;
2021-01-29 22:32:09 +03:00
int xfs_trans_alloc_ichange ( struct xfs_inode * ip , struct xfs_dquot * udqp ,
struct xfs_dquot * gdqp , struct xfs_dquot * pdqp , bool force ,
struct xfs_trans * * tpp ) ;
2022-02-26 03:18:41 +03:00
int xfs_trans_alloc_dir ( struct xfs_inode * dp , struct xfs_trans_res * resv ,
struct xfs_inode * ip , unsigned int * dblocks ,
struct xfs_trans * * tpp , int * nospace_error ) ;
2021-01-27 03:33:29 +03:00
2021-02-23 21:26:06 +03:00
static inline void
xfs_trans_set_context (
struct xfs_trans * tp )
{
ASSERT ( current - > journal_info = = NULL ) ;
tp - > t_pflags = memalloc_nofs_save ( ) ;
current - > journal_info = tp ;
}
static inline void
xfs_trans_clear_context (
struct xfs_trans * tp )
{
if ( current - > journal_info = = tp ) {
memalloc_nofs_restore ( tp - > t_pflags ) ;
current - > journal_info = NULL ;
}
}
static inline void
xfs_trans_switch_context (
struct xfs_trans * old_tp ,
struct xfs_trans * new_tp )
{
ASSERT ( current - > journal_info = = old_tp ) ;
new_tp - > t_pflags = old_tp - > t_pflags ;
old_tp - > t_pflags = 0 ;
current - > journal_info = new_tp ;
}
2005-04-17 02:20:36 +04:00
# endif /* __XFS_TRANS_H__ */