fc0561cefc
xfs: timestamp updates cause excessive fdatasync log traffic
Sage Weil reported that a ceph test workload was writing to the log on every fdatasync during an overwrite workload. Event tracing showed that the only metadata modification being made was the timestamp updates during the write(2) syscall, but fdatasync(2) is supposed to ignore them. The key observation was that the transactions in the log all looked like this: INODE: #regs: 4 ino: 0x8b flags: 0x45 dsize: 32 They contained a flags field of 0x45 or 0x85, and had data and attribute forks following the inode core. This means that the timestamp updates were triggering dirty relogging of previously logged parts of the inode that hadn't yet been flushed back to disk. There are two parts to this problem. The first is that XFS relogs dirty regions in subsequent transactions, so it carries around the fields that have been dirtied since the last time the inode was written back to disk, not since the last time the inode was forced into the log. The second part is that on v5 filesystems, the inode change count update during inode dirtying also sets the XFS_ILOG_CORE flag, so on v5 filesystems this makes a timestamp update dirty the entire inode. As a result when fdatasync is run, it looks at the dirty fields in the inode, and sees more than just the timestamp flag, even though the only metadata change since the last fdatasync was just the timestamps. Hence we force the log on every subsequent fdatasync even though it is not needed. To fix this, add a new field to the inode log item that tracks changes since the last time fsync/fdatasync forced the log to flush the changes to the journal. This field is updated when we dirty the inode, but we do it before updating the change count so it does not carry the "core dirty" flag from timestamp updates. The field is zeroed when the inode is marked clean (due to writeback/freeing) or when an fsync/fdatasync forces the log.
Hence if we only dirty the timestamps on the inode between fsync/fdatasync calls, the fdatasync will not trigger another log force. Over 100 runs of the test program: Ext4 baseline: runtime: 1.63s +/- 0.24s avg lat: 1.59ms +/- 0.24ms iops: ~2000 XFS, vanilla kernel: runtime: 2.45s +/- 0.18s avg lat: 2.39ms +/- 0.18ms log forces: ~400/s iops: ~1000 XFS, patched kernel: runtime: 1.49s +/- 0.26s avg lat: 1.46ms +/- 0.25ms log forces: ~30/s iops: ~1500 Reported-by: Sage Weil <sage@redhat.com> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
145 lines
4.2 KiB
C
145 lines
4.2 KiB
C
/*
|
|
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
|
|
* All Rights Reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it would be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
#include "xfs.h"
|
|
#include "xfs_fs.h"
|
|
#include "xfs_shared.h"
|
|
#include "xfs_format.h"
|
|
#include "xfs_log_format.h"
|
|
#include "xfs_trans_resv.h"
|
|
#include "xfs_mount.h"
|
|
#include "xfs_inode.h"
|
|
#include "xfs_trans.h"
|
|
#include "xfs_trans_priv.h"
|
|
#include "xfs_inode_item.h"
|
|
#include "xfs_trace.h"
|
|
|
|
/*
|
|
* Add a locked inode to the transaction.
|
|
*
|
|
* The inode must be locked, and it cannot be associated with any transaction.
|
|
* If lock_flags is non-zero the inode will be unlocked on transaction commit.
|
|
*/
|
|
void
|
|
xfs_trans_ijoin(
|
|
struct xfs_trans *tp,
|
|
struct xfs_inode *ip,
|
|
uint lock_flags)
|
|
{
|
|
xfs_inode_log_item_t *iip;
|
|
|
|
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
|
|
if (ip->i_itemp == NULL)
|
|
xfs_inode_item_init(ip, ip->i_mount);
|
|
iip = ip->i_itemp;
|
|
|
|
ASSERT(iip->ili_lock_flags == 0);
|
|
iip->ili_lock_flags = lock_flags;
|
|
|
|
/*
|
|
* Get a log_item_desc to point at the new item.
|
|
*/
|
|
xfs_trans_add_item(tp, &iip->ili_item);
|
|
}
|
|
|
|
/*
|
|
* Transactional inode timestamp update. Requires the inode to be locked and
|
|
* joined to the transaction supplied. Relies on the transaction subsystem to
|
|
* track dirty state and update/writeback the inode accordingly.
|
|
*/
|
|
void
|
|
xfs_trans_ichgtime(
|
|
struct xfs_trans *tp,
|
|
struct xfs_inode *ip,
|
|
int flags)
|
|
{
|
|
struct inode *inode = VFS_I(ip);
|
|
struct timespec tv;
|
|
|
|
ASSERT(tp);
|
|
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
|
|
|
|
tv = current_fs_time(inode->i_sb);
|
|
|
|
if ((flags & XFS_ICHGTIME_MOD) &&
|
|
!timespec_equal(&inode->i_mtime, &tv)) {
|
|
inode->i_mtime = tv;
|
|
ip->i_d.di_mtime.t_sec = tv.tv_sec;
|
|
ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
|
|
}
|
|
if ((flags & XFS_ICHGTIME_CHG) &&
|
|
!timespec_equal(&inode->i_ctime, &tv)) {
|
|
inode->i_ctime = tv;
|
|
ip->i_d.di_ctime.t_sec = tv.tv_sec;
|
|
ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This is called to mark the fields indicated in fieldmask as needing
|
|
* to be logged when the transaction is committed. The inode must
|
|
* already be associated with the given transaction.
|
|
*
|
|
* The values for fieldmask are defined in xfs_inode_item.h. We always
|
|
* log all of the core inode if any of it has changed, and we always log
|
|
* all of the inline data/extents/b-tree root if any of them has changed.
|
|
*/
|
|
void
|
|
xfs_trans_log_inode(
|
|
xfs_trans_t *tp,
|
|
xfs_inode_t *ip,
|
|
uint flags)
|
|
{
|
|
ASSERT(ip->i_itemp != NULL);
|
|
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
|
|
|
|
/*
|
|
* Record the specific change for fdatasync optimisation. This
|
|
* allows fdatasync to skip log forces for inodes that are only
|
|
* timestamp dirty. We do this before the change count so that
|
|
* the core being logged in this case does not impact on fdatasync
|
|
* behaviour.
|
|
*/
|
|
ip->i_itemp->ili_fsync_fields |= flags;
|
|
|
|
/*
|
|
* First time we log the inode in a transaction, bump the inode change
|
|
* counter if it is configured for this to occur. We don't use
|
|
* inode_inc_version() because there is no need for extra locking around
|
|
* i_version as we already hold the inode locked exclusively for
|
|
* metadata modification.
|
|
*/
|
|
if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
|
|
IS_I_VERSION(VFS_I(ip))) {
|
|
ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
|
|
flags |= XFS_ILOG_CORE;
|
|
}
|
|
|
|
tp->t_flags |= XFS_TRANS_DIRTY;
|
|
ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
|
|
|
|
/*
|
|
* Always OR in the bits from the ili_last_fields field.
|
|
* This is to coordinate with the xfs_iflush() and xfs_iflush_done()
|
|
* routines in the eventual clearing of the ili_fields bits.
|
|
* See the big comment in xfs_iflush() for an explanation of
|
|
* this coordination mechanism.
|
|
*/
|
|
flags |= ip->i_itemp->ili_last_fields;
|
|
ip->i_itemp->ili_fields |= flags;
|
|
}
|