2018-06-05 19:42:14 -07:00
// SPDX-License-Identifier: GPL-2.0
2005-04-16 15:20:36 -07:00
/*
2005-11-02 14:58:39 +11:00
* Copyright ( c ) 2000 , 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-16 15:20:36 -07:00
*/
# include "xfs.h"
2005-11-02 14:38:42 +11:00
# include "xfs_fs.h"
2013-10-23 10:36:05 +11:00
# include "xfs_shared.h"
2013-10-23 10:51:50 +11:00
# include "xfs_format.h"
2013-10-23 10:50:10 +11:00
# include "xfs_log_format.h"
# include "xfs_trans_resv.h"
2005-04-16 15:20:36 -07:00
# include "xfs_mount.h"
# include "xfs_inode.h"
2013-10-23 10:50:10 +11:00
# include "xfs_trans.h"
2005-11-02 14:38:42 +11:00
# include "xfs_trans_priv.h"
# include "xfs_inode_item.h"
2010-06-24 11:36:58 +10:00
# include "xfs_trace.h"
2005-04-16 15:20:36 -07:00
2017-12-11 06:35:19 -05:00
# include <linux/iversion.h>
2005-04-16 15:20:36 -07:00
/*
 * Join an already-locked inode to a transaction.
 *
 * The caller must hold the inode's ILOCK exclusively, and the inode must not
 * currently be associated with any transaction.  @lock_flags is recorded in
 * the inode log item; if it is non-zero the inode will be unlocked on
 * transaction commit.
 */
void
xfs_trans_ijoin(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	xfs_inode_log_item_t	*iip;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	/* Set up the inode log item the first time this inode is joined. */
	if (!ip->i_itemp)
		xfs_inode_item_init(ip, ip->i_mount);
	iip = ip->i_itemp;

	/* Lock flags left over from an earlier join would be a caller bug. */
	ASSERT(iip->ili_lock_flags == 0);
	iip->ili_lock_flags = lock_flags;

	/* Attach the inode log item to the transaction. */
	xfs_trans_add_item(tp, &iip->ili_item);
}
2010-09-28 12:27:25 +10:00
/*
* Transactional inode timestamp update . Requires the inode to be locked and
* joined to the transaction supplied . Relies on the transaction subsystem to
* track dirty state and update / writeback the inode accordingly .
*/
void
xfs_trans_ichgtime (
struct xfs_trans * tp ,
struct xfs_inode * ip ,
int flags )
{
struct inode * inode = VFS_I ( ip ) ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following cocinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
struct timespec64 tv ;
2010-09-28 12:27:25 +10:00
ASSERT ( tp ) ;
ASSERT ( xfs_isilocked ( ip , XFS_ILOCK_EXCL ) ) ;
2016-09-14 07:48:06 -07:00
tv = current_time ( inode ) ;
2010-09-28 12:27:25 +10:00
2016-02-09 16:54:58 +11:00
if ( flags & XFS_ICHGTIME_MOD )
2010-09-28 12:27:25 +10:00
inode - > i_mtime = tv ;
2016-02-09 16:54:58 +11:00
if ( flags & XFS_ICHGTIME_CHG )
2010-09-28 12:27:25 +10:00
inode - > i_ctime = tv ;
}
2005-04-16 15:20:36 -07:00
/*
 * Mark the inode fields selected by @flags as needing to be logged when the
 * transaction is committed.  The inode must already be associated with the
 * given transaction and be held with XFS_ILOCK_EXCL.
 *
 * The values for @flags are defined in xfs_inode_item.h.  We always log all
 * of the core inode if any of it has changed, and we always log all of the
 * inline data/extents/b-tree root if any of them has changed.
 */
void
xfs_trans_log_inode(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	uint		flags)
{
	struct inode	*inode = VFS_I(ip);

	ASSERT(ip->i_itemp != NULL);
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	/*
	 * The I_DIRTY_TIME test is deliberately done without i_lock: races
	 * don't matter here.  At worst we either need an extra transaction
	 * in 24 hours to log the timestamps, or we clear fields that were
	 * already cleared.
	 */
	if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
		spin_unlock(&inode->i_lock);
	}

	/*
	 * Record the caller's change for the fdatasync optimisation, which
	 * lets fdatasync skip log forces for inodes that are only timestamp
	 * dirty.  This must happen before the change counter is bumped below
	 * so that an XFS_ILOG_CORE added by the bump does not affect
	 * fdatasync behaviour.
	 */
	ip->i_itemp->ili_fsync_fields |= flags;

	/*
	 * The first time the inode is logged in a transaction, bump the inode
	 * change counter if the inode is configured for it.  Because we hold
	 * the inode exclusively for metadata modification, we can usually
	 * avoid setting XFS_ILOG_CORE when no one has queried the counter
	 * since it was last incremented.  If the caller already asked for
	 * XFS_ILOG_CORE, the counter is bumped unconditionally.
	 *
	 * The tests short-circuit left to right, so XFS_LI_DIRTY is always
	 * set and the counter is only touched on the first dirtying.
	 */
	if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
	    IS_I_VERSION(inode) &&
	    inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
		flags |= XFS_ILOG_CORE;

	tp->t_flags |= XFS_TRANS_DIRTY;

	/*
	 * Always fold in the bits from ili_last_fields as well.  This
	 * coordinates with xfs_iflush() and xfs_iflush_done() in the eventual
	 * clearing of the ili_fields bits; see the big comment in
	 * xfs_iflush() for an explanation of this mechanism.
	 */
	ip->i_itemp->ili_fields |= flags | ip->i_itemp->ili_last_fields;
}
2017-08-28 10:21:03 -07:00
int
xfs_trans_roll_inode (
struct xfs_trans * * tpp ,
struct xfs_inode * ip )
{
int error ;
xfs_trans_log_inode ( * tpp , ip , XFS_ILOG_CORE ) ;
error = xfs_trans_roll ( tpp ) ;
if ( ! error )
xfs_trans_ijoin ( * tpp , ip , 0 ) ;
return error ;
}