2019-05-24 13:04:05 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2021-05-07 04:06:44 +03:00
/*
2005-12-16 01:31:24 +03:00
* file . c
*
* File open , close , extend , truncate
*
* Copyright ( C ) 2002 , 2004 Oracle . All rights reserved .
*/
2006-01-11 23:17:46 +03:00
# include <linux/capability.h>
2005-12-16 01:31:24 +03:00
# include <linux/fs.h>
# include <linux/types.h>
# include <linux/slab.h>
# include <linux/highmem.h>
# include <linux/pagemap.h>
# include <linux/uio.h>
2006-10-04 04:53:05 +04:00
# include <linux/sched.h>
2007-06-04 11:59:47 +04:00
# include <linux/splice.h>
2006-11-15 10:48:42 +03:00
# include <linux/mount.h>
2007-02-10 07:24:12 +03:00
# include <linux/writeback.h>
2007-07-19 11:14:38 +04:00
# include <linux/falloc.h>
2008-10-09 21:38:40 +04:00
# include <linux/quotaops.h>
2010-08-05 22:32:45 +04:00
# include <linux/blkdev.h>
2015-05-23 00:13:32 +03:00
# include <linux/backing-dev.h>
2005-12-16 01:31:24 +03:00
# include <cluster/masklog.h>
# include "ocfs2.h"
# include "alloc.h"
# include "aops.h"
# include "dir.h"
# include "dlmglue.h"
# include "extent_map.h"
# include "file.h"
# include "sysfile.h"
# include "inode.h"
2006-07-04 04:27:12 +04:00
# include "ioctl.h"
2005-12-16 01:31:24 +03:00
# include "journal.h"
2007-12-21 03:49:04 +03:00
# include "locks.h"
2005-12-16 01:31:24 +03:00
# include "mmap.h"
# include "suballoc.h"
# include "super.h"
2008-08-18 13:11:00 +04:00
# include "xattr.h"
2008-11-14 06:17:18 +03:00
# include "acl.h"
2008-10-09 21:38:40 +04:00
# include "quota.h"
2009-08-25 04:02:48 +04:00
# include "refcounttree.h"
2011-02-22 17:14:41 +03:00
# include "ocfs2_trace.h"
2005-12-16 01:31:24 +03:00
# include "buffer_head_io.h"
2007-12-21 03:49:04 +03:00
static int ocfs2_init_file_private ( struct inode * inode , struct file * file )
{
struct ocfs2_file_private * fp ;
fp = kzalloc ( sizeof ( struct ocfs2_file_private ) , GFP_KERNEL ) ;
if ( ! fp )
return - ENOMEM ;
fp - > fp_file = file ;
mutex_init ( & fp - > fp_mutex ) ;
ocfs2_file_lock_res_init ( & fp - > fp_flock , fp ) ;
file - > private_data = fp ;
return 0 ;
}
static void ocfs2_free_file_private ( struct inode * inode , struct file * file )
{
struct ocfs2_file_private * fp = file - > private_data ;
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
if ( fp ) {
ocfs2_simple_drop_lockres ( osb , & fp - > fp_flock ) ;
ocfs2_lock_res_free ( & fp - > fp_flock ) ;
kfree ( fp ) ;
file - > private_data = NULL ;
}
}
2005-12-16 01:31:24 +03:00
static int ocfs2_file_open ( struct inode * inode , struct file * file )
{
int status ;
int mode = file - > f_flags ;
struct ocfs2_inode_info * oi = OCFS2_I ( inode ) ;
2011-02-22 17:14:41 +03:00
trace_ocfs2_file_open ( inode , file , file - > f_path . dentry ,
2018-04-06 02:18:37 +03:00
( unsigned long long ) oi - > ip_blkno ,
2011-02-22 17:14:41 +03:00
file - > f_path . dentry - > d_name . len ,
file - > f_path . dentry - > d_name . name , mode ) ;
2005-12-16 01:31:24 +03:00
2015-07-14 14:36:02 +03:00
if ( file - > f_mode & FMODE_WRITE ) {
status = dquot_initialize ( inode ) ;
if ( status )
goto leave ;
}
2010-03-03 17:05:06 +03:00
2005-12-16 01:31:24 +03:00
spin_lock ( & oi - > ip_lock ) ;
/* Check that the inode hasn't been wiped from disk by another
* node . If it hasn ' t then we ' re safe as long as we hold the
* spin lock until our increment of open count . */
2018-04-06 02:18:37 +03:00
if ( oi - > ip_flags & OCFS2_INODE_DELETED ) {
2005-12-16 01:31:24 +03:00
spin_unlock ( & oi - > ip_lock ) ;
status = - ENOENT ;
goto leave ;
}
if ( mode & O_DIRECT )
oi - > ip_flags | = OCFS2_INODE_OPEN_DIRECT ;
oi - > ip_open_count + + ;
spin_unlock ( & oi - > ip_lock ) ;
2007-12-21 03:49:04 +03:00
status = ocfs2_init_file_private ( inode , file ) ;
if ( status ) {
/*
* We want to set open count back if we ' re failing the
* open .
*/
spin_lock ( & oi - > ip_lock ) ;
oi - > ip_open_count - - ;
spin_unlock ( & oi - > ip_lock ) ;
}
2018-02-01 03:15:25 +03:00
file - > f_mode | = FMODE_NOWAIT ;
2005-12-16 01:31:24 +03:00
leave :
return status ;
}
static int ocfs2_file_release ( struct inode * inode , struct file * file )
{
struct ocfs2_inode_info * oi = OCFS2_I ( inode ) ;
spin_lock ( & oi - > ip_lock ) ;
if ( ! - - oi - > ip_open_count )
oi - > ip_flags & = ~ OCFS2_INODE_OPEN_DIRECT ;
2011-02-22 17:14:41 +03:00
trace_ocfs2_file_release ( inode , file , file - > f_path . dentry ,
oi - > ip_blkno ,
file - > f_path . dentry - > d_name . len ,
file - > f_path . dentry - > d_name . name ,
oi - > ip_open_count ) ;
2005-12-16 01:31:24 +03:00
spin_unlock ( & oi - > ip_lock ) ;
2007-12-21 03:49:04 +03:00
ocfs2_free_file_private ( inode , file ) ;
2005-12-16 01:31:24 +03:00
return 0 ;
}
2007-12-21 03:49:04 +03:00
/*
 * Open a directory: only the per-open private data needs to be set up.
 *
 * Returns whatever ocfs2_init_file_private() returns.
 */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	int status = ocfs2_init_file_private(inode, file);

	return status;
}
/*
 * Release a directory: free the per-open private data set up by
 * ocfs2_dir_open().
 *
 * Always returns 0.
 */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);

	return 0;
}
2011-07-17 04:44:56 +04:00
/*
 * fsync/fdatasync implementation.
 *
 * Writes back and waits on the byte range [@start, @end] of @file, then
 * completes the journal transaction recorded for the inode
 * (i_datasync_tid when @datasync is non-zero, i_sync_tid otherwise).
 * When the journal has JBD2_BARRIER set and
 * jbd2_trans_will_send_data_barrier() reports that the transaction will
 * not send a data barrier itself, the block device is flushed explicitly
 * afterwards.
 *
 * Returns -EROFS on a (hard or soft) read-only filesystem, the error
 * from file_write_and_wait_range() if the data writeback fails,
 * otherwise -EIO if the journal commit or flush produced a negative
 * error, else 0.
 */
static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
			   int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	journal_t *journal = osb->journal->j_journal;
	bool needs_barrier;
	tid_t commit_tid;
	int err;

	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
			      oi->ip_blkno,
			      file->f_path.dentry->d_name.len,
			      file->f_path.dentry->d_name.name,
			      (unsigned long long)datasync);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	if (datasync)
		commit_tid = oi->i_datasync_tid;
	else
		commit_tid = oi->i_sync_tid;

	/* Decide about the explicit flush before the commit is forced. */
	needs_barrier = (journal->j_flags & JBD2_BARRIER) &&
			!jbd2_trans_will_send_data_barrier(journal, commit_tid);

	err = jbd2_complete_transaction(journal, commit_tid);
	if (needs_barrier) {
		int flush_err = blkdev_issue_flush(inode->i_sb->s_bdev);

		/* A commit error takes precedence over a flush error. */
		if (!err)
			err = flush_err;
	}

	if (err)
		mlog_errno(err);

	return (err < 0) ? -EIO : 0;
}
2006-11-15 10:48:42 +03:00
int ocfs2_should_update_atime ( struct inode * inode ,
struct vfsmount * vfsmnt )
{
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following cocinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 05:36:02 +03:00
struct timespec64 now ;
2006-11-15 10:48:42 +03:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
if ( ocfs2_is_hard_readonly ( osb ) | | ocfs2_is_soft_readonly ( osb ) )
return 0 ;
if ( ( inode - > i_flags & S_NOATIME ) | |
2017-11-28 00:05:09 +03:00
( ( inode - > i_sb - > s_flags & SB_NODIRATIME ) & & S_ISDIR ( inode - > i_mode ) ) )
2006-11-15 10:48:42 +03:00
return 0 ;
2006-12-20 02:25:52 +03:00
/*
* We can be called with no vfsmnt structure - NFSD will
* sometimes do this .
*
* Note that our action here is different than touch_atime ( ) -
* if we can ' t tell whether this is a noatime mount , then we
* don ' t know whether to trust the value of s_atime_quantum .
*/
if ( vfsmnt = = NULL )
return 0 ;
2006-11-15 10:48:42 +03:00
if ( ( vfsmnt - > mnt_flags & MNT_NOATIME ) | |
( ( vfsmnt - > mnt_flags & MNT_NODIRATIME ) & & S_ISDIR ( inode - > i_mode ) ) )
return 0 ;
2006-12-13 11:34:35 +03:00
if ( vfsmnt - > mnt_flags & MNT_RELATIME ) {
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following cocinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 05:36:02 +03:00
if ( ( timespec64_compare ( & inode - > i_atime , & inode - > i_mtime ) < = 0 ) | |
( timespec64_compare ( & inode - > i_atime , & inode - > i_ctime ) < = 0 ) )
2006-12-13 11:34:35 +03:00
return 1 ;
return 0 ;
}
2016-09-14 17:48:04 +03:00
now = current_time ( inode ) ;
2006-11-15 10:48:42 +03:00
if ( ( now . tv_sec - inode - > i_atime . tv_sec < = osb - > s_atime_quantum ) )
return 0 ;
else
return 1 ;
}
int ocfs2_update_inode_atime ( struct inode * inode ,
struct buffer_head * bh )
{
int ret ;
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
handle_t * handle ;
2007-07-20 22:24:53 +04:00
struct ocfs2_dinode * di = ( struct ocfs2_dinode * ) bh - > b_data ;
2006-11-15 10:48:42 +03:00
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
2008-10-20 21:23:51 +04:00
if ( IS_ERR ( handle ) ) {
ret = PTR_ERR ( handle ) ;
2006-11-15 10:48:42 +03:00
mlog_errno ( ret ) ;
goto out ;
}
2009-02-13 03:41:25 +03:00
ret = ocfs2_journal_access_di ( handle , INODE_CACHE ( inode ) , bh ,
2008-10-18 06:25:01 +04:00
OCFS2_JOURNAL_ACCESS_WRITE ) ;
2007-07-20 22:24:53 +04:00
if ( ret ) {
mlog_errno ( ret ) ;
goto out_commit ;
}
/*
* Don ' t use ocfs2_mark_inode_dirty ( ) here as we don ' t always
2022-03-23 00:38:45 +03:00
* have i_rwsem to guard against concurrent changes to other
2007-07-20 22:24:53 +04:00
* inode fields .
*/
2016-09-14 17:48:04 +03:00
inode - > i_atime = current_time ( inode ) ;
2007-07-20 22:24:53 +04:00
di - > i_atime = cpu_to_le64 ( inode - > i_atime . tv_sec ) ;
di - > i_atime_nsec = cpu_to_le32 ( inode - > i_atime . tv_nsec ) ;
2014-04-04 01:47:08 +04:00
ocfs2_update_inode_fsync_trans ( handle , inode , 0 ) ;
2010-03-20 00:13:52 +03:00
ocfs2_journal_dirty ( handle , bh ) ;
2006-11-15 10:48:42 +03:00
2007-07-20 22:24:53 +04:00
out_commit :
2018-04-06 02:18:33 +03:00
ocfs2_commit_trans ( osb , handle ) ;
2006-11-15 10:48:42 +03:00
out :
return ret ;
}
2015-02-17 02:59:50 +03:00
/*
 * Record a new size for @inode inside the caller's transaction.
 *
 * Updates i_size, recomputes i_blocks from
 * ocfs2_inode_sector_count(), stamps ctime and mtime with the current
 * time and then marks the inode dirty against @fe_bh using @handle.
 *
 * Returns the result of ocfs2_mark_inode_dirty(); a negative value is
 * logged before being returned.
 */
int ocfs2_set_inode_size(handle_t *handle,
			 struct inode *inode,
			 struct buffer_head *fe_bh,
			 u64 new_i_size)
{
	int ret;

	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_mtime = current_time(inode);
	inode->i_ctime = inode->i_mtime;

	ret = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (ret < 0)
		mlog_errno(ret);

	return ret;
}
2008-08-25 21:56:50 +04:00
/*
 * Change the size of @inode to @new_i_size in a transaction of its own.
 *
 * Starts a transaction with OCFS2_INODE_UPDATE_CREDITS, applies the new
 * size through ocfs2_set_inode_size(), updates the inode's fsync
 * transaction bookkeeping and commits.  The bookkeeping update and the
 * commit happen even if ocfs2_set_inode_size() reported an error.
 *
 * Returns 0 on success or a negative errno from starting the
 * transaction or from ocfs2_set_inode_size().
 */
int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	int ret;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
	ocfs2_commit_trans(osb, handle);

	return ret;
}
2009-08-26 05:47:28 +04:00
/*
 * Copy-on-write the single cluster containing byte @offset, if that
 * cluster is refcounted.
 *
 * Returns 0 without doing anything when @offset is cluster aligned or
 * when the extent covering it is not flagged OCFS2_EXT_REFCOUNTED.
 * Otherwise returns the result of ocfs2_refcount_cow() for that one
 * cluster, or the (logged) error from ocfs2_get_clusters().
 */
static int ocfs2_cow_file_pos(struct inode *inode,
			      struct buffer_head *fe_bh,
			      u64 offset)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 cpos = offset >> osb->s_clustersize_bits;
	u32 phys;
	unsigned int num_clusters = 0;
	unsigned int ext_flags = 0;
	int status;

	/*
	 * If the new offset is aligned to the range of the cluster, there
	 * is no space for ocfs2_zero_range_for_truncate to fill, so no
	 * need to CoW either.
	 */
	if (!(offset & (osb->s_clustersize - 1)))
		return 0;

	status = ocfs2_get_clusters(inode, cpos, &phys,
				    &num_clusters, &ext_flags);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* Nothing is shared, so there is nothing to copy. */
	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
		return 0;

	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos + 1);
}
2005-12-16 01:31:24 +03:00
/*
 * First stage of a shrinking truncate: record the new size on disk.
 *
 * The cluster containing @new_i_size is CoWed if it is reflinked, the
 * range from @new_i_size to the end of that cluster is zeroed, and then
 * i_size, ctime and mtime are updated both in memory and in the on-disk
 * inode in @fe_bh, all within one transaction.
 *
 * Returns 0 on success or a negative errno; every error is logged.  The
 * transaction, once started, is always committed.
 */
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	struct ocfs2_dinode *di;
	handle_t *handle;
	u64 cluster_bytes;
	int status;

	/*
	 * We need to CoW the cluster contains the offset if it is reflinked
	 * since we will call ocfs2_zero_range_for_truncate later which will
	 * write "0" from offset to the end of the cluster.
	 */
	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/*
	 * TODO: This needs to actually orphan the inode in this
	 * transaction.
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		return status;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto commit;
	}

	i_size_write(inode, new_i_size);
	inode->i_ctime = inode->i_mtime = current_time(inode);

	/* Mirror the in-memory size and timestamps into the disk inode. */
	di = (struct ocfs2_dinode *)fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime = di->i_mtime;
	di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	di->i_ctime_nsec = di->i_mtime_nsec;
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, fe_bh);

commit:
	ocfs2_commit_trans(osb, handle);

	return status;
}
2015-02-17 02:59:50 +03:00
/*
 * Shrink @inode to @new_i_size.
 *
 * @di_bh holds the on-disk inode.  Growing is rejected with -EINVAL.
 * With ip_alloc_sem held for writing, the local-alloc reservation is
 * discarded and then either the inline data is truncated, or the new
 * size is recorded via ocfs2_orphan_for_truncate() and the allocation
 * is released via ocfs2_commit_truncate().  In both cases the affected
 * mappings and page cache beyond the new size are dropped first.
 *
 * If everything succeeded and the inode is left with no clusters, an
 * attempt is made to remove its refcount tree.
 *
 * Returns 0 on success or a negative errno.
 */
int ocfs2_truncate_file(struct inode *inode,
			struct buffer_head *di_bh,
			u64 new_i_size)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	/*
	 * We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it
	 */
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)di_bh->b_data;
	int status = 0;

	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)le64_to_cpu(fe->i_size),
				  (unsigned long long)new_i_size);

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			" Inode %llu, inode i_size = %lld != di "
			" i_size = %llu, i_flags = 0x%x \n ",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		trace_ocfs2_truncate_file_error(
			(unsigned long long)le64_to_cpu(fe->i_size),
			(unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		/* Non-zero status: the refcount tree check does not apply. */
		return status;
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &OCFS2_I(inode)->ip_la_data_resv);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		unmap_mapping_range(inode->i_mapping,
				    new_i_size + PAGE_SIZE - 1, 0, 1);
		truncate_inode_pages(inode->i_mapping, new_i_size);
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto unlock;
	}

	/*
	 * alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size.
	 */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto unlock;
	}

	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	status = ocfs2_commit_truncate(osb, inode, di_bh);
	if (status < 0)
		mlog_errno(status);

	/* TODO: orphan dir cleanup here. */
unlock:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

	if (!status && OCFS2_I(inode)->ip_clusters == 0)
		status = ocfs2_try_remove_refcount_tree(inode, di_bh);

	return status;
}
/*
2008-08-18 13:38:45 +04:00
* extend file allocation only here .
2005-12-16 01:31:24 +03:00
* we ' ll update all the disk stuff , and oip - > alloc_size
*
* expect stuff to be locked , a transaction started and enough data /
* metadata reservations in the contexts .
*
* Will return - EAGAIN , and a reason if a restart is needed .
* If passed in , * reason will always be set , even in error .
*/
2008-08-18 13:38:45 +04:00
int ocfs2_add_inode_data ( struct ocfs2_super * osb ,
struct inode * inode ,
u32 * logical_offset ,
u32 clusters_to_add ,
int mark_unwritten ,
struct buffer_head * fe_bh ,
handle_t * handle ,
struct ocfs2_alloc_context * data_ac ,
struct ocfs2_alloc_context * meta_ac ,
enum ocfs2_alloc_restarted * reason_ret )
2005-12-16 01:31:24 +03:00
{
2008-08-21 06:36:33 +04:00
struct ocfs2_extent_tree et ;
2005-12-16 01:31:24 +03:00
2009-02-13 14:54:22 +03:00
ocfs2_init_dinode_extent_tree ( & et , INODE_CACHE ( inode ) , fe_bh ) ;
2022-03-23 00:38:42 +03:00
return ocfs2_add_clusters_in_btree ( handle , & et , logical_offset ,
clusters_to_add , mark_unwritten ,
data_ac , meta_ac , reason_ret ) ;
2005-12-16 01:31:24 +03:00
}
2018-06-08 03:04:38 +03:00
/*
 * Add @clusters_to_add clusters to @inode, starting at logical cluster
 * @logical_start, optionally marking the new extents unwritten.
 *
 * The work is done in a loop:
 *  - "restart_all" re-locks the allocators and starts a fresh transaction
 *    (used when the allocation code reports RESTART_META);
 *  - "restarted_transaction" reuses the extended handle (RESTART_TRANS).
 *
 * Quota is charged for the outstanding clusters before every attempt and
 * the part that was not consumed is released afterwards.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				   u32 clusters_to_add, int mark_unwritten)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_extent_tree et;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	handle_t *handle = NULL;
	u32 clusters_before;
	int credits;
	int redo_everything = 0;
	int quota_charged = 0;
	int ret = 0;

	/* Unwritten extents only exist on file systems that support holes. */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	ret = ocfs2_read_inode_block(inode, &di_bh);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}
	di = (struct ocfs2_dinode *)di_bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(di->i_clusters) != oi->ip_clusters);

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				    &data_ac, &meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &di->id2.i_list);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto leave;
	}

restarted_transaction:
	trace_ocfs2_extend_allocation(
		(unsigned long long)oi->ip_blkno,
		(unsigned long long)i_size_read(inode),
		le32_to_cpu(di->i_clusters), clusters_to_add,
		reason, redo_everything);

	ret = dquot_alloc_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (ret)
		goto leave;
	quota_charged = 1;

	/*
	 * Reserve a write to the file entry early on, so that if we run
	 * out of credits in the allocation path we can still update i_size.
	 */
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	clusters_before = oi->ip_clusters;

	ret = ocfs2_add_inode_data(osb, inode, &logical_start,
				   clusters_to_add, mark_unwritten,
				   di_bh, handle, data_ac, meta_ac, &reason);
	if (ret < 0 && ret != -EAGAIN) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto leave;
	}

	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	ocfs2_journal_dirty(handle, di_bh);

	/* Account for whatever was actually added in this round. */
	spin_lock(&oi->ip_lock);
	clusters_to_add -= (oi->ip_clusters - clusters_before);
	spin_unlock(&oi->ip_lock);

	/* Release unused quota reservation */
	dquot_free_space(inode,
			 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	quota_charged = 0;

	if (reason != RESTART_NONE && clusters_to_add) {
		if (reason != RESTART_META) {
			BUG_ON(reason != RESTART_TRANS);

			ret = ocfs2_allocate_extend_trans(handle, 1);
			if (ret < 0) {
				/* handle still has to be committed at
				 * this point. */
				ret = -ENOMEM;
				mlog_errno(ret);
				goto leave;
			}
			goto restarted_transaction;
		}

		/* RESTART_META: redo allocator locking and transaction. */
		redo_everything = 1;
		ret = 0;
	}

	trace_ocfs2_extend_allocation_end(oi->ip_blkno,
		le32_to_cpu(di->i_clusters),
		(unsigned long long)le64_to_cpu(di->i_size),
		oi->ip_clusters,
		(unsigned long long)i_size_read(inode));

leave:
	if (ret < 0 && quota_charged)
		dquot_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));

	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}

	if (!ret && redo_everything) {
		redo_everything = 0;
		goto restart_all;
	}

	brelse(di_bh);
	return ret;
}
2010-07-07 01:36:06 +04:00
/*
* While a write will already be ordering the data , a truncate will not .
* Thus , we need to explicitly order the zeroed pages .
*/
2014-02-11 02:25:53 +04:00
static handle_t * ocfs2_zero_start_ordered_transaction ( struct inode * inode ,
2019-09-24 01:33:08 +03:00
struct buffer_head * di_bh ,
loff_t start_byte ,
loff_t length )
2010-07-07 01:36:06 +04:00
{
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
handle_t * handle = NULL ;
int ret = 0 ;
if ( ! ocfs2_should_order_data ( inode ) )
goto out ;
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
if ( IS_ERR ( handle ) ) {
ret = - ENOMEM ;
mlog_errno ( ret ) ;
goto out ;
}
2019-09-24 01:33:08 +03:00
ret = ocfs2_jbd2_inode_add_write ( handle , inode , start_byte , length ) ;
2014-02-11 02:25:53 +04:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
goto out ;
}
ret = ocfs2_journal_access_di ( handle , INODE_CACHE ( inode ) , di_bh ,
OCFS2_JOURNAL_ACCESS_WRITE ) ;
if ( ret )
2010-07-07 01:36:06 +04:00
mlog_errno ( ret ) ;
2014-04-04 01:47:08 +04:00
ocfs2_update_inode_fsync_trans ( handle , inode , 1 ) ;
2010-07-07 01:36:06 +04:00
out :
if ( ret ) {
if ( ! IS_ERR ( handle ) )
ocfs2_commit_trans ( osb , handle ) ;
handle = ERR_PTR ( ret ) ;
}
return handle ;
}
2005-12-16 01:31:24 +03:00
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
2008-10-30 00:00:55 +03:00
* worry about recursive locking in - > write_begin ( ) and - > write_end ( ) . */
2010-07-07 01:36:06 +04:00
static int ocfs2_write_zero_page ( struct inode * inode , u64 abs_from ,
2014-02-11 02:25:53 +04:00
u64 abs_to , struct buffer_head * di_bh )
2005-12-16 01:31:24 +03:00
{
struct address_space * mapping = inode - > i_mapping ;
struct page * page ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
unsigned long index = abs_from > > PAGE_SHIFT ;
2014-10-10 02:25:15 +04:00
handle_t * handle ;
2010-07-17 00:32:33 +04:00
int ret = 0 ;
2010-07-07 01:36:06 +04:00
unsigned zero_from , zero_to , block_start , block_end ;
2014-02-11 02:25:53 +04:00
struct ocfs2_dinode * di = ( struct ocfs2_dinode * ) di_bh - > b_data ;
2005-12-16 01:31:24 +03:00
2010-07-07 01:36:06 +04:00
BUG_ON ( abs_from > = abs_to ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
BUG_ON ( abs_to > ( ( ( u64 ) index + 1 ) < < PAGE_SHIFT ) ) ;
2010-07-07 01:36:06 +04:00
BUG_ON ( abs_from & ( inode - > i_blkbits - 1 ) ) ;
2005-12-16 01:31:24 +03:00
2019-09-24 01:33:08 +03:00
handle = ocfs2_zero_start_ordered_transaction ( inode , di_bh ,
abs_from ,
abs_to - abs_from ) ;
2014-10-10 02:25:15 +04:00
if ( IS_ERR ( handle ) ) {
ret = PTR_ERR ( handle ) ;
goto out ;
}
2010-08-24 16:28:03 +04:00
page = find_or_create_page ( mapping , index , GFP_NOFS ) ;
2005-12-16 01:31:24 +03:00
if ( ! page ) {
ret = - ENOMEM ;
mlog_errno ( ret ) ;
2014-10-10 02:25:15 +04:00
goto out_commit_trans ;
2005-12-16 01:31:24 +03:00
}
2010-07-07 01:36:06 +04:00
/* Get the offsets within the page that we want to zero */
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
zero_from = abs_from & ( PAGE_SIZE - 1 ) ;
zero_to = abs_to & ( PAGE_SIZE - 1 ) ;
2010-07-07 01:36:06 +04:00
if ( ! zero_to )
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
zero_to = PAGE_SIZE ;
2005-12-16 01:31:24 +03:00
2011-02-22 17:14:41 +03:00
trace_ocfs2_write_zero_page (
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
( unsigned long long ) abs_from ,
( unsigned long long ) abs_to ,
index , zero_from , zero_to ) ;
2010-07-02 02:13:31 +04:00
2010-07-07 01:36:06 +04:00
/* We know that zero_from is block aligned */
for ( block_start = zero_from ; block_start < zero_to ;
block_start = block_end ) {
2017-02-28 01:28:32 +03:00
block_end = block_start + i_blocksize ( inode ) ;
2010-07-07 01:36:06 +04:00
/*
2010-10-06 12:47:23 +04:00
* block_start is block - aligned . Bump it by one to force
* __block_write_begin and block_commit_write to zero the
2010-07-07 01:36:06 +04:00
* whole block .
*/
2010-10-06 12:47:23 +04:00
ret = __block_write_begin ( page , block_start + 1 , 0 ,
ocfs2_get_block ) ;
2010-07-07 01:36:06 +04:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
2005-12-16 01:31:24 +03:00
goto out_unlock ;
}
2010-07-07 01:36:06 +04:00
/* must not update i_size! */
ret = block_commit_write ( page , block_start + 1 ,
block_start + 1 ) ;
if ( ret < 0 )
mlog_errno ( ret ) ;
else
ret = 0 ;
}
2005-12-16 01:31:24 +03:00
2014-10-10 02:25:15 +04:00
/*
* fs - writeback will release the dirty pages without page lock
* whose offset are over inode size , the release happens at
* block_write_full_page ( ) .
*/
i_size_write ( inode , abs_to ) ;
inode - > i_blocks = ocfs2_inode_sector_count ( inode ) ;
di - > i_size = cpu_to_le64 ( ( u64 ) i_size_read ( inode ) ) ;
2016-09-14 17:48:04 +03:00
inode - > i_mtime = inode - > i_ctime = current_time ( inode ) ;
2014-10-10 02:25:15 +04:00
di - > i_mtime = di - > i_ctime = cpu_to_le64 ( inode - > i_mtime . tv_sec ) ;
di - > i_ctime_nsec = cpu_to_le32 ( inode - > i_mtime . tv_nsec ) ;
di - > i_mtime_nsec = di - > i_ctime_nsec ;
2014-02-11 02:25:53 +04:00
if ( handle ) {
ocfs2_journal_dirty ( handle , di_bh ) ;
2014-04-04 01:47:08 +04:00
ocfs2_update_inode_fsync_trans ( handle , inode , 1 ) ;
2014-02-11 02:25:53 +04:00
}
2010-07-07 01:36:06 +04:00
2005-12-16 01:31:24 +03:00
out_unlock :
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2014-10-10 02:25:15 +04:00
out_commit_trans :
if ( handle )
ocfs2_commit_trans ( OCFS2_SB ( inode - > i_sb ) , handle ) ;
2005-12-16 01:31:24 +03:00
out :
return ret ;
}
2010-07-02 02:13:31 +04:00
/*
 * Find the next range to zero.  We do this in terms of bytes because
 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 * pagecache.  We may return multiple extents.
 *
 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 * needs to be zeroed.  range_start and range_end return the next zeroing
 * range.  A subsequent call should pass the previous range_end as its
 * zero_start.  If range_end is 0, there's nothing to do.
 *
 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 *
 * Returns 0 on success or the error from ocfs2_get_clusters() /
 * ocfs2_refcount_cow().
 */
static int ocfs2_zero_extend_get_range(struct inode *inode,
				       struct buffer_head *di_bh,
				       u64 zero_start, u64 zero_end,
				       u64 *range_start, u64 *range_end)
{
	int rc = 0, must_cow = 0;
	u32 phys_cpos, run_len = 0;
	u32 cpos = zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	u32 end_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	unsigned int extent_len = 0;
	unsigned int flags = 0;

	/* Skip holes and unwritten extents until a written extent shows up. */
	for (; cpos < end_cpos; cpos += extent_len) {
		rc = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					&extent_len, &flags);
		if (rc) {
			mlog_errno(rc);
			return rc;
		}

		if (!phys_cpos || (flags & OCFS2_EXT_UNWRITTEN))
			continue;

		run_len = extent_len;
		if (flags & OCFS2_EXT_REFCOUNTED)
			must_cow = 1;
		break;
	}

	if (!run_len) {
		*range_end = 0;
		return rc;
	}

	/* Grow the run over every directly following written extent. */
	while (cpos + run_len < end_cpos) {
		rc = ocfs2_get_clusters(inode, cpos + run_len,
					&phys_cpos, &extent_len, &flags);
		if (rc) {
			mlog_errno(rc);
			return rc;
		}

		if (!phys_cpos || (flags & OCFS2_EXT_UNWRITTEN))
			break;
		if (flags & OCFS2_EXT_REFCOUNTED)
			must_cow = 1;
		run_len += extent_len;
	}

	/* Never report more than the caller asked about. */
	if (cpos + run_len > end_cpos)
		run_len = end_cpos - cpos;

	if (must_cow) {
		rc = ocfs2_refcount_cow(inode, di_bh, cpos, run_len,
					UINT_MAX);
		if (rc) {
			mlog_errno(rc);
			return rc;
		}
	}

	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, cpos);
	*range_end = ocfs2_clusters_to_bytes(inode->i_sb, cpos + run_len);

	return rc;
}
/*
 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 * has made sure that the entire range needs zeroing.
 *
 * The range [@range_start, @range_end) is walked one page at a time and
 * each piece is handed to ocfs2_write_zero_page().
 *
 * Returns 0 on success or the first error from ocfs2_write_zero_page().
 */
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				   u64 range_end, struct buffer_head *di_bh)
{
	int rc = 0;
	u64 next_pos;
	u64 zero_pos = range_start;

	trace_ocfs2_zero_extend_range(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)range_start,
			(unsigned long long)range_end);
	BUG_ON(range_start >= range_end);

	while (zero_pos < range_end) {
		/*
		 * Round down to the page start with a 64-bit mask.
		 * PAGE_MASK is an unsigned long, so on 32-bit kernels
		 * "zero_pos & PAGE_MASK" would drop the upper 32 bits of
		 * the file offset.
		 */
		next_pos = (zero_pos & ~((u64)PAGE_SIZE - 1)) + PAGE_SIZE;
		if (next_pos > range_end)
			next_pos = range_end;

		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
		if (rc < 0) {
			mlog_errno(rc);
			break;
		}
		zero_pos = next_pos;

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

	return rc;
}
/*
 * Zero everything that needs zeroing between the current i_size (rounded
 * to a block boundary by ocfs2_align_bytes_to_blocks()) and
 * @zero_to_size.  Ranges are obtained from ocfs2_zero_extend_get_range()
 * and zeroed by ocfs2_zero_extend_range() until nothing is left.
 *
 * Returns 0 on success or the first error reported by either helper.
 */
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to_size)
{
	struct super_block *sb = inode->i_sb;
	u64 pos, chunk_start = 0, chunk_end = 0;
	int ret = 0;

	pos = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				(unsigned long long)pos,
				(unsigned long long)i_size_read(inode));

	while (pos < zero_to_size) {
		ret = ocfs2_zero_extend_get_range(inode, di_bh, pos,
						  zero_to_size,
						  &chunk_start, &chunk_end);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		/* A zero end means there is nothing further to zero. */
		if (!chunk_end)
			return ret;

		/* Trim the ends */
		if (chunk_start < pos)
			chunk_start = pos;
		if (chunk_end > zero_to_size)
			chunk_end = zero_to_size;

		ret = ocfs2_zero_extend_range(inode, chunk_start,
					      chunk_end, di_bh);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		pos = chunk_end;
	}

	return ret;
}
2010-07-02 02:13:31 +04:00
/*
 * Extend @inode without leaving holes: allocate whatever clusters are
 * missing to cover @new_i_size, then zero up to @zero_to.
 *
 * Returns 0 on success or a negative error code from
 * ocfs2_extend_allocation() / ocfs2_zero_extend().
 */
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	u32 wanted_clusters;
	u32 clusters_to_add;
	int ret;

	/*
	 * Only quota files call this without a bh, and they can't be
	 * refcounted.
	 */
	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));

	/* How many clusters beyond the current allocation are required? */
	wanted_clusters = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	clusters_to_add = wanted_clusters < oi->ip_clusters ?
			  0 : wanted_clusters - oi->ip_clusters;

	if (clusters_to_add) {
		ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
					      clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree.  We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	if (ret < 0)
		mlog_errno(ret);

	return ret;
}
2005-12-16 01:31:24 +03:00
/*
 * Grow the file to new_i_size and record the new size in the inode.
 *
 * di_bh must not be NULL. Nothing is done, and 0 is returned, when
 * new_i_size is 0 or already equals the current i_size; a new_i_size
 * smaller than the current i_size trips a BUG_ON().
 *
 * With ip_alloc_sem held for write, inline-data inodes either stay inline
 * (when the new size still fits) or are converted to extents. After that
 * the file is zero-extended on sparse-alloc volumes, or extended through
 * ocfs2_extend_no_holes() otherwise. The semaphore is released before the
 * size update.
 *
 * Returns 0 on the early-exit paths, otherwise the status of the helper
 * that failed or of ocfs2_simple_size_update().
 */
static int ocfs2_extend_file ( struct inode * inode ,
struct buffer_head * di_bh ,
2007-08-29 04:13:23 +04:00
u64 new_i_size )
2005-12-16 01:31:24 +03:00
{
2007-10-19 02:23:46 +04:00
int ret = 0 ;
2007-09-08 01:46:51 +04:00
struct ocfs2_inode_info * oi = OCFS2_I ( inode ) ;
2005-12-16 01:31:24 +03:00
2007-08-29 04:13:23 +04:00
BUG_ON ( ! di_bh ) ;
2006-05-06 06:04:03 +04:00
2005-12-16 01:31:24 +03:00
/* setattr sometimes calls us like this. */
if ( new_i_size = = 0 )
goto out ;
if ( i_size_read ( inode ) = = new_i_size )
2010-07-02 02:13:31 +04:00
goto out ;
2005-12-16 01:31:24 +03:00
BUG_ON ( new_i_size < i_size_read ( inode ) ) ;
2006-10-04 04:44:42 +04:00
/*
2007-08-29 04:13:23 +04:00
* The alloc sem blocks people in read / write from reading our
* allocation until we ' re done changing it . We depend on
2022-03-23 00:38:45 +03:00
* i_rwsem to block other extend / truncate calls while we ' re
2010-07-02 02:13:31 +04:00
* here . We even have to hold it for sparse files because there
* might be some tail zeroing .
2006-10-04 04:44:42 +04:00
*/
2007-09-08 01:46:51 +04:00
down_write ( & oi - > ip_alloc_sem ) ;
if ( oi - > ip_dyn_features & OCFS2_INLINE_DATA_FL ) {
/*
* We can optimize small extends by keeping the inodes
* inline data .
*/
if ( ocfs2_size_fits_inline_data ( di_bh , new_i_size ) ) {
up_write ( & oi - > ip_alloc_sem ) ;
goto out_update_size ;
}
/* Too big to stay inline: move the data out into extents first. */
ret = ocfs2_convert_inline_data_to_extents ( inode , di_bh ) ;
if ( ret ) {
up_write ( & oi - > ip_alloc_sem ) ;
mlog_errno ( ret ) ;
2007-10-19 02:23:46 +04:00
goto out ;
2007-09-08 01:46:51 +04:00
}
}
2010-07-02 02:13:31 +04:00
/*
 * Sparse-alloc volumes only zero-extend here; otherwise
 * ocfs2_extend_no_holes() is asked to cover and zero up to new_i_size.
 */
if ( ocfs2_sparse_alloc ( OCFS2_SB ( inode - > i_sb ) ) )
ret = ocfs2_zero_extend ( inode , di_bh , new_i_size ) ;
else
ret = ocfs2_extend_no_holes ( inode , di_bh , new_i_size ,
new_i_size ) ;
2007-09-08 01:46:51 +04:00
up_write ( & oi - > ip_alloc_sem ) ;
2007-08-29 04:13:23 +04:00
2006-10-04 04:44:42 +04:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
2007-10-19 02:23:46 +04:00
goto out ;
2006-05-06 06:04:03 +04:00
}
2007-01-17 23:53:31 +03:00
out_update_size :
2007-08-29 04:13:23 +04:00
ret = ocfs2_simple_size_update ( inode , di_bh , new_i_size ) ;
if ( ret < 0 )
mlog_errno ( ret ) ;
2005-12-16 01:31:24 +03:00
out :
return ret ;
}
2021-01-21 16:19:43 +03:00
/*
 * Apply the attribute changes described by attr to the inode behind dentry.
 *
 * Returns 0 straight away when attr->ia_valid carries none of the
 * OCFS2_VALID_ATTRS bits. Otherwise returns 0 on success or the status of
 * the step that failed; a failed truncate/extend is always reported as
 * -ENOSPC.
 *
 * Outline of the body:
 *  - ATTR_SIZE is dropped for symlinks.
 *  - A size change on a regular file waits for outstanding direct I/O and
 *    takes the rw lock before the inode cluster lock.
 *  - The inode cluster lock is taken through ocfs2_inode_lock_tracker().
 *  - The size change is carried out by ocfs2_truncate_file() or
 *    ocfs2_extend_file().
 *  - For a uid/gid change the target dquots are looked up before the
 *    transaction starts, then transferred inside it.
 *  - The remaining attributes are copied into the inode and the inode is
 *    marked dirty within the transaction.
 *  - On success with ATTR_MODE set, ocfs2_acl_chmod() runs before the
 *    cluster lock is released.
 */
int ocfs2_setattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
struct iattr * attr )
2005-12-16 01:31:24 +03:00
{
int status = 0 , size_change ;
ocfs2: fix BUG_ON() in ocfs2_ci_checkpointed()
PID: 614 TASK: ffff882a739da580 CPU: 3 COMMAND: "ocfs2dc"
#0 [ffff882ecc3759b0] machine_kexec at ffffffff8103b35d
#1 [ffff882ecc375a20] crash_kexec at ffffffff810b95b5
#2 [ffff882ecc375af0] oops_end at ffffffff815091d8
#3 [ffff882ecc375b20] die at ffffffff8101868b
#4 [ffff882ecc375b50] do_trap at ffffffff81508bb0
#5 [ffff882ecc375ba0] do_invalid_op at ffffffff810165e5
#6 [ffff882ecc375c40] invalid_op at ffffffff815116fb
[exception RIP: ocfs2_ci_checkpointed+208]
RIP: ffffffffa0a7e940 RSP: ffff882ecc375cf0 RFLAGS: 00010002
RAX: 0000000000000001 RBX: 000000000000654b RCX: ffff8812dc83f1f8
RDX: 00000000000017d9 RSI: ffff8812dc83f1f8 RDI: ffffffffa0b2c318
RBP: ffff882ecc375d20 R8: ffff882ef6ecfa60 R9: ffff88301f272200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff
R13: ffff8812dc83f4f0 R14: 0000000000000000 R15: ffff8812dc83f1f8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff882ecc375d28] ocfs2_check_meta_downconvert at ffffffffa0a7edbd [ocfs2]
#8 [ffff882ecc375d38] ocfs2_unblock_lock at ffffffffa0a84af8 [ocfs2]
#9 [ffff882ecc375dc8] ocfs2_process_blocked_lock at ffffffffa0a85285 [ocfs2]
#10 [ffff882ecc375e18] ocfs2_downconvert_thread_do_work at ffffffffa0a85445 [ocfs2]
#11 [ffff882ecc375e68] ocfs2_downconvert_thread at ffffffffa0a854de [ocfs2]
#12 [ffff882ecc375ee8] kthread at ffffffff81090da7
#13 [ffff882ecc375f48] kernel_thread_helper at ffffffff81511884
assert is tripped because the tran is not checkpointed and the lock level is PR.
Some time ago, chmod command had been executed. As result, the following call
chain left the inode cluster lock in PR state, latter on causing the assert.
system_call_fastpath
-> my_chmod
-> sys_chmod
-> sys_fchmodat
-> notify_change
-> ocfs2_setattr
-> posix_acl_chmod
-> ocfs2_iop_set_acl
-> ocfs2_set_acl
-> ocfs2_acl_set_mode
Here is how.
1119 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1120 {
1247 ocfs2_inode_unlock(inode, 1); <<< WRONG thing to do.
..
1258 if (!status && attr->ia_valid & ATTR_MODE) {
1259 status = posix_acl_chmod(inode, inode->i_mode);
519 posix_acl_chmod(struct inode *inode, umode_t mode)
520 {
..
539 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
287 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, ...
288 {
289 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
224 int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, ...
231 {
..
252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
168 static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head ...
170 {
183 if (handle == NULL) {
>>> BUG: inode lock not held in ex at this point <<<
184 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
185 OCFS2_INODE_UPDATE_CREDITS);
ocfs2_setattr.#1247 we unlock and at #1259 call posix_acl_chmod. When we reach
ocfs2_acl_set_mode.#181 and do trans, the inode cluster lock is not held in EX
mode (it should be). How this could have happended?
We are the lock master, were holding lock EX and have released it in
ocfs2_setattr.#1247. Note that there are no holders of this lock at
this point. Another node needs the lock in PR, and we downconvert from
EX to PR. So the inode lock is PR when do the trans in
ocfs2_acl_set_mode.#184. The trans stays in core (not flushed to disc).
Now another node want the lock in EX, downconvert thread gets kicked
(the one that tripped assert abovt), finds an unflushed trans but the
lock is not EX (it is PR). If the lock was at EX, it would have flushed
the trans ocfs2_ci_checkpointed -> ocfs2_start_checkpoint before
downconverting (to NULL) for the request.
ocfs2_setattr must not drop inode lock ex in this code path. If it
does, takes it again before the trans, say in ocfs2_set_acl, another
cluster node can get in between, execute another setattr, overwriting
the one in progress on this node, resulting in a mode acl size combo
that is a mix of the two.
Orabug: 20189959
Signed-off-by: Tariq Saeed <tariq.x.saeed@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 01:44:31 +03:00
int inode_locked = 0 ;
2015-03-18 01:25:59 +03:00
struct inode * inode = d_inode ( dentry ) ;
2005-12-16 01:31:24 +03:00
struct super_block * sb = inode - > i_sb ;
struct ocfs2_super * osb = OCFS2_SB ( sb ) ;
struct buffer_head * bh = NULL ;
2006-10-10 05:11:45 +04:00
handle_t * handle = NULL ;
2009-06-02 16:24:01 +04:00
struct dquot * transfer_to [ MAXQUOTAS ] = { } ;
2010-05-13 22:18:45 +04:00
int qtype ;
2017-02-23 02:40:44 +03:00
int had_lock ;
struct ocfs2_lock_holder oh ;
2005-12-16 01:31:24 +03:00
2011-02-22 17:14:41 +03:00
trace_ocfs2_setattr ( inode , dentry ,
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
dentry - > d_name . len , dentry - > d_name . name ,
attr - > ia_valid , attr - > ia_mode ,
2013-02-01 05:33:53 +04:00
from_kuid ( & init_user_ns , attr - > ia_uid ) ,
from_kgid ( & init_user_ns , attr - > ia_gid ) ) ;
2005-12-16 01:31:24 +03:00
2008-04-18 21:23:53 +04:00
/* ensuring we don't even attempt to truncate a symlink */
if ( S_ISLNK ( inode - > i_mode ) )
attr - > ia_valid & = ~ ATTR_SIZE ;
2005-12-16 01:31:24 +03:00
# define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
| ATTR_GID | ATTR_UID | ATTR_MODE )
2011-02-22 17:14:41 +03:00
if ( ! ( attr - > ia_valid & OCFS2_VALID_ATTRS ) )
2005-12-16 01:31:24 +03:00
return 0 ;
2021-01-21 16:19:26 +03:00
/*
 * NOTE(review): setattr_prepare() here and setattr_copy() further down
 * are passed &init_user_ns, while is_quota_modification() is passed
 * mnt_userns - confirm the mix is intended.
 */
status = setattr_prepare ( & init_user_ns , dentry , attr ) ;
2005-12-16 01:31:24 +03:00
if ( status )
return status ;
2022-06-21 17:14:54 +03:00
if ( is_quota_modification ( mnt_userns , inode , attr ) ) {
2015-07-14 14:36:02 +03:00
status = dquot_initialize ( inode ) ;
if ( status )
return status ;
}
2005-12-16 01:31:24 +03:00
/* Size changes are only acted on for regular files. */
size_change = S_ISREG ( inode - > i_mode ) & & attr - > ia_valid & ATTR_SIZE ;
if ( size_change ) {
2017-11-16 04:31:40 +03:00
/*
* Here we should wait dio to finish before inode lock
* to avoid a deadlock between ocfs2_setattr ( ) and
* ocfs2_dio_end_io_write ( )
*/
inode_dio_wait ( inode ) ;
2005-12-16 01:31:24 +03:00
status = ocfs2_rw_lock ( inode , 1 ) ;
if ( status < 0 ) {
mlog_errno ( status ) ;
goto bail ;
}
}
2017-02-23 02:40:44 +03:00
had_lock = ocfs2_inode_lock_tracker ( inode , & bh , 1 , & oh ) ;
if ( had_lock < 0 ) {
status = had_lock ;
2005-12-16 01:31:24 +03:00
goto bail_unlock_rw ;
2017-02-23 02:40:44 +03:00
} else if ( had_lock ) {
/*
* As far as we know , ocfs2_setattr ( ) could only be the first
* VFS entry point in the call chain of recursive cluster
* locking issue .
*
* For instance :
* chmod_common ( )
* notify_change ( )
* ocfs2_setattr ( )
* posix_acl_chmod ( )
* ocfs2_iop_get_acl ( )
*
* But , we ' re not 100 % sure if it ' s always true , because the
* ordering of the VFS entry points in the call chain is out
* of our control . So , we ' d better dump the stack here to
* catch the other cases of recursive locking .
*/
mlog ( ML_ERROR , " Another case of recursive locking: \n " ) ;
dump_stack ( ) ;
2005-12-16 01:31:24 +03:00
}
ocfs2: fix BUG_ON() in ocfs2_ci_checkpointed()
PID: 614 TASK: ffff882a739da580 CPU: 3 COMMAND: "ocfs2dc"
#0 [ffff882ecc3759b0] machine_kexec at ffffffff8103b35d
#1 [ffff882ecc375a20] crash_kexec at ffffffff810b95b5
#2 [ffff882ecc375af0] oops_end at ffffffff815091d8
#3 [ffff882ecc375b20] die at ffffffff8101868b
#4 [ffff882ecc375b50] do_trap at ffffffff81508bb0
#5 [ffff882ecc375ba0] do_invalid_op at ffffffff810165e5
#6 [ffff882ecc375c40] invalid_op at ffffffff815116fb
[exception RIP: ocfs2_ci_checkpointed+208]
RIP: ffffffffa0a7e940 RSP: ffff882ecc375cf0 RFLAGS: 00010002
RAX: 0000000000000001 RBX: 000000000000654b RCX: ffff8812dc83f1f8
RDX: 00000000000017d9 RSI: ffff8812dc83f1f8 RDI: ffffffffa0b2c318
RBP: ffff882ecc375d20 R8: ffff882ef6ecfa60 R9: ffff88301f272200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff
R13: ffff8812dc83f4f0 R14: 0000000000000000 R15: ffff8812dc83f1f8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff882ecc375d28] ocfs2_check_meta_downconvert at ffffffffa0a7edbd [ocfs2]
#8 [ffff882ecc375d38] ocfs2_unblock_lock at ffffffffa0a84af8 [ocfs2]
#9 [ffff882ecc375dc8] ocfs2_process_blocked_lock at ffffffffa0a85285 [ocfs2]
#10 [ffff882ecc375e18] ocfs2_downconvert_thread_do_work at ffffffffa0a85445 [ocfs2]
#11 [ffff882ecc375e68] ocfs2_downconvert_thread at ffffffffa0a854de [ocfs2]
#12 [ffff882ecc375ee8] kthread at ffffffff81090da7
#13 [ffff882ecc375f48] kernel_thread_helper at ffffffff81511884
assert is tripped because the tran is not checkpointed and the lock level is PR.
Some time ago, chmod command had been executed. As result, the following call
chain left the inode cluster lock in PR state, latter on causing the assert.
system_call_fastpath
-> my_chmod
-> sys_chmod
-> sys_fchmodat
-> notify_change
-> ocfs2_setattr
-> posix_acl_chmod
-> ocfs2_iop_set_acl
-> ocfs2_set_acl
-> ocfs2_acl_set_mode
Here is how.
1119 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1120 {
1247 ocfs2_inode_unlock(inode, 1); <<< WRONG thing to do.
..
1258 if (!status && attr->ia_valid & ATTR_MODE) {
1259 status = posix_acl_chmod(inode, inode->i_mode);
519 posix_acl_chmod(struct inode *inode, umode_t mode)
520 {
..
539 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
287 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, ...
288 {
289 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
224 int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, ...
231 {
..
252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
168 static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head ...
170 {
183 if (handle == NULL) {
>>> BUG: inode lock not held in ex at this point <<<
184 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
185 OCFS2_INODE_UPDATE_CREDITS);
ocfs2_setattr.#1247 we unlock and at #1259 call posix_acl_chmod. When we reach
ocfs2_acl_set_mode.#181 and do trans, the inode cluster lock is not held in EX
mode (it should be). How this could have happended?
We are the lock master, were holding lock EX and have released it in
ocfs2_setattr.#1247. Note that there are no holders of this lock at
this point. Another node needs the lock in PR, and we downconvert from
EX to PR. So the inode lock is PR when do the trans in
ocfs2_acl_set_mode.#184. The trans stays in core (not flushed to disc).
Now another node want the lock in EX, downconvert thread gets kicked
(the one that tripped assert abovt), finds an unflushed trans but the
lock is not EX (it is PR). If the lock was at EX, it would have flushed
the trans ocfs2_ci_checkpointed -> ocfs2_start_checkpoint before
downconverting (to NULL) for the request.
ocfs2_setattr must not drop inode lock ex in this code path. If it
does, takes it again before the trans, say in ocfs2_set_acl, another
cluster node can get in between, execute another setattr, overwriting
the one in progress on this node, resulting in a mode acl size combo
that is a mix of the two.
Orabug: 20189959
Signed-off-by: Tariq Saeed <tariq.x.saeed@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 01:44:31 +03:00
/*
 * Record that the cluster lock is held so the exit path below knows it
 * still has to release it.
 */
inode_locked = 1 ;
2005-12-16 01:31:24 +03:00
2014-02-11 02:25:51 +04:00
if ( size_change ) {
2010-02-26 13:18:25 +03:00
status = inode_newsize_ok ( inode , attr - > ia_size ) ;
if ( status )
2007-07-20 23:02:14 +04:00
goto bail_unlock ;
2014-02-11 02:25:51 +04:00
/*
 * A smaller or unchanged size goes through the truncate path; a
 * larger one goes through ocfs2_extend_file().
 */
if ( i_size_read ( inode ) > = attr - > ia_size ) {
2008-09-04 07:03:41 +04:00
if ( ocfs2_should_order_data ( inode ) ) {
status = ocfs2_begin_ordered_truncate ( inode ,
attr - > ia_size ) ;
if ( status )
goto bail_unlock ;
}
2005-12-16 01:31:24 +03:00
status = ocfs2_truncate_file ( inode , bh , attr - > ia_size ) ;
2008-09-04 07:03:41 +04:00
} else
2007-08-29 04:13:23 +04:00
status = ocfs2_extend_file ( inode , bh , attr - > ia_size ) ;
2005-12-16 01:31:24 +03:00
if ( status < 0 ) {
/*
 * Every truncate/extend failure is returned as -ENOSPC; any
 * other code is only logged before being overwritten.
 */
if ( status ! = - ENOSPC )
mlog_errno ( status ) ;
status = - ENOSPC ;
goto bail_unlock ;
}
}
2013-02-01 05:35:29 +04:00
if ( ( attr - > ia_valid & ATTR_UID & & ! uid_eq ( attr - > ia_uid , inode - > i_uid ) ) | |
( attr - > ia_valid & ATTR_GID & & ! gid_eq ( attr - > ia_gid , inode - > i_gid ) ) ) {
2009-06-02 16:24:01 +04:00
/*
* Gather pointers to quota structures so that allocation /
* freeing of quota structures happens here and not inside
2010-03-03 17:05:03 +03:00
* dquot_transfer ( ) where we have problems with lock ordering
2009-06-02 16:24:01 +04:00
*/
2013-02-01 05:35:29 +04:00
if ( attr - > ia_valid & ATTR_UID & & ! uid_eq ( attr - > ia_uid , inode - > i_uid )
2008-10-09 21:38:40 +04:00
& & OCFS2_HAS_RO_COMPAT_FEATURE ( sb ,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA ) ) {
2012-09-16 14:11:50 +04:00
transfer_to [ USRQUOTA ] = dqget ( sb , make_kqid_uid ( attr - > ia_uid ) ) ;
2015-06-24 19:07:02 +03:00
if ( IS_ERR ( transfer_to [ USRQUOTA ] ) ) {
status = PTR_ERR ( transfer_to [ USRQUOTA ] ) ;
2019-10-19 06:19:47 +03:00
/* Clear the error pointer so the dqput() loop at bail never sees it. */
transfer_to [ USRQUOTA ] = NULL ;
2008-10-09 21:38:40 +04:00
goto bail_unlock ;
2009-06-02 16:24:01 +04:00
}
2008-10-09 21:38:40 +04:00
}
2013-02-01 05:35:29 +04:00
if ( attr - > ia_valid & ATTR_GID & & ! gid_eq ( attr - > ia_gid , inode - > i_gid )
2008-10-09 21:38:40 +04:00
& & OCFS2_HAS_RO_COMPAT_FEATURE ( sb ,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA ) ) {
2012-09-16 14:11:50 +04:00
transfer_to [ GRPQUOTA ] = dqget ( sb , make_kqid_gid ( attr - > ia_gid ) ) ;
2015-06-24 19:07:02 +03:00
if ( IS_ERR ( transfer_to [ GRPQUOTA ] ) ) {
status = PTR_ERR ( transfer_to [ GRPQUOTA ] ) ;
2019-10-19 06:19:47 +03:00
/* Clear the error pointer so the dqput() loop at bail never sees it. */
transfer_to [ GRPQUOTA ] = NULL ;
2008-10-09 21:38:40 +04:00
goto bail_unlock ;
2009-06-02 16:24:01 +04:00
}
2008-10-09 21:38:40 +04:00
}
2021-04-09 23:27:29 +03:00
down_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
2009-06-02 16:24:01 +04:00
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS +
2 * ocfs2_quota_trans_credits ( sb ) ) ;
2008-10-09 21:38:40 +04:00
if ( IS_ERR ( handle ) ) {
status = PTR_ERR ( handle ) ;
mlog_errno ( status ) ;
2021-04-09 23:27:29 +03:00
goto bail_unlock_alloc ;
2008-10-09 21:38:40 +04:00
}
2010-05-13 22:18:45 +04:00
status = __dquot_transfer ( inode , transfer_to ) ;
2008-10-09 21:38:40 +04:00
if ( status < 0 )
goto bail_commit ;
} else {
2021-04-09 23:27:29 +03:00
down_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
2008-10-09 21:38:40 +04:00
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
if ( IS_ERR ( handle ) ) {
status = PTR_ERR ( handle ) ;
mlog_errno ( status ) ;
2021-04-09 23:27:29 +03:00
goto bail_unlock_alloc ;
2008-10-09 21:38:40 +04:00
}
2005-12-16 01:31:24 +03:00
}
2021-01-21 16:19:26 +03:00
setattr_copy ( & init_user_ns , inode , attr ) ;
2010-06-04 13:30:02 +04:00
mark_inode_dirty ( inode ) ;
2005-12-16 01:31:24 +03:00
status = ocfs2_mark_inode_dirty ( handle , inode , bh ) ;
if ( status < 0 )
mlog_errno ( status ) ;
bail_commit :
2006-10-10 03:48:10 +04:00
ocfs2_commit_trans ( osb , handle ) ;
2021-04-09 23:27:29 +03:00
bail_unlock_alloc :
up_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
2005-12-16 01:31:24 +03:00
bail_unlock :
2017-02-23 02:40:44 +03:00
/*
 * On failure the cluster lock is dropped here. On success it stays
 * held so that the ocfs2_acl_chmod() call below runs under it.
 */
if ( status & & inode_locked ) {
ocfs2_inode_unlock_tracker ( inode , 1 , & oh , had_lock ) ;
ocfs2: fix BUG_ON() in ocfs2_ci_checkpointed()
PID: 614 TASK: ffff882a739da580 CPU: 3 COMMAND: "ocfs2dc"
#0 [ffff882ecc3759b0] machine_kexec at ffffffff8103b35d
#1 [ffff882ecc375a20] crash_kexec at ffffffff810b95b5
#2 [ffff882ecc375af0] oops_end at ffffffff815091d8
#3 [ffff882ecc375b20] die at ffffffff8101868b
#4 [ffff882ecc375b50] do_trap at ffffffff81508bb0
#5 [ffff882ecc375ba0] do_invalid_op at ffffffff810165e5
#6 [ffff882ecc375c40] invalid_op at ffffffff815116fb
[exception RIP: ocfs2_ci_checkpointed+208]
RIP: ffffffffa0a7e940 RSP: ffff882ecc375cf0 RFLAGS: 00010002
RAX: 0000000000000001 RBX: 000000000000654b RCX: ffff8812dc83f1f8
RDX: 00000000000017d9 RSI: ffff8812dc83f1f8 RDI: ffffffffa0b2c318
RBP: ffff882ecc375d20 R8: ffff882ef6ecfa60 R9: ffff88301f272200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff
R13: ffff8812dc83f4f0 R14: 0000000000000000 R15: ffff8812dc83f1f8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff882ecc375d28] ocfs2_check_meta_downconvert at ffffffffa0a7edbd [ocfs2]
#8 [ffff882ecc375d38] ocfs2_unblock_lock at ffffffffa0a84af8 [ocfs2]
#9 [ffff882ecc375dc8] ocfs2_process_blocked_lock at ffffffffa0a85285 [ocfs2]
#10 [ffff882ecc375e18] ocfs2_downconvert_thread_do_work at ffffffffa0a85445 [ocfs2]
#11 [ffff882ecc375e68] ocfs2_downconvert_thread at ffffffffa0a854de [ocfs2]
#12 [ffff882ecc375ee8] kthread at ffffffff81090da7
#13 [ffff882ecc375f48] kernel_thread_helper at ffffffff81511884
assert is tripped because the tran is not checkpointed and the lock level is PR.
Some time ago, chmod command had been executed. As result, the following call
chain left the inode cluster lock in PR state, latter on causing the assert.
system_call_fastpath
-> my_chmod
-> sys_chmod
-> sys_fchmodat
-> notify_change
-> ocfs2_setattr
-> posix_acl_chmod
-> ocfs2_iop_set_acl
-> ocfs2_set_acl
-> ocfs2_acl_set_mode
Here is how.
1119 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1120 {
1247 ocfs2_inode_unlock(inode, 1); <<< WRONG thing to do.
..
1258 if (!status && attr->ia_valid & ATTR_MODE) {
1259 status = posix_acl_chmod(inode, inode->i_mode);
519 posix_acl_chmod(struct inode *inode, umode_t mode)
520 {
..
539 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
287 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, ...
288 {
289 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
224 int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, ...
231 {
..
252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
168 static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head ...
170 {
183 if (handle == NULL) {
>>> BUG: inode lock not held in ex at this point <<<
184 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
185 OCFS2_INODE_UPDATE_CREDITS);
ocfs2_setattr.#1247 we unlock and at #1259 call posix_acl_chmod. When we reach
ocfs2_acl_set_mode.#181 and do trans, the inode cluster lock is not held in EX
mode (it should be). How this could have happended?
We are the lock master, were holding lock EX and have released it in
ocfs2_setattr.#1247. Note that there are no holders of this lock at
this point. Another node needs the lock in PR, and we downconvert from
EX to PR. So the inode lock is PR when do the trans in
ocfs2_acl_set_mode.#184. The trans stays in core (not flushed to disc).
Now another node want the lock in EX, downconvert thread gets kicked
(the one that tripped assert abovt), finds an unflushed trans but the
lock is not EX (it is PR). If the lock was at EX, it would have flushed
the trans ocfs2_ci_checkpointed -> ocfs2_start_checkpoint before
downconverting (to NULL) for the request.
ocfs2_setattr must not drop inode lock ex in this code path. If it
does, takes it again before the trans, say in ocfs2_set_acl, another
cluster node can get in between, execute another setattr, overwriting
the one in progress on this node, resulting in a mode acl size combo
that is a mix of the two.
Orabug: 20189959
Signed-off-by: Tariq Saeed <tariq.x.saeed@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 01:44:31 +03:00
inode_locked = 0 ;
}
2005-12-16 01:31:24 +03:00
bail_unlock_rw :
if ( size_change )
ocfs2_rw_unlock ( inode , 1 ) ;
bail :
2009-06-02 16:24:01 +04:00
/* Release quota pointers in case we acquired them */
2014-09-10 23:06:39 +04:00
for ( qtype = 0 ; qtype < OCFS2_MAXQUOTAS ; qtype + + )
2009-06-02 16:24:01 +04:00
dqput ( transfer_to [ qtype ] ) ;
2008-11-14 06:17:29 +03:00
if ( ! status & & attr - > ia_valid & ATTR_MODE ) {
2016-05-13 01:42:15 +03:00
status = ocfs2_acl_chmod ( inode , bh ) ;
2008-11-14 06:17:29 +03:00
if ( status < 0 )
mlog_errno ( status ) ;
}
ocfs2: fix BUG_ON() in ocfs2_ci_checkpointed()
PID: 614 TASK: ffff882a739da580 CPU: 3 COMMAND: "ocfs2dc"
#0 [ffff882ecc3759b0] machine_kexec at ffffffff8103b35d
#1 [ffff882ecc375a20] crash_kexec at ffffffff810b95b5
#2 [ffff882ecc375af0] oops_end at ffffffff815091d8
#3 [ffff882ecc375b20] die at ffffffff8101868b
#4 [ffff882ecc375b50] do_trap at ffffffff81508bb0
#5 [ffff882ecc375ba0] do_invalid_op at ffffffff810165e5
#6 [ffff882ecc375c40] invalid_op at ffffffff815116fb
[exception RIP: ocfs2_ci_checkpointed+208]
RIP: ffffffffa0a7e940 RSP: ffff882ecc375cf0 RFLAGS: 00010002
RAX: 0000000000000001 RBX: 000000000000654b RCX: ffff8812dc83f1f8
RDX: 00000000000017d9 RSI: ffff8812dc83f1f8 RDI: ffffffffa0b2c318
RBP: ffff882ecc375d20 R8: ffff882ef6ecfa60 R9: ffff88301f272200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff
R13: ffff8812dc83f4f0 R14: 0000000000000000 R15: ffff8812dc83f1f8
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff882ecc375d28] ocfs2_check_meta_downconvert at ffffffffa0a7edbd [ocfs2]
#8 [ffff882ecc375d38] ocfs2_unblock_lock at ffffffffa0a84af8 [ocfs2]
#9 [ffff882ecc375dc8] ocfs2_process_blocked_lock at ffffffffa0a85285 [ocfs2]
#10 [ffff882ecc375e18] ocfs2_downconvert_thread_do_work at ffffffffa0a85445 [ocfs2]
#11 [ffff882ecc375e68] ocfs2_downconvert_thread at ffffffffa0a854de [ocfs2]
#12 [ffff882ecc375ee8] kthread at ffffffff81090da7
#13 [ffff882ecc375f48] kernel_thread_helper at ffffffff81511884
assert is tripped because the tran is not checkpointed and the lock level is PR.
Some time ago, chmod command had been executed. As result, the following call
chain left the inode cluster lock in PR state, latter on causing the assert.
system_call_fastpath
-> my_chmod
-> sys_chmod
-> sys_fchmodat
-> notify_change
-> ocfs2_setattr
-> posix_acl_chmod
-> ocfs2_iop_set_acl
-> ocfs2_set_acl
-> ocfs2_acl_set_mode
Here is how.
1119 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1120 {
1247 ocfs2_inode_unlock(inode, 1); <<< WRONG thing to do.
..
1258 if (!status && attr->ia_valid & ATTR_MODE) {
1259 status = posix_acl_chmod(inode, inode->i_mode);
519 posix_acl_chmod(struct inode *inode, umode_t mode)
520 {
..
539 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
287 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, ...
288 {
289 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
224 int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, ...
231 {
..
252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
168 static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head ...
170 {
183 if (handle == NULL) {
>>> BUG: inode lock not held in ex at this point <<<
184 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
185 OCFS2_INODE_UPDATE_CREDITS);
ocfs2_setattr.#1247 we unlock and at #1259 call posix_acl_chmod. When we reach
ocfs2_acl_set_mode.#181 and do trans, the inode cluster lock is not held in EX
mode (it should be). How this could have happended?
We are the lock master, were holding lock EX and have released it in
ocfs2_setattr.#1247. Note that there are no holders of this lock at
this point. Another node needs the lock in PR, and we downconvert from
EX to PR. So the inode lock is PR when do the trans in
ocfs2_acl_set_mode.#184. The trans stays in core (not flushed to disc).
Now another node want the lock in EX, downconvert thread gets kicked
(the one that tripped assert abovt), finds an unflushed trans but the
lock is not EX (it is PR). If the lock was at EX, it would have flushed
the trans ocfs2_ci_checkpointed -> ocfs2_start_checkpoint before
downconverting (to NULL) for the request.
ocfs2_setattr must not drop inode lock ex in this code path. If it
does, takes it again before the trans, say in ocfs2_set_acl, another
cluster node can get in between, execute another setattr, overwriting
the one in progress on this node, resulting in a mode acl size combo
that is a mix of the two.
Orabug: 20189959
Signed-off-by: Tariq Saeed <tariq.x.saeed@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 01:44:31 +03:00
/*
 * inode_locked is still set only when the bail_unlock block above did
 * not release the lock, i.e. status was 0 at that point.
 */
if ( inode_locked )
2017-02-23 02:40:44 +03:00
ocfs2_inode_unlock_tracker ( inode , 1 , & oh , had_lock ) ;
2008-11-14 06:17:29 +03:00
2016-05-13 01:42:15 +03:00
brelse ( bh ) ;
2005-12-16 01:31:24 +03:00
return status ;
}
2021-01-21 16:19:43 +03:00
int ocfs2_getattr ( struct user_namespace * mnt_userns , const struct path * path ,
struct kstat * stat , u32 request_mask , unsigned int flags )
2005-12-16 01:31:24 +03:00
{
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 19:46:22 +03:00
struct inode * inode = d_inode ( path - > dentry ) ;
struct super_block * sb = path - > dentry - > d_sb ;
2005-12-16 01:31:24 +03:00
struct ocfs2_super * osb = sb - > s_fs_info ;
int err ;
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 19:46:22 +03:00
err = ocfs2_inode_revalidate ( path - > dentry ) ;
2005-12-16 01:31:24 +03:00
if ( err ) {
if ( err ! = - ENOENT )
mlog_errno ( err ) ;
goto bail ;
}
2021-01-21 16:19:30 +03:00
generic_fillattr ( & init_user_ns , inode , stat ) ;
2016-01-15 02:17:12 +03:00
/*
* If there is inline data in the inode , the inode will normally not
* have data blocks allocated ( it may have an external xattr block ) .
* Report at least one sector for such files , so tools like tar , rsync ,
* others don ' t incorrectly think the file is completely sparse .
*/
if ( unlikely ( OCFS2_I ( inode ) - > ip_dyn_features & OCFS2_INLINE_DATA_FL ) )
stat - > blocks + = ( stat - > size + 511 ) > > 9 ;
2005-12-16 01:31:24 +03:00
/* We set the blksize from the cluster size for performance */
stat - > blksize = osb - > s_clustersize ;
bail :
return err ;
}
2021-01-21 16:19:43 +03:00
/*
 * ocfs2_permission() - run generic_permission() with the inode lock held.
 *
 * Returns -ECHILD immediately when MAY_NOT_BLOCK is set in mask.  Otherwise
 * the inode lock is taken through ocfs2_inode_lock_tracker(), the check is
 * delegated to generic_permission(), and the lock is released through
 * ocfs2_inode_unlock_tracker().  A negative value from the lock tracker is
 * returned as the error without running the permission check.
 *
 * Note: mnt_userns is not referenced in this body; generic_permission() is
 * called with &init_user_ns.
 */
int ocfs2_permission ( struct user_namespace * mnt_userns , struct inode * inode ,
int mask )
2006-11-27 04:59:21 +03:00
{
2017-02-23 02:40:44 +03:00
int ret , had_lock ;
struct ocfs2_lock_holder oh ;
2006-11-27 04:59:21 +03:00
2011-06-21 03:28:19 +04:00
/* The lock is not attempted at all when the caller asked not to block. */
if ( mask & MAY_NOT_BLOCK )
2011-01-07 09:49:58 +03:00
return - ECHILD ;
2017-02-23 02:40:44 +03:00
/* < 0: error; > 0: reported below as recursive locking; 0: normal path. */
had_lock = ocfs2_inode_lock_tracker ( inode , NULL , 0 , & oh ) ;
if ( had_lock < 0 ) {
ret = had_lock ;
2006-11-27 04:59:21 +03:00
goto out ;
2017-02-23 02:40:44 +03:00
} else if ( had_lock ) {
/* See comments in ocfs2_setattr() for details.
* The call chain of this case could be :
* do_sys_open ( )
* may_open ( )
* inode_permission ( )
* ocfs2_permission ( )
* ocfs2_iop_get_acl ( )
*/
/* Logged and stack-dumped only; the permission check still runs. */
mlog ( ML_ERROR , " Another case of recursive locking: \n " ) ;
dump_stack ( ) ;
2006-11-27 04:59:21 +03:00
}
2021-01-21 16:19:24 +03:00
ret = generic_permission ( & init_user_ns , inode , mask ) ;
2006-11-27 04:59:21 +03:00
2017-02-23 02:40:44 +03:00
/* had_lock is handed back to the tracker along with the holder record. */
ocfs2_inode_unlock_tracker ( inode , 0 , & oh , had_lock ) ;
2006-11-27 04:59:21 +03:00
out :
return ret ;
}
2007-03-10 03:53:21 +03:00
/*
 * __ocfs2_write_remove_suid() - clear setuid/setgid bits and journal the
 * new mode.
 *
 * @inode: inode whose i_mode is updated in memory.
 * @bh:    buffer whose b_data is treated as the struct ocfs2_dinode for
 *         @inode; its i_mode field is rewritten.
 *
 * Inside a transaction started with OCFS2_INODE_UPDATE_CREDITS this clears
 * S_ISUID unconditionally, and clears S_ISGID only when S_IXGRP is also set.
 * The resulting mode is stored in the dinode as a little-endian 16-bit value
 * and the buffer is marked dirty in the journal.
 *
 * Returns PTR_ERR() of the handle if the transaction could not be started,
 * otherwise the result of ocfs2_journal_access_di().
 */
static int __ocfs2_write_remove_suid ( struct inode * inode ,
struct buffer_head * bh )
2005-12-16 01:31:24 +03:00
{
int ret ;
2006-10-10 05:11:45 +04:00
handle_t * handle ;
2005-12-16 01:31:24 +03:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
struct ocfs2_dinode * di ;
2011-02-22 17:14:41 +03:00
trace_ocfs2_write_remove_suid (
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
inode - > i_mode ) ;
2005-12-16 01:31:24 +03:00
2006-10-10 04:26:22 +04:00
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
2008-10-20 21:23:51 +04:00
if ( IS_ERR ( handle ) ) {
ret = PTR_ERR ( handle ) ;
2005-12-16 01:31:24 +03:00
mlog_errno ( ret ) ;
goto out ;
}
2009-02-13 03:41:25 +03:00
/* Declare write access to the dinode buffer before modifying it. */
ret = ocfs2_journal_access_di ( handle , INODE_CACHE ( inode ) , bh ,
2008-10-18 06:25:01 +04:00
OCFS2_JOURNAL_ACCESS_WRITE ) ;
2005-12-16 01:31:24 +03:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
2007-03-10 03:53:21 +03:00
/* The transaction was started, so it still has to be committed. */
goto out_trans ;
2005-12-16 01:31:24 +03:00
}
inode - > i_mode & = ~ S_ISUID ;
/* setgid is dropped only when the group-execute bit is set too. */
if ( ( inode - > i_mode & S_ISGID ) & & ( inode - > i_mode & S_IXGRP ) )
inode - > i_mode & = ~ S_ISGID ;
/* Mirror the in-memory mode into the dinode held in bh. */
di = ( struct ocfs2_dinode * ) bh - > b_data ;
di - > i_mode = cpu_to_le16 ( inode - > i_mode ) ;
2014-04-04 01:47:08 +04:00
ocfs2_update_inode_fsync_trans ( handle , inode , 0 ) ;
2005-12-16 01:31:24 +03:00
2010-03-20 00:13:52 +03:00
ocfs2_journal_dirty ( handle , bh ) ;
2007-03-10 03:53:21 +03:00
2005-12-16 01:31:24 +03:00
out_trans :
2006-10-10 03:48:10 +04:00
ocfs2_commit_trans ( osb , handle ) ;
2005-12-16 01:31:24 +03:00
out :
return ret ;
}
2007-03-10 03:53:21 +03:00
/*
 * ocfs2_write_remove_suid() - read the inode block and strip setuid/setgid.
 *
 * Reads the inode's block with ocfs2_read_inode_block() and passes it to
 * __ocfs2_write_remove_suid().  The buffer is released with brelse() on
 * every path.
 *
 * Returns the negative error from the read if it failed, otherwise the
 * result of __ocfs2_write_remove_suid().
 */
static int ocfs2_write_remove_suid ( struct inode * inode )
{
int ret ;
/* Starts as NULL, so the brelse() at "out" also runs after a failed read. */
struct buffer_head * bh = NULL ;
2008-11-14 01:49:11 +03:00
ret = ocfs2_read_inode_block ( inode , & bh ) ;
2007-03-10 03:53:21 +03:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
goto out ;
}
ret = __ocfs2_write_remove_suid ( inode , bh ) ;
out :
brelse ( bh ) ;
return ret ;
}
2007-03-10 03:43:28 +03:00
/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 *
 * For an inode flagged OCFS2_INLINE_DATA_FL the inode block is read first:
 * if start + len still fits in the inline area nothing is allocated,
 * otherwise the inline data is converted to extents before the cluster
 * walk below.
 *
 * Returns 0 on success or the first non-zero value returned by one of the
 * helpers; -ENOSPC from ocfs2_extend_allocation() is returned without being
 * logged.
 */
static int ocfs2_allocate_unwritten_extents ( struct inode * inode ,
u64 start , u64 len )
{
int ret ;
u32 cpos , phys_cpos , clusters , alloc_size ;
2007-09-08 01:46:51 +04:00
/* end is only consulted by the inline-data size check below. */
u64 end = start + len ;
struct buffer_head * di_bh = NULL ;
if ( OCFS2_I ( inode ) - > ip_dyn_features & OCFS2_INLINE_DATA_FL ) {
2008-11-14 01:49:11 +03:00
ret = ocfs2_read_inode_block ( inode , & di_bh ) ;
2007-09-08 01:46:51 +04:00
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
/*
 * Nothing to do if the requested reservation range
 * fits within the inode.
 */
/* ret is 0 here, so this path returns success. */
if ( ocfs2_size_fits_inline_data ( di_bh , end ) )
goto out ;
ret = ocfs2_convert_inline_data_to_extents ( inode , di_bh ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
}
2007-03-10 03:43:28 +03:00
/*
 * We consider both start and len to be inclusive.
 */
/*
 * cpos is the cluster containing start; clusters becomes the number of
 * clusters from cpos up to the cluster count needed for start + len bytes.
 */
cpos = start > > OCFS2_SB ( inode - > i_sb ) - > s_clustersize_bits ;
clusters = ocfs2_clusters_for_bytes ( inode - > i_sb , start + len ) ;
clusters - = cpos ;
while ( clusters ) {
ret = ocfs2_get_clusters ( inode , cpos , & phys_cpos ,
& alloc_size , NULL ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
/*
 * Hole or existing extent len can be arbitrary, so
 * cap it to our own allocation request.
 */
if ( alloc_size > clusters )
alloc_size = clusters ;
if ( phys_cpos ) {
/*
 * We already have an allocation at this
 * region so we can safely skip it.
 */
goto next ;
}
2018-06-08 03:04:38 +03:00
/* NOTE(review): the trailing 1 presumably requests unwritten extents - confirm. */
ret = ocfs2_extend_allocation ( inode , cpos , alloc_size , 1 ) ;
2007-03-10 03:43:28 +03:00
if ( ret ) {
if ( ret ! = - ENOSPC )
mlog_errno ( ret ) ;
goto out ;
}
next :
/* Advance past the run just handled, whether skipped or allocated. */
cpos + = alloc_size ;
clusters - = alloc_size ;
}
ret = 0 ;
out :
2007-09-08 01:46:51 +04:00
brelse ( di_bh ) ;
2007-03-10 03:43:28 +03:00
return ret ;
}
2007-07-04 00:34:11 +04:00
/*
 * Drop the page cache for the part of [byte_start, byte_start + byte_len)
 * that lies between cluster boundaries.
 *
 * The lower bound is byte_start passed through
 * ocfs2_align_bytes_to_clusters(); the upper bound is the end of the range
 * with the bits below s_clustersize masked off.  Pages in a partial cluster
 * at either edge are therefore left alone, which preserves them for the
 * zeroing code to write to.  If the adjusted range is empty nothing is done.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
					 u64 byte_len)
{
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t first, last;

	first = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);

	last = byte_start + byte_len;
	last &= ~(osb->s_clustersize - 1);

	/* No whole cluster inside the range: leave the page cache untouched. */
	if (first >= last)
		return;

	unmap_mapping_range(mapping, first, last - first, 0);
	/* truncate_inode_pages_range() is given the last byte, hence the - 1. */
	truncate_inode_pages_range(mapping, first, last - 1);
}
ocfs2: issue zeroout to EOF blocks
For punch holes in EOF blocks, fallocate used buffer write to zero the
EOF blocks in last cluster. But since ->writepage will ignore EOF
pages, those zeros will not be flushed.
This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by
fallocate") will zero the EOF blocks when extend the file size, but it
isn't. The problem happened on those EOF pages, before writeback, those
pages had DIRTY flag set and all buffer_head in them also had DIRTY flag
set, when writeback run by write_cache_pages(), DIRTY flag on the page
was cleared, but DIRTY flag on the buffer_head not.
When next write happened to those EOF pages, since buffer_head already
had DIRTY flag set, it would not mark page DIRTY again. That made
writeback ignore them forever. That will cause data corruption. Even
directio write can't work because it will fail when trying to drop pages
caches before direct io, as it found the buffer_head for those pages
still had DIRTY flag set, then it will fall back to buffer io mode.
To make a summary of the issue, as writeback ignores EOF pages, once any
EOF page is generated, any write to it will only go to the page cache,
it will never be flushed to disk even file size extends and that page is
not EOF page any more. The fix is to avoid zero EOF blocks with buffer
write.
The following code snippet from qemu-img could trigger the corruption.
656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11
...
660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...>
660 fallocate(11, 0, 2275868672, 327680) = 0
658 pwrite64(11, "
Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-30 00:53:41 +03:00
/*
 * Zero out the partial blocks of one cluster directly on the device.
 *
 * start: file offset where zeroing starts; it is converted to a block
 *        number with ocfs2_blocks_for_bytes() (made upper block aligned).
 * len:   length of the range; the range is trimmed to the end of the
 *        cluster containing "start" when "start + len" goes beyond it.
 *
 * Returns 0 when the range covers no blocks or when ocfs2_get_clusters()
 * reports physical cluster 0 for the range; otherwise returns the error
 * from ocfs2_get_clusters() or the result of sb_issue_zeroout().
 */
static int ocfs2_zeroout_partial_cluster(struct inode *inode,
					 u64 start, u64 len)
{
	struct super_block *sb = inode->i_sb;
	u64 zero_end = ocfs2_align_bytes_to_clusters(sb, start);
	u64 first_blk, last_blk, blk_count;
	u64 blk_in_cluster, phys_blk;
	u32 v_cluster, p_cluster, num_clusters;
	int status;

	/* Never zero beyond what the caller asked for. */
	if (start + len < zero_end)
		zero_end = start + len;

	first_blk = ocfs2_blocks_for_bytes(sb, start);
	last_blk = ocfs2_blocks_for_bytes(sb, zero_end);
	blk_count = last_blk - first_blk;
	if (blk_count == 0)
		return 0;

	v_cluster = ocfs2_bytes_to_clusters(sb, start);
	status = ocfs2_get_clusters(inode, v_cluster, &p_cluster,
				    &num_clusters, NULL);
	if (status)
		return status;

	/* No physical cluster reported for this offset: nothing to zero. */
	if (p_cluster == 0)
		return 0;

	/* Carry the block offset within the cluster over to the physical side. */
	blk_in_cluster = first_blk - ocfs2_clusters_to_blocks(sb, v_cluster);
	phys_blk = ocfs2_clusters_to_blocks(sb, p_cluster) + blk_in_cluster;

	return sb_issue_zeroout(sb, phys_blk, blk_count, GFP_NOFS);
}
2007-07-04 00:34:11 +04:00
/*
 * Zero the partial clusters at the two edges of the byte range
 * [start, start + len).
 *
 * inode: file being modified.
 * start: first byte of the range, as passed in by the user.
 * len:   length of the range in bytes.
 *
 * Any part of the range beyond i_size is zeroed straight on disk with
 * ocfs2_zeroout_partial_cluster(); the part inside i_size is zeroed with
 * ocfs2_zero_range_for_truncate() under a transaction.
 *
 * Returns 0 on success or a negative errno. When zeroing the head cluster
 * fails, the tail cluster is still attempted, but the head's error is the
 * one returned so the caller never sees success for a half-zeroed range.
 */
static int ocfs2_zero_partial_clusters(struct inode *inode,
				       u64 start, u64 len)
{
	int ret = 0;
	int err;
	u64 tmpend = 0;
	u64 end = start + len;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int csize = osb->s_clustersize;
	handle_t *handle;
	loff_t isize = i_size_read(inode);

	/*
	 * The "start" and "end" values are NOT necessarily part of
	 * the range whose allocation is being deleted. Rather, this
	 * is what the user passed in with the request. We must zero
	 * partial clusters here. There's no need to worry about
	 * physical allocation - the zeroing code knows to skip holes.
	 */
	trace_ocfs2_zero_partial_clusters(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		(unsigned long long)start, (unsigned long long)end);

	/*
	 * If both edges are on a cluster boundary then there's no
	 * zeroing required as the region is part of the allocation to
	 * be truncated.
	 */
	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
		goto out;

	/* No page cache for EOF blocks, issue zero out to disk. */
	if (end > isize) {
		/*
		 * Zero out the EOF blocks in the last cluster starting
		 * from "isize", even when "start" > "isize": "start" may
		 * not be block aligned, zeroing exactly from it would need
		 * a buffered write, and buffered writes past EOF are not
		 * supported.
		 */
		ret = ocfs2_zeroout_partial_cluster(inode, isize,
						    end - isize);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* The whole range lies past EOF; nothing else to do. */
		if (start >= isize)
			goto out;
		end = isize;
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/*
	 * If start is on a cluster boundary and end is somewhere in another
	 * cluster, we have not COWed the cluster starting at start, unless
	 * end is also within the same cluster. So, in this case, we skip
	 * this first call to ocfs2_zero_range_for_truncate() and move on
	 * to the next one.
	 */
	if ((start & (csize - 1)) != 0) {
		/*
		 * We want to get the byte offset of the end of the 1st
		 * cluster.
		 *
		 * NOTE(review): the mask is built from osb->s_clustersize
		 * rather than the unsigned "csize" local. Presumably the
		 * field is a signed int, so its complement sign-extends
		 * when combined with the 64-bit "start"; an unsigned int
		 * mask would clear the upper 32 bits. Confirm the field
		 * type before changing this expression.
		 */
		tmpend = (u64)osb->s_clustersize +
			(start & ~(osb->s_clustersize - 1));
		if (tmpend > end)
			tmpend = end;

		trace_ocfs2_zero_partial_clusters_range1(
			(unsigned long long)start,
			(unsigned long long)tmpend);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
						    tmpend);
		if (ret)
			mlog_errno(ret);
	}

	if (tmpend < end) {
		/*
		 * This may make start and end equal, but the zeroing
		 * code will skip any work in that case so there's no
		 * need to catch it up here.
		 */
		start = end & ~(osb->s_clustersize - 1);

		trace_ocfs2_zero_partial_clusters_range2(
			(unsigned long long)start, (unsigned long long)end);

		err = ocfs2_zero_range_for_truncate(inode, handle, start, end);
		if (err) {
			mlog_errno(err);
			/*
			 * Do not let this result overwrite a failure from
			 * the head cluster above: report the first error.
			 */
			if (!ret)
				ret = err;
		}
	}

	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
/*
 * Scan the in-use records of an extent list from last to first and return
 * the index of the first one found whose e_cpos is below "pos".
 *
 * Returns -1 when no record qualifies (including an empty list).
 */
static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
{
	int idx = le16_to_cpu(el->l_next_free_rec) - 1;

	/* Step backwards past every record that starts at or after pos. */
	while (idx >= 0 && le32_to_cpu(el->l_recs[idx].e_cpos) >= pos)
		idx--;

	return idx;
}
/*
 * Work out what one pass of hole punching should remove from the extent
 * record "rec". Exactly one of three cases applies, checked in this order:
 *
 *   - the record starts at or after trunc_start: remove the record from
 *     its first cluster up to *trunc_end
 *   - the record straddles trunc_start: remove only the part from
 *     trunc_start up to *trunc_end
 *   - otherwise: nothing is left to remove and *done is set to 1
 *
 * In the first two cases *trunc_end is first clipped to the end of the
 * record (skipping any hole after it), then *trunc_cpos, *trunc_len and
 * *blkno receive the start cluster, cluster count and starting block of
 * the piece to remove, and *trunc_end is moved down to that start cluster
 * for the next pass. *done is always written: 0 for the first two cases,
 * 1 for the third.
 */
static void ocfs2_calc_trunc_pos(struct inode *inode,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_extent_rec *rec,
				 u32 trunc_start, u32 *trunc_cpos,
				 u32 *trunc_len, u32 *trunc_end,
				 u64 *blkno, int *done)
{
	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
	u32 rec_end = rec_cpos + ocfs2_rec_clusters(el, rec);
	int finished = 0;

	if (rec_cpos >= trunc_start) {
		/* The record lies wholly at or after the punch start. */
		*trunc_cpos = rec_cpos;

		/* Skip any hole between the record end and *trunc_end. */
		if (*trunc_end > rec_end)
			*trunc_end = rec_end;

		*trunc_len = *trunc_end - rec_cpos;
		*blkno = le64_to_cpu(rec->e_blkno);
		*trunc_end = rec_cpos;
	} else if (rec_end > trunc_start) {
		/*
		 * The record begins before the punch start, so only part
		 * of it is removed; this is the last record to be handled.
		 */
		u32 kept_clusters = trunc_start - rec_cpos;

		*trunc_cpos = trunc_start;

		/* Skip any hole between the record end and *trunc_end. */
		if (*trunc_end > rec_end)
			*trunc_end = rec_end;

		*trunc_len = *trunc_end - trunc_start;
		*blkno = le64_to_cpu(rec->e_blkno) +
			 ocfs2_clusters_to_blocks(inode->i_sb, kept_clusters);
		*trunc_end = trunc_start;
	} else {
		/*
		 * Either the last record has already been removed or
		 * trunc_start falls inside a hole; in both situations the
		 * hole punching is complete.
		 */
		finished = 1;
	}

	*done = finished;
}
2016-11-10 01:13:11 +03:00
int ocfs2_remove_inode_range ( struct inode * inode ,
struct buffer_head * di_bh , u64 byte_start ,
u64 byte_len )
2007-07-04 00:34:11 +04:00
{
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
int ret = 0 , flags = 0 , done = 0 , i ;
u32 trunc_start , trunc_len , trunc_end , trunc_cpos , phys_cpos ;
u32 cluster_in_el ;
2007-07-04 00:34:11 +04:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
struct ocfs2_cached_dealloc_ctxt dealloc ;
2007-11-20 22:56:39 +03:00
struct address_space * mapping = inode - > i_mapping ;
2008-11-13 02:16:38 +03:00
struct ocfs2_extent_tree et ;
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
struct ocfs2_path * path = NULL ;
struct ocfs2_extent_list * el = NULL ;
struct ocfs2_extent_rec * rec = NULL ;
2010-05-11 13:54:43 +04:00
struct ocfs2_dinode * di = ( struct ocfs2_dinode * ) di_bh - > b_data ;
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
u64 blkno , refcount_loc = le64_to_cpu ( di - > i_refcount_loc ) ;
2007-07-04 00:34:11 +04:00
2009-02-13 14:54:22 +03:00
ocfs2_init_dinode_extent_tree ( & et , INODE_CACHE ( inode ) , di_bh ) ;
2007-07-04 00:34:11 +04:00
ocfs2_init_dealloc_ctxt ( & dealloc ) ;
2011-02-22 17:14:41 +03:00
trace_ocfs2_remove_inode_range (
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
( unsigned long long ) byte_start ,
( unsigned long long ) byte_len ) ;
2007-07-04 00:34:11 +04:00
if ( byte_len = = 0 )
return 0 ;
2007-09-08 01:46:51 +04:00
if ( OCFS2_I ( inode ) - > ip_dyn_features & OCFS2_INLINE_DATA_FL ) {
ret = ocfs2_truncate_inline ( inode , di_bh , byte_start ,
2007-11-20 22:56:39 +03:00
byte_start + byte_len , 0 ) ;
if ( ret ) {
2007-09-08 01:46:51 +04:00
mlog_errno ( ret ) ;
2007-11-20 22:56:39 +03:00
goto out ;
}
/*
* There ' s no need to get fancy with the page cache
* truncate of an inline - data inode . We ' re talking
* about less than a page here , which will be cached
* in the dinode buffer anyway .
*/
unmap_mapping_range ( mapping , 0 , 0 , 0 ) ;
truncate_inode_pages ( mapping , 0 ) ;
goto out ;
2007-09-08 01:46:51 +04:00
}
2010-05-11 13:54:43 +04:00
/*
* For reflinks , we may need to CoW 2 clusters which might be
* partially zero ' d later , if hole ' s start and end offset were
* within one cluster ( means is not exactly aligned to clustersize ) .
*/
2016-11-10 01:13:09 +03:00
if ( ocfs2_is_refcount_inode ( inode ) ) {
2010-05-11 13:54:43 +04:00
ret = ocfs2_cow_file_pos ( inode , di_bh , byte_start ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
ret = ocfs2_cow_file_pos ( inode , di_bh , byte_start + byte_len ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
}
2007-07-04 00:34:11 +04:00
trunc_start = ocfs2_clusters_for_bytes ( osb - > sb , byte_start ) ;
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
trunc_end = ( byte_start + byte_len ) > > osb - > s_clustersize_bits ;
cluster_in_el = trunc_end ;
2007-07-04 00:34:11 +04:00
ret = ocfs2_zero_partial_clusters ( inode , byte_start , byte_len ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
path = ocfs2_new_path_from_et ( & et ) ;
if ( ! path ) {
ret = - ENOMEM ;
mlog_errno ( ret ) ;
goto out ;
}
while ( trunc_end > trunc_start ) {
ret = ocfs2_find_path ( INODE_CACHE ( inode ) , path ,
cluster_in_el ) ;
2007-07-04 00:34:11 +04:00
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
el = path_leaf_el ( path ) ;
2007-07-04 00:34:11 +04:00
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
i = ocfs2_find_rec ( el , trunc_end ) ;
/*
* Need to go to previous extent block .
*/
if ( i < 0 ) {
if ( path - > p_tree_depth = = 0 )
break ;
2007-07-04 00:34:11 +04:00
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
ret = ocfs2_find_cpos_for_left_leaf ( inode - > i_sb ,
path ,
& cluster_in_el ) ;
2007-07-04 00:34:11 +04:00
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
/*
* We ' ve reached the leftmost extent block ,
* it ' s safe to leave .
*/
if ( cluster_in_el = = 0 )
break ;
/*
* The ' pos ' searched for previous extent block is
* always one cluster less than actual trunc_end .
*/
trunc_end = cluster_in_el + 1 ;
ocfs2_reinit_path ( path , 1 ) ;
continue ;
} else
rec = & el - > l_recs [ i ] ;
ocfs2_calc_trunc_pos ( inode , el , rec , trunc_start , & trunc_cpos ,
& trunc_len , & trunc_end , & blkno , & done ) ;
if ( done )
break ;
flags = rec - > e_flags ;
phys_cpos = ocfs2_blocks_to_clusters ( inode - > i_sb , blkno ) ;
ret = ocfs2_remove_btree_range ( inode , & et , trunc_cpos ,
phys_cpos , trunc_len , flags ,
2014-12-19 03:17:32 +03:00
& dealloc , refcount_loc , false ) ;
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
if ( ret < 0 ) {
mlog_errno ( ret ) ;
goto out ;
2007-07-04 00:34:11 +04:00
}
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hole mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 13:54:45 +04:00
cluster_in_el = trunc_end ;
ocfs2_reinit_path ( path , 1 ) ;
2007-07-04 00:34:11 +04:00
}
ocfs2_truncate_cluster_pages ( inode , byte_start , byte_len ) ;
out :
2013-09-12 01:19:59 +04:00
ocfs2_free_path ( path ) ;
2007-07-04 00:34:11 +04:00
ocfs2_schedule_truncate_log_flush ( osb , 1 ) ;
ocfs2_run_deallocs ( osb , & dealloc ) ;
return ret ;
}
2007-03-10 03:53:21 +03:00
/*
 * Common worker behind ocfs2_change_file_space() and ocfs2_fallocate():
 * reserve (allocate unwritten extents for) or unreserve (remove) the byte
 * range described by @sr.
 *
 * Parts of this function taken from xfs_change_file_space().
 *
 * @file:        file the request came in on; may be NULL (ocfs2_fallocate()
 *               passes NULL), in which case the suid/sgid check and the
 *               O_SYNC handling below are skipped.
 * @inode:       inode whose allocation is changed.
 * @f_pos:       offset added to sr->l_start when sr->l_whence is 1 (SEEK_CUR).
 * @cmd:         OCFS2_IOC_RESVSP[64] or OCFS2_IOC_UNRESVSP[64]; any other
 *               value ends in -EINVAL.
 * @sr:          range to operate on. It is modified: l_start is converted to
 *               an absolute offset and l_whence is reset to 0.
 * @change_size: when non-zero and the range ends beyond the current i_size,
 *               the inode size is extended to the end of the range.
 *
 * Locks are taken in the order inode_lock, ocfs2_rw_lock(1),
 * ocfs2_inode_lock(1) and released in reverse via the out_* labels;
 * ip_alloc_sem is held for write only around the allocation change itself.
 *
 * Returns -EROFS on a read-only filesystem, -EPERM for immutable or
 * append-only inodes, -EINVAL for a bad whence, range, length or command,
 * and otherwise the status of the last helper called (negative errno on
 * failure).
 */
2007-07-19 11:14:38 +04:00
static int __ocfs2_change_file_space ( struct file * file , struct inode * inode ,
loff_t f_pos , unsigned int cmd ,
struct ocfs2_space_resv * sr ,
int change_size )
2007-03-10 03:53:21 +03:00
{
int ret ;
s64 llen ;
ocfs2: fix data corruption by fallocate
When fallocate punches holes out of inode size, if original isize is in
the middle of last cluster, then the part from isize to the end of the
cluster will be zeroed with buffer write, at that time isize is not yet
updated to match the new size, if writeback is kicked in, it will invoke
ocfs2_writepage()->block_write_full_page() where the pages out of inode
size will be dropped. That will cause file corruption. Fix this by
zero out eof blocks when extending the inode size.
Running the following command with qemu-image 4.2.1 can get a corrupted
coverted image file easily.
qemu-img convert -p -t none -T none -f qcow2 $qcow_image \
-O qcow2 -o compat=1.1 $qcow_image.conv
The usage of fallocate in qemu is like this, it first punches holes out
of inode size, then extend the inode size.
fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0
fallocate(11, 0, 2276196352, 65536) = 0
v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html
v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T/
Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-06-05 06:01:42 +03:00
loff_t size , orig_isize ;
2007-03-10 03:53:21 +03:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
struct buffer_head * di_bh = NULL ;
handle_t * handle ;
2007-07-20 22:28:30 +04:00
unsigned long long max_off = inode - > i_sb - > s_maxbytes ;
2007-03-10 03:53:21 +03:00
if ( ocfs2_is_hard_readonly ( osb ) | | ocfs2_is_soft_readonly ( osb ) )
return - EROFS ;
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
2007-03-10 03:53:21 +03:00
/*
* This prevents concurrent writes on other nodes
*/
ret = ocfs2_rw_lock ( inode , 1 ) ;
if ( ret ) {
mlog_errno ( ret ) ;
goto out ;
}
2007-10-19 02:30:42 +04:00
ret = ocfs2_inode_lock ( inode , & di_bh , 1 ) ;
2007-03-10 03:53:21 +03:00
if ( ret ) {
mlog_errno ( ret ) ;
goto out_rw_unlock ;
}
if ( inode - > i_flags & ( S_IMMUTABLE | S_APPEND ) ) {
ret = - EPERM ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
/* Turn l_start into an absolute file offset according to l_whence. */
switch ( sr - > l_whence ) {
case 0 : /*SEEK_SET*/
break ;
case 1 : /*SEEK_CUR*/
2007-07-19 11:14:38 +04:00
sr - > l_start + = f_pos ;
2007-03-10 03:53:21 +03:00
break ;
case 2 : /*SEEK_END*/
2021-07-30 00:53:38 +03:00
sr - > l_start + = i_size_read ( inode ) ;
2007-03-10 03:53:21 +03:00
break ;
default :
ret = - EINVAL ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
sr - > l_whence = 0 ;
/*
 * llen is the distance from l_start to the last byte of the range
 * (l_len - 1 for a positive length). Both the first and the last byte
 * must lie within [0, s_maxbytes].
 */
llen = sr - > l_len > 0 ? sr - > l_len - 1 : sr - > l_len ;
if ( sr - > l_start < 0
| | sr - > l_start > max_off
| | ( sr - > l_start + llen ) < 0
| | ( sr - > l_start + llen ) > max_off ) {
ret = - EINVAL ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
2007-07-19 11:14:38 +04:00
/* End offset of the range; compared against i_size further down. */
size = sr - > l_start + sr - > l_len ;
2007-03-10 03:53:21 +03:00
2014-01-22 03:48:38 +04:00
/* All four supported commands require a strictly positive length. */
if ( cmd = = OCFS2_IOC_RESVSP | | cmd = = OCFS2_IOC_RESVSP64 | |
cmd = = OCFS2_IOC_UNRESVSP | | cmd = = OCFS2_IOC_UNRESVSP64 ) {
2007-03-10 03:53:21 +03:00
if ( sr - > l_len < = 0 ) {
ret = - EINVAL ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
}
attr: use consistent sgid stripping checks
Currently setgid stripping in file_remove_privs()'s should_remove_suid()
helper is inconsistent with other parts of the vfs. Specifically, it only
raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the
inode isn't in the caller's groups and the caller isn't privileged over the
inode although we require this already in setattr_prepare() and
setattr_copy() and so all filesystem implement this requirement implicitly
because they have to use setattr_{prepare,copy}() anyway.
But the inconsistency shows up in setgid stripping bugs for overlayfs in
xfstests (e.g., generic/673, generic/683, generic/685, generic/686,
generic/687). For example, we test whether suid and setgid stripping works
correctly when performing various write-like operations as an unprivileged
user (fallocate, reflink, write, etc.):
echo "Test 1 - qa_user, non-exec file $verb"
setup_testfile
chmod a+rws $junk_file
commit_and_check "$qa_user" "$verb" 64k 64k
The test basically creates a file with 6666 permissions. While the file has
the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a
regular filesystem like xfs what will happen is:
sys_fallocate()
-> vfs_fallocate()
-> xfs_file_fallocate()
-> file_modified()
-> __file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = ATTR_FORCE | kill;
-> notify_change()
-> setattr_copy()
In should_remove_suid() we can see that ATTR_KILL_SUID is raised
unconditionally because the file in the test has S_ISUID set.
But we also see that ATTR_KILL_SGID won't be set because while the file
is S_ISGID it is not S_IXGRP (see above) which is a condition for
ATTR_KILL_SGID being raised.
So by the time we call notify_change() we have attr->ia_valid set to
ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that
ATTR_KILL_SUID is set and does:
ia_valid = attr->ia_valid |= ATTR_MODE
attr->ia_mode = (inode->i_mode & ~S_ISUID);
which means that when we call setattr_copy() later we will definitely
update inode->i_mode. Note that attr->ia_mode still contains S_ISGID.
Now we call into the filesystem's ->setattr() inode operation which will
end up calling setattr_copy(). Since ATTR_MODE is set we will hit:
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
and since the caller in the test is neither capable nor in the group of the
inode the S_ISGID bit is stripped.
But assume the file isn't suid then ATTR_KILL_SUID won't be raised which
has the consequence that neither the setgid nor the suid bits are stripped
even though it should be stripped because the inode isn't in the caller's
groups and the caller isn't privileged over the inode.
If overlayfs is in the mix things become a bit more complicated and the bug
shows up more clearly. When e.g., ovl_setattr() is hit from
ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and
ATTR_KILL_SGID might be raised but because the check in notify_change() is
questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be
stripped the S_ISGID bit isn't removed even though it should be stripped:
sys_fallocate()
-> vfs_fallocate()
-> ovl_fallocate()
-> file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = ATTR_FORCE | kill;
-> notify_change()
-> ovl_setattr()
// TAKE ON MOUNTER'S CREDS
-> ovl_do_notify_change()
-> notify_change()
// GIVE UP MOUNTER'S CREDS
// TAKE ON MOUNTER'S CREDS
-> vfs_fallocate()
-> xfs_file_fallocate()
-> file_modified()
-> __file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = attr_force | kill;
-> notify_change()
The fix for all of this is to make file_remove_privs()'s
should_remove_suid() helper to perform the same checks as we already
require in setattr_prepare() and setattr_copy() and have notify_change()
not pointlessly requiring S_IXGRP again. It doesn't make any sense in the
first place because the caller must calculate the flags via
should_remove_suid() anyway which would raise ATTR_KILL_SGID.
While we're at it we move should_remove_suid() from inode.c to attr.c
where it belongs with the rest of the iattr helpers. Especially since it
returns ATTR_KILL_S{G,U}ID flags. We also rename it to
setattr_should_drop_suidgid() to better reflect that it indicates both
setuid and setgid bit removal and also that it returns attr flags.
Running xfstests with this doesn't report any regressions. We should really
try and use consistent checks.
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-10-17 18:06:37 +03:00
/* Only evaluated when a file was supplied; a NULL file skips this step. */
if ( file & & setattr_should_drop_suidgid ( & init_user_ns , file_inode ( file ) ) ) {
2007-03-10 03:53:21 +03:00
ret = __ocfs2_write_remove_suid ( inode , di_bh ) ;
if ( ret ) {
mlog_errno ( ret ) ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
}
/* ip_alloc_sem is held for write across the allocation change and size update. */
down_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
switch ( cmd ) {
case OCFS2_IOC_RESVSP :
case OCFS2_IOC_RESVSP64 :
/*
* This takes unsigned offsets , but the signed ones we
* pass have been checked against overflow above .
*/
ret = ocfs2_allocate_unwritten_extents ( inode , sr - > l_start ,
sr - > l_len ) ;
break ;
case OCFS2_IOC_UNRESVSP :
case OCFS2_IOC_UNRESVSP64 :
ret = ocfs2_remove_inode_range ( inode , di_bh , sr - > l_start ,
sr - > l_len ) ;
break ;
default :
ret = - EINVAL ;
}
ocfs2: fix data corruption by fallocate
When fallocate punches holes out of inode size, if original isize is in
the middle of last cluster, then the part from isize to the end of the
cluster will be zeroed with buffer write, at that time isize is not yet
updated to match the new size, if writeback is kicked in, it will invoke
ocfs2_writepage()->block_write_full_page() where the pages out of inode
size will be dropped. That will cause file corruption. Fix this by
zero out eof blocks when extending the inode size.
Running the following command with qemu-image 4.2.1 can get a corrupted
coverted image file easily.
qemu-img convert -p -t none -T none -f qcow2 $qcow_image \
-O qcow2 -o compat=1.1 $qcow_image.conv
The usage of fallocate in qemu is like this, it first punches holes out
of inode size, then extend the inode size.
fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0
fallocate(11, 0, 2276196352, 65536) = 0
v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html
v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T/
Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-06-05 06:01:42 +03:00
2021-07-30 00:53:38 +03:00
/* Sampled after the allocation change, not at function entry. */
orig_isize = i_size_read ( inode ) ;
ocfs2: fix data corruption by fallocate
When fallocate punches holes out of inode size, if original isize is in
the middle of last cluster, then the part from isize to the end of the
cluster will be zeroed with buffer write, at that time isize is not yet
updated to match the new size, if writeback is kicked in, it will invoke
ocfs2_writepage()->block_write_full_page() where the pages out of inode
size will be dropped. That will cause file corruption. Fix this by
zero out eof blocks when extending the inode size.
Running the following command with qemu-image 4.2.1 can get a corrupted
coverted image file easily.
qemu-img convert -p -t none -T none -f qcow2 $qcow_image \
-O qcow2 -o compat=1.1 $qcow_image.conv
The usage of fallocate in qemu is like this, it first punches holes out
of inode size, then extend the inode size.
fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0
fallocate(11, 0, 2276196352, 65536) = 0
v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html
v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T/
Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-06-05 06:01:42 +03:00
/* zeroout eof blocks in the cluster. */
/* i_size is only advanced once the zeroing has succeeded. */
if ( ! ret & & change_size & & orig_isize < size ) {
ret = ocfs2_zeroout_partial_cluster ( inode , orig_isize ,
size - orig_isize ) ;
if ( ! ret )
i_size_write ( inode , size ) ;
}
2007-03-10 03:53:21 +03:00
up_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
if ( ret ) {
mlog_errno ( ret ) ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
/*
* We update c / mtime for these changes
*/
handle = ocfs2_start_trans ( osb , OCFS2_INODE_UPDATE_CREDITS ) ;
if ( IS_ERR ( handle ) ) {
ret = PTR_ERR ( handle ) ;
mlog_errno ( ret ) ;
2007-10-19 02:30:42 +04:00
goto out_inode_unlock ;
2007-03-10 03:53:21 +03:00
}
2016-09-14 17:48:04 +03:00
inode - > i_ctime = inode - > i_mtime = current_time ( inode ) ;
2007-03-10 03:53:21 +03:00
/* A failure here is logged and returned, but the transaction is still committed. */
ret = ocfs2_mark_inode_dirty ( handle , inode , di_bh ) ;
if ( ret < 0 )
mlog_errno ( ret ) ;
2012-07-12 01:02:10 +04:00
/* Flag the handle as synchronous when a file was supplied and it is O_SYNC. */
if ( file & & ( file - > f_flags & O_SYNC ) )
2011-11-17 00:03:10 +04:00
handle - > h_sync = 1 ;
2007-03-10 03:53:21 +03:00
ocfs2_commit_trans ( osb , handle ) ;
2007-10-19 02:30:42 +04:00
/* Unwind in reverse order of acquisition. */
out_inode_unlock :
2007-03-10 03:53:21 +03:00
brelse ( di_bh ) ;
2007-10-19 02:30:42 +04:00
ocfs2_inode_unlock ( inode , 1 ) ;
2007-03-10 03:53:21 +03:00
out_rw_unlock :
ocfs2_rw_unlock ( inode , 1 ) ;
out :
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2007-03-10 03:53:21 +03:00
return ret ;
}
2007-07-19 11:14:38 +04:00
/*
 * Entry point for the OCFS2_IOC_RESVSP[64] / OCFS2_IOC_UNRESVSP[64] space
 * reservation commands on an open file.
 *
 * Validates that the volume supports the requested operation, that the
 * target is a regular file opened for writing, takes write access on the
 * mount and then delegates to __ocfs2_change_file_space() with the file's
 * current position and change_size == 0, so the inode size is never
 * extended through this path.
 *
 * Returns -ENOTTY when the required volume feature is absent, -EINVAL for a
 * non-regular file, -EBADF when the file is not open for writing, the
 * mnt_want_write_file() error if write access cannot be obtained, and
 * otherwise the result of __ocfs2_change_file_space().
 */
int ocfs2_change_file_space ( struct file * file , unsigned int cmd ,
struct ocfs2_space_resv * sr )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2009-01-08 05:09:08 +03:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
2012-06-12 18:20:40 +04:00
int ret ;
2007-07-19 11:14:38 +04:00
/* Reserving needs unwritten-extent support; unreserving needs sparse allocation. */
if ( ( cmd = = OCFS2_IOC_RESVSP | | cmd = = OCFS2_IOC_RESVSP64 ) & &
! ocfs2_writes_unwritten_extents ( osb ) )
return - ENOTTY ;
else if ( ( cmd = = OCFS2_IOC_UNRESVSP | | cmd = = OCFS2_IOC_UNRESVSP64 ) & &
! ocfs2_sparse_alloc ( osb ) )
return - ENOTTY ;
if ( ! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
if ( ! ( file - > f_mode & FMODE_WRITE ) )
return - EBADF ;
2012-06-12 18:20:40 +04:00
/* Hold write access on the mount for the duration of the change. */
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
ret = __ocfs2_change_file_space ( file , inode , file - > f_pos , cmd , sr , 0 ) ;
mnt_drop_write_file ( file ) ;
return ret ;
2007-07-19 11:14:38 +04:00
}
2011-01-14 15:07:43 +03:00
/*
 * fallocate handler: translates (mode, offset, len) into an
 * ocfs2_space_resv request and hands it to __ocfs2_change_file_space().
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit, or a volume without unwritten-extent support, yields
 * -EOPNOTSUPP. FALLOC_FL_PUNCH_HOLE selects OCFS2_IOC_UNRESVSP64, otherwise
 * OCFS2_IOC_RESVSP64 is used. FALLOC_FL_KEEP_SIZE clears change_size so the
 * inode size is left alone.
 *
 * Note that NULL is passed as the file argument, so the parts of
 * __ocfs2_change_file_space() that are conditional on a file being supplied
 * do not run for fallocate requests.
 */
static long ocfs2_fallocate ( struct file * file , int mode , loff_t offset ,
2007-07-19 11:14:38 +04:00
loff_t len )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2007-07-19 11:14:38 +04:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
struct ocfs2_space_resv sr ;
int change_size = 1 ;
2010-11-18 04:46:17 +03:00
int cmd = OCFS2_IOC_RESVSP64 ;
2007-07-19 11:14:38 +04:00
2011-01-14 15:07:30 +03:00
if ( mode & ~ ( FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE ) )
return - EOPNOTSUPP ;
2007-07-19 11:14:38 +04:00
if ( ! ocfs2_writes_unwritten_extents ( osb ) )
return - EOPNOTSUPP ;
if ( mode & FALLOC_FL_KEEP_SIZE )
change_size = 0 ;
2010-11-18 04:46:17 +03:00
if ( mode & FALLOC_FL_PUNCH_HOLE )
cmd = OCFS2_IOC_UNRESVSP64 ;
2007-07-19 11:14:38 +04:00
/* l_whence 0 means l_start is already an absolute offset. */
sr . l_whence = 0 ;
sr . l_start = ( s64 ) offset ;
sr . l_len = ( s64 ) len ;
2010-11-18 04:46:17 +03:00
return __ocfs2_change_file_space ( NULL , inode , offset , cmd , & sr ,
change_size ) ;
2007-07-19 11:14:38 +04:00
}
2009-08-25 04:02:48 +04:00
/*
 * Check whether any cluster backing the byte range [pos, pos + count) of
 * @inode belongs to a refcounted extent.
 *
 * Returns 1 if a mapped extent (non-zero physical cluster) flagged
 * OCFS2_EXT_REFCOUNTED is found in the range, 0 if none is found, and a
 * negative errno if ocfs2_get_clusters() fails. Also returns 0 right away
 * when the volume has no refcount tree support, the inode is not a refcount
 * inode, or the inode stores its data inline.
 */
int ocfs2_check_range_for_refcount ( struct inode * inode , loff_t pos ,
size_t count )
{
int ret = 0 ;
unsigned int extent_flags ;
u32 cpos , clusters , extent_len , phys_cpos ;
struct super_block * sb = inode - > i_sb ;
if ( ! ocfs2_refcount_tree ( OCFS2_SB ( inode - > i_sb ) ) | |
2016-11-10 01:13:09 +03:00
! ocfs2_is_refcount_inode ( inode ) | |
2009-10-15 07:10:49 +04:00
OCFS2_I ( inode ) - > ip_dyn_features & OCFS2_INLINE_DATA_FL )
2009-08-25 04:02:48 +04:00
return 0 ;
/* First cluster of the range and the number of clusters it touches. */
cpos = pos > > OCFS2_SB ( sb ) - > s_clustersize_bits ;
clusters = ocfs2_clusters_for_bytes ( sb , pos + count ) - cpos ;
/* Walk the range one extent lookup at a time. */
while ( clusters ) {
ret = ocfs2_get_clusters ( inode , cpos , & phys_cpos , & extent_len ,
& extent_flags ) ;
if ( ret < 0 ) {
mlog_errno ( ret ) ;
goto out ;
}
if ( phys_cpos & & ( extent_flags & OCFS2_EXT_REFCOUNTED ) ) {
ret = 1 ;
break ;
}
/* Clamp so the remaining-cluster counter cannot underflow. */
if ( extent_len > clusters )
extent_len = clusters ;
clusters - = extent_len ;
cpos + = extent_len ;
}
out :
return ret ;
}
2011-06-23 01:23:38 +04:00
/*
 * Tell whether an I/O of @count bytes starting at @pos is misaligned with
 * respect to the filesystem block size.
 *
 * Returns 1 when either the start offset or the end offset (pos + count)
 * has any bit set below the block size, 0 when both are block aligned.
 */
static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
{
	int blockmask = inode->i_sb->s_blocksize - 1;
	loff_t io_end = pos + count;

	/* A low bit set in either offset shows up in their bitwise OR. */
	return ((pos | io_end) & blockmask) ? 1 : 0;
}
2019-11-06 08:16:34 +03:00
/*
 * Take the inode lock and then ip_alloc_sem, in that order.
 *
 * @inode:      inode to lock.
 * @di_bh:      passed through to the inode lock call; reset to NULL (after
 *              brelse) if the semaphore cannot be taken.
 * @meta_level: level handed to ocfs2_inode_lock()/ocfs2_try_inode_lock().
 * @write_sem:  non-zero to take ip_alloc_sem for write, zero for read.
 * @wait:       non-zero to block on both locks; zero to use the try
 *              variants of both.
 *
 * Returns a negative errno on failure: whatever the inode lock call
 * returned, or -EAGAIN when the semaphore trylock fails (in which case the
 * inode lock taken here is dropped again). On success the value is
 * non-negative but not necessarily 0: on the non-blocking path it is the
 * non-zero trylock result, so callers should test for ret < 0.
 */
static int ocfs2_inode_lock_for_extent_tree ( struct inode * inode ,
struct buffer_head * * di_bh ,
int meta_level ,
int write_sem ,
int wait )
2009-08-25 04:02:48 +04:00
{
2019-11-06 08:16:34 +03:00
int ret = 0 ;
2009-08-25 04:02:48 +04:00
2019-11-06 08:16:34 +03:00
if ( wait )
2020-02-04 04:33:45 +03:00
ret = ocfs2_inode_lock ( inode , di_bh , meta_level ) ;
2019-11-06 08:16:34 +03:00
else
2020-02-04 04:33:45 +03:00
ret = ocfs2_try_inode_lock ( inode , di_bh , meta_level ) ;
2019-11-06 08:16:34 +03:00
if ( ret < 0 )
2009-08-25 04:02:48 +04:00
goto out ;
2019-11-06 08:16:34 +03:00
if ( wait ) {
if ( write_sem )
down_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
else
down_read ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
} else {
if ( write_sem )
ret = down_write_trylock ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
else
ret = down_read_trylock ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
/* A zero trylock result means the semaphore was not acquired. */
if ( ! ret ) {
ret = - EAGAIN ;
goto out_unlock ;
}
2009-08-25 04:02:48 +04:00
}
2019-11-06 08:16:34 +03:00
return ret ;
2009-08-25 04:02:48 +04:00
2019-11-06 08:16:34 +03:00
/* Semaphore not taken: undo the inode lock acquired above. */
out_unlock :
brelse ( * di_bh ) ;
2020-02-04 04:33:45 +03:00
* di_bh = NULL ;
2019-11-06 08:16:34 +03:00
ocfs2_inode_unlock ( inode , meta_level ) ;
2009-08-25 04:02:48 +04:00
out :
return ret ;
}
2019-11-06 08:16:34 +03:00
static void ocfs2_inode_unlock_for_extent_tree ( struct inode * inode ,
struct buffer_head * * di_bh ,
int meta_level ,
int write_sem )
{
if ( write_sem )
up_write ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
else
up_read ( & OCFS2_I ( inode ) - > ip_alloc_sem ) ;
brelse ( * di_bh ) ;
* di_bh = NULL ;
if ( meta_level > = 0 )
ocfs2_inode_unlock ( inode , meta_level ) ;
}
2010-08-12 06:27:14 +04:00
static int ocfs2_prepare_inode_for_write ( struct file * file ,
2018-02-01 03:15:25 +03:00
loff_t pos , size_t count , int wait )
2005-12-16 01:31:24 +03:00
{
2018-02-01 03:15:25 +03:00
int ret = 0 , meta_level = 0 , overwrite_io = 0 ;
2019-11-06 08:16:34 +03:00
int write_sem = 0 ;
2010-08-12 06:27:14 +04:00
struct dentry * dentry = file - > f_path . dentry ;
2015-03-18 01:25:59 +03:00
struct inode * inode = d_inode ( dentry ) ;
2018-02-01 03:15:25 +03:00
struct buffer_head * di_bh = NULL ;
2019-11-06 08:16:34 +03:00
u32 cpos ;
u32 clusters ;
2005-12-16 01:31:24 +03:00
2010-01-26 03:57:38 +03:00
/*
2007-08-29 04:13:23 +04:00
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here .
2005-12-16 01:31:24 +03:00
*/
for ( ; ; ) {
2019-11-06 08:16:34 +03:00
ret = ocfs2_inode_lock_for_extent_tree ( inode ,
& di_bh ,
meta_level ,
write_sem ,
wait ) ;
2005-12-16 01:31:24 +03:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2005-12-16 01:31:24 +03:00
goto out ;
}
2018-02-01 03:15:25 +03:00
/*
* Check if IO will overwrite allocated blocks in case
* IOCB_NOWAIT flag is set .
*/
if ( ! wait & & ! overwrite_io ) {
overwrite_io = 1 ;
ret = ocfs2_overwrite_io ( inode , di_bh , pos , count ) ;
if ( ret < 0 ) {
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
goto out_unlock ;
}
}
2005-12-16 01:31:24 +03:00
/* Clear suid / sgid if necessary. We do this here
* instead of later in the write path because
* remove_suid ( ) calls - > setattr without any hint that
* we may have already done our cluster locking . Since
* ocfs2_setattr ( ) * must * take cluster locks to
2011-11-29 08:31:00 +04:00
* proceed , this will lead us to recursively lock the
2005-12-16 01:31:24 +03:00
* inode . There ' s also the dinode i_size state which
* can be lost via setattr during extending writes ( we
* set inode - > i_size at the end of a write . */
attr: use consistent sgid stripping checks
Currently setgid stripping in file_remove_privs()'s should_remove_suid()
helper is inconsistent with other parts of the vfs. Specifically, it only
raises ATTR_KILL_SGID if the inode is S_ISGID and S_IXGRP but not if the
inode isn't in the caller's groups and the caller isn't privileged over the
inode although we require this already in setattr_prepare() and
setattr_copy() and so all filesystem implement this requirement implicitly
because they have to use setattr_{prepare,copy}() anyway.
But the inconsistency shows up in setgid stripping bugs for overlayfs in
xfstests (e.g., generic/673, generic/683, generic/685, generic/686,
generic/687). For example, we test whether suid and setgid stripping works
correctly when performing various write-like operations as an unprivileged
user (fallocate, reflink, write, etc.):
echo "Test 1 - qa_user, non-exec file $verb"
setup_testfile
chmod a+rws $junk_file
commit_and_check "$qa_user" "$verb" 64k 64k
The test basically creates a file with 6666 permissions. While the file has
the S_ISUID and S_ISGID bits set it does not have the S_IXGRP set. On a
regular filesystem like xfs what will happen is:
sys_fallocate()
-> vfs_fallocate()
-> xfs_file_fallocate()
-> file_modified()
-> __file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = ATTR_FORCE | kill;
-> notify_change()
-> setattr_copy()
In should_remove_suid() we can see that ATTR_KILL_SUID is raised
unconditionally because the file in the test has S_ISUID set.
But we also see that ATTR_KILL_SGID won't be set because while the file
is S_ISGID it is not S_IXGRP (see above) which is a condition for
ATTR_KILL_SGID being raised.
So by the time we call notify_change() we have attr->ia_valid set to
ATTR_KILL_SUID | ATTR_FORCE. Now notify_change() sees that
ATTR_KILL_SUID is set and does:
ia_valid = attr->ia_valid |= ATTR_MODE
attr->ia_mode = (inode->i_mode & ~S_ISUID);
which means that when we call setattr_copy() later we will definitely
update inode->i_mode. Note that attr->ia_mode still contains S_ISGID.
Now we call into the filesystem's ->setattr() inode operation which will
end up calling setattr_copy(). Since ATTR_MODE is set we will hit:
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
and since the caller in the test is neither capable nor in the group of the
inode the S_ISGID bit is stripped.
But assume the file isn't suid then ATTR_KILL_SUID won't be raised which
has the consequence that neither the setgid nor the suid bits are stripped
even though it should be stripped because the inode isn't in the caller's
groups and the caller isn't privileged over the inode.
If overlayfs is in the mix things become a bit more complicated and the bug
shows up more clearly. When e.g., ovl_setattr() is hit from
ovl_fallocate()'s call to file_remove_privs() then ATTR_KILL_SUID and
ATTR_KILL_SGID might be raised but because the check in notify_change() is
questioning the ATTR_KILL_SGID flag again by requiring S_IXGRP for it to be
stripped the S_ISGID bit isn't removed even though it should be stripped:
sys_fallocate()
-> vfs_fallocate()
-> ovl_fallocate()
-> file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = ATTR_FORCE | kill;
-> notify_change()
-> ovl_setattr()
// TAKE ON MOUNTER'S CREDS
-> ovl_do_notify_change()
-> notify_change()
// GIVE UP MOUNTER'S CREDS
// TAKE ON MOUNTER'S CREDS
-> vfs_fallocate()
-> xfs_file_fallocate()
-> file_modified()
-> __file_remove_privs()
-> dentry_needs_remove_privs()
-> should_remove_suid()
-> __remove_privs()
newattrs.ia_valid = attr_force | kill;
-> notify_change()
The fix for all of this is to make file_remove_privs()'s
should_remove_suid() helper to perform the same checks as we already
require in setattr_prepare() and setattr_copy() and have notify_change()
not pointlessly requiring S_IXGRP again. It doesn't make any sense in the
first place because the caller must calculate the flags via
should_remove_suid() anyway which would raise ATTR_KILL_SGID.
While we're at it we move should_remove_suid() from inode.c to attr.c
where it belongs with the rest of the iattr helpers. Especially since it
returns ATTR_KILL_S{G,U}ID flags. We also rename it to
setattr_should_drop_suidgid() to better reflect that it indicates both
setuid and setgid bit removal and also that it returns attr flags.
Running xfstests with this doesn't report any regressions. We should really
try and use consistent checks.
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
2022-10-17 18:06:37 +03:00
if ( setattr_should_drop_suidgid ( & init_user_ns , inode ) ) {
2005-12-16 01:31:24 +03:00
if ( meta_level = = 0 ) {
2019-11-06 08:16:34 +03:00
ocfs2_inode_unlock_for_extent_tree ( inode ,
& di_bh ,
meta_level ,
write_sem ) ;
2005-12-16 01:31:24 +03:00
meta_level = 1 ;
continue ;
}
ret = ocfs2_write_remove_suid ( inode ) ;
if ( ret < 0 ) {
mlog_errno ( ret ) ;
2006-10-18 05:29:52 +04:00
goto out_unlock ;
2005-12-16 01:31:24 +03:00
}
}
2015-04-09 18:14:45 +03:00
ret = ocfs2_check_range_for_refcount ( inode , pos , count ) ;
2009-08-25 04:02:48 +04:00
if ( ret = = 1 ) {
2019-11-06 08:16:34 +03:00
ocfs2_inode_unlock_for_extent_tree ( inode ,
& di_bh ,
meta_level ,
write_sem ) ;
2020-02-04 04:33:45 +03:00
meta_level = 1 ;
write_sem = 1 ;
2019-11-06 08:16:34 +03:00
ret = ocfs2_inode_lock_for_extent_tree ( inode ,
& di_bh ,
meta_level ,
2020-02-04 04:33:45 +03:00
write_sem ,
2019-11-06 08:16:34 +03:00
wait ) ;
if ( ret < 0 ) {
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
goto out ;
}
cpos = pos > > OCFS2_SB ( inode - > i_sb ) - > s_clustersize_bits ;
clusters =
ocfs2_clusters_for_bytes ( inode - > i_sb , pos + count ) - cpos ;
ret = ocfs2_refcount_cow ( inode , di_bh , cpos , clusters , UINT_MAX ) ;
2009-08-25 04:02:48 +04:00
}
if ( ret < 0 ) {
2019-11-06 08:16:34 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2009-08-25 04:02:48 +04:00
goto out_unlock ;
}
2005-12-16 01:31:24 +03:00
break ;
}
2006-10-18 05:29:52 +04:00
out_unlock :
2011-02-22 17:14:41 +03:00
trace_ocfs2_prepare_inode_for_write ( OCFS2_I ( inode ) - > ip_blkno ,
2018-02-01 03:15:25 +03:00
pos , count , wait ) ;
2019-11-06 08:16:34 +03:00
ocfs2_inode_unlock_for_extent_tree ( inode ,
& di_bh ,
meta_level ,
write_sem ) ;
2006-10-18 05:29:52 +04:00
out :
return ret ;
}
2014-04-03 22:25:22 +04:00
static ssize_t ocfs2_file_write_iter ( struct kiocb * iocb ,
struct iov_iter * from )
2006-10-18 05:29:52 +04:00
{
2018-02-01 03:15:25 +03:00
int rw_level ;
2007-02-10 07:24:12 +03:00
ssize_t written = 0 ;
2015-04-09 19:55:47 +03:00
ssize_t ret ;
2016-03-26 00:21:12 +03:00
size_t count = iov_iter_count ( from ) ;
2007-02-10 07:24:12 +03:00
struct file * file = iocb - > ki_filp ;
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2007-10-19 01:14:45 +04:00
struct ocfs2_super * osb = OCFS2_SB ( inode - > i_sb ) ;
2010-10-11 12:46:39 +04:00
int full_coherency = ! ( osb - > s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED ) ;
2016-03-26 00:21:15 +03:00
void * saved_ki_complete = NULL ;
2015-09-05 01:43:40 +03:00
int append_write = ( ( iocb - > ki_pos + count ) > =
i_size_read ( inode ) ? 1 : 0 ) ;
2018-02-01 03:15:25 +03:00
int direct_io = iocb - > ki_flags & IOCB_DIRECT ? 1 : 0 ;
int nowait = iocb - > ki_flags & IOCB_NOWAIT ? 1 : 0 ;
2007-02-10 07:24:12 +03:00
2018-04-06 02:18:45 +03:00
trace_ocfs2_file_write_iter ( inode , file , file - > f_path . dentry ,
2011-02-22 17:14:41 +03:00
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
file - > f_path . dentry - > d_name . len ,
file - > f_path . dentry - > d_name . name ,
2014-04-03 22:25:22 +04:00
( unsigned int ) from - > nr_segs ) ; /* GRRRRR */
2006-10-18 05:29:52 +04:00
2018-02-01 03:15:25 +03:00
if ( ! direct_io & & nowait )
return - EOPNOTSUPP ;
2015-02-11 21:56:46 +03:00
if ( count = = 0 )
2006-10-18 05:29:52 +04:00
return 0 ;
2018-02-01 03:15:25 +03:00
if ( nowait ) {
if ( ! inode_trylock ( inode ) )
return - EAGAIN ;
} else
inode_lock ( inode ) ;
2007-02-10 07:24:12 +03:00
2010-10-11 12:46:39 +04:00
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option " coherency=buffered " .
2015-09-05 01:43:40 +03:00
* For append write , we must take rw EX .
2010-10-11 12:46:39 +04:00
*/
2015-09-05 01:43:40 +03:00
rw_level = ( ! direct_io | | full_coherency | | append_write ) ;
2010-10-11 12:46:39 +04:00
2018-02-01 03:15:25 +03:00
if ( nowait )
ret = ocfs2_try_rw_lock ( inode , rw_level ) ;
else
ret = ocfs2_rw_lock ( inode , rw_level ) ;
2006-10-18 05:29:52 +04:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2015-06-25 02:55:15 +03:00
goto out_mutex ;
2006-10-18 05:29:52 +04:00
}
2010-10-11 12:46:39 +04:00
/*
* O_DIRECT writes with " coherency=full " need to take EX cluster
* inode_lock to guarantee coherency .
*/
if ( direct_io & & full_coherency ) {
/*
* We need to take and drop the inode lock to force
* other nodes to drop their caches . Buffered I / O
* already does this in write_begin ( ) .
*/
2018-02-01 03:15:25 +03:00
if ( nowait )
ret = ocfs2_try_inode_lock ( inode , NULL , 1 ) ;
else
ret = ocfs2_inode_lock ( inode , NULL , 1 ) ;
2010-10-11 12:46:39 +04:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2013-05-25 02:55:12 +04:00
goto out ;
2010-10-11 12:46:39 +04:00
}
ocfs2_inode_unlock ( inode , 1 ) ;
}
2015-04-09 19:55:47 +03:00
ret = generic_write_checks ( iocb , from ) ;
if ( ret < = 0 ) {
if ( ret )
mlog_errno ( ret ) ;
2015-04-09 18:14:45 +03:00
goto out ;
}
2015-04-09 19:55:47 +03:00
count = ret ;
2015-04-09 18:14:45 +03:00
2018-02-01 03:15:25 +03:00
ret = ocfs2_prepare_inode_for_write ( file , iocb - > ki_pos , count , ! nowait ) ;
2006-10-18 05:29:52 +04:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2006-10-18 05:29:52 +04:00
goto out ;
}
2005-12-16 01:31:24 +03:00
2016-03-26 00:21:15 +03:00
if ( direct_io & & ! is_sync_kiocb ( iocb ) & &
ocfs2_is_io_unaligned ( inode , count , iocb - > ki_pos ) ) {
2011-06-23 01:23:38 +04:00
/*
2016-03-26 00:21:15 +03:00
* Make it a sync io if it ' s an unaligned aio .
2011-06-23 01:23:38 +04:00
*/
2016-03-26 00:21:15 +03:00
saved_ki_complete = xchg ( & iocb - > ki_complete , NULL ) ;
2011-06-23 01:23:38 +04:00
}
2005-12-16 01:31:24 +03:00
/* communicate with ocfs2_dio_end_io */
2007-04-17 04:28:51 +04:00
ocfs2_iocb_set_rw_locked ( iocb , rw_level ) ;
2005-12-16 01:31:24 +03:00
2015-04-09 21:01:33 +03:00
written = __generic_file_write_iter ( iocb , from ) ;
2005-12-16 01:31:24 +03:00
/* buffered aio wouldn't have proper lock coverage today */
2018-11-03 01:48:11 +03:00
BUG_ON ( written = = - EIOCBQUEUED & & ! direct_io ) ;
2005-12-16 01:31:24 +03:00
2015-09-05 01:42:36 +03:00
/*
* deep in g_f_a_w_n ( ) - > ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
* it can unlock our rw lock .
* Unfortunately there are error cases which call end_io and others
* that don ' t . so we don ' t have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
* error has already done it .
*/
if ( ( written = = - EIOCBQUEUED ) | | ( ! ocfs2_iocb_is_rw_locked ( iocb ) ) ) {
rw_level = - 1 ;
}
2015-04-09 00:00:32 +03:00
if ( unlikely ( written < = 0 ) )
2016-03-26 00:21:15 +03:00
goto out ;
2015-04-09 00:00:32 +03:00
2015-04-09 21:01:33 +03:00
if ( ( ( file - > f_flags & O_DSYNC ) & & ! direct_io ) | |
2016-03-26 00:21:12 +03:00
IS_SYNC ( inode ) ) {
2015-04-09 00:00:32 +03:00
ret = filemap_fdatawrite_range ( file - > f_mapping ,
iocb - > ki_pos - written ,
iocb - > ki_pos - 1 ) ;
2009-08-17 20:50:08 +04:00
if ( ret < 0 )
written = ret ;
2014-12-11 02:41:53 +03:00
if ( ! ret ) {
2008-09-04 07:03:41 +04:00
ret = jbd2_journal_force_commit ( osb - > journal - > j_journal ) ;
2007-10-19 01:14:45 +04:00
if ( ret < 0 )
written = ret ;
}
2009-08-17 20:50:08 +04:00
if ( ! ret )
2015-04-09 00:00:32 +03:00
ret = filemap_fdatawait_range ( file - > f_mapping ,
iocb - > ki_pos - written ,
iocb - > ki_pos - 1 ) ;
2007-10-19 01:14:45 +04:00
}
2005-12-16 01:31:24 +03:00
out :
2016-03-26 00:21:15 +03:00
if ( saved_ki_complete )
xchg ( & iocb - > ki_complete , saved_ki_complete ) ;
2007-02-10 07:24:12 +03:00
if ( rw_level ! = - 1 )
ocfs2_rw_unlock ( inode , rw_level ) ;
2015-06-25 02:55:15 +03:00
out_mutex :
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2005-12-16 01:31:24 +03:00
2009-07-10 09:26:04 +04:00
if ( written )
ret = written ;
return ret ;
2005-12-16 01:31:24 +03:00
}
2014-04-02 22:44:18 +04:00
static ssize_t ocfs2_file_read_iter ( struct kiocb * iocb ,
struct iov_iter * to )
2005-12-16 01:31:24 +03:00
{
2015-06-25 02:55:15 +03:00
int ret = 0 , rw_level = - 1 , lock_level = 0 ;
2005-12-16 01:31:24 +03:00
struct file * filp = iocb - > ki_filp ;
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( filp ) ;
2018-02-01 03:15:25 +03:00
int direct_io = iocb - > ki_flags & IOCB_DIRECT ? 1 : 0 ;
int nowait = iocb - > ki_flags & IOCB_NOWAIT ? 1 : 0 ;
2005-12-16 01:31:24 +03:00
2018-04-06 02:18:45 +03:00
trace_ocfs2_file_read_iter ( inode , filp , filp - > f_path . dentry ,
2011-02-22 17:14:41 +03:00
( unsigned long long ) OCFS2_I ( inode ) - > ip_blkno ,
filp - > f_path . dentry - > d_name . len ,
2014-04-02 22:44:18 +04:00
filp - > f_path . dentry - > d_name . name ,
to - > nr_segs ) ; /* GRRRRR */
2011-02-22 17:14:41 +03:00
2005-12-16 01:31:24 +03:00
if ( ! inode ) {
ret = - EINVAL ;
mlog_errno ( ret ) ;
goto bail ;
}
2018-02-01 03:15:25 +03:00
if ( ! direct_io & & nowait )
return - EOPNOTSUPP ;
2010-01-26 03:57:38 +03:00
/*
2022-04-29 18:12:16 +03:00
* buffered reads protect themselves in - > read_folio ( ) . O_DIRECT reads
2005-12-16 01:31:24 +03:00
* need locks to protect pending reads from racing with truncate .
*/
2018-02-01 03:15:25 +03:00
if ( direct_io ) {
if ( nowait )
ret = ocfs2_try_rw_lock ( inode , 0 ) ;
else
ret = ocfs2_rw_lock ( inode , 0 ) ;
2005-12-16 01:31:24 +03:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2005-12-16 01:31:24 +03:00
goto bail ;
}
rw_level = 0 ;
/* communicate with ocfs2_dio_end_io */
2007-04-17 04:28:51 +04:00
ocfs2_iocb_set_rw_locked ( iocb , rw_level ) ;
2005-12-16 01:31:24 +03:00
}
2006-05-06 06:04:35 +04:00
/*
* We ' re fine letting folks race truncates and extending
* writes with read across the cluster , just like they can
* locally . Hence no rw_lock during read .
2010-01-26 03:57:38 +03:00
*
2006-05-06 06:04:35 +04:00
* Take and drop the meta data lock to update inode fields
* like i_size . This allows the checks down below
2018-04-06 02:18:45 +03:00
* generic_file_read_iter ( ) a chance of actually working .
2006-05-06 06:04:35 +04:00
*/
2018-02-01 03:15:25 +03:00
ret = ocfs2_inode_lock_atime ( inode , filp - > f_path . mnt , & lock_level ,
! nowait ) ;
2006-05-06 06:04:35 +04:00
if ( ret < 0 ) {
2018-02-01 03:15:25 +03:00
if ( ret ! = - EAGAIN )
mlog_errno ( ret ) ;
2006-05-06 06:04:35 +04:00
goto bail ;
}
2007-10-19 02:30:42 +04:00
ocfs2_inode_unlock ( inode , lock_level ) ;
2006-05-06 06:04:35 +04:00
2014-04-02 22:44:18 +04:00
ret = generic_file_read_iter ( iocb , to ) ;
2018-04-06 02:18:45 +03:00
trace_generic_file_read_iter_ret ( ret ) ;
2005-12-16 01:31:24 +03:00
/* buffered aio wouldn't have proper lock coverage today */
2018-11-03 01:48:11 +03:00
BUG_ON ( ret = = - EIOCBQUEUED & & ! direct_io ) ;
2005-12-16 01:31:24 +03:00
2014-04-03 22:25:22 +04:00
/* see ocfs2_file_write_iter */
2005-12-16 01:31:24 +03:00
if ( ret = = - EIOCBQUEUED | | ! ocfs2_iocb_is_rw_locked ( iocb ) ) {
rw_level = - 1 ;
}
bail :
2010-01-26 03:57:38 +03:00
if ( rw_level ! = - 1 )
2005-12-16 01:31:24 +03:00
ocfs2_rw_unlock ( inode , rw_level ) ;
return ret ;
}
2011-07-26 01:58:15 +04:00
/* Refer to generic_file_llseek_unlocked() */
2012-12-18 03:59:39 +04:00
/*
 * ->llseek for ocfs2 files.
 *
 * Resolves @offset according to @whence under the VFS inode lock and
 * applies it with vfs_setpos(), bounded by the superblock's s_maxbytes.
 * SEEK_END additionally takes the cluster inode lock while reading
 * i_size; SEEK_DATA/SEEK_HOLE are delegated to
 * ocfs2_seek_data_hole_offset().  SEEK_CUR with a zero offset returns
 * the current position without calling vfs_setpos().
 *
 * Returns the resulting file position, or a non-zero error code
 * (-EINVAL for an unknown @whence).
 */
static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	int err = 0;

	inode_lock(inode);

	switch (whence) {
	case SEEK_SET:
		break;

	case SEEK_CUR:
		if (offset == 0) {
			/* Pure position query: nothing to update. */
			offset = file->f_pos;
			goto unlock;
		}
		offset += file->f_pos;
		break;

	case SEEK_END:
		/*
		 * SEEK_END requires the OCFS2 inode lock for the file
		 * because it references the file's size.
		 */
		err = ocfs2_inode_lock(inode, NULL, 0);
		if (err < 0) {
			mlog_errno(err);
			goto unlock;
		}
		offset += i_size_read(inode);
		ocfs2_inode_unlock(inode, 0);
		break;

	case SEEK_DATA:
	case SEEK_HOLE:
		err = ocfs2_seek_data_hole_offset(file, &offset, whence);
		if (err)
			goto unlock;
		break;

	default:
		err = -EINVAL;
		goto unlock;
	}

	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);

unlock:
	inode_unlock(inode);

	return err ? err : offset;
}
2018-10-30 02:41:49 +03:00
/*
 * ->remap_file_range for ocfs2 files.
 *
 * Remaps @len bytes at @pos_in of @file_in to @pos_out of @file_out.
 * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in
 * @remap_flags.
 *
 * Returns the positive result of ocfs2_reflink_remap_blocks() when there
 * is one, otherwise 0 or a negative errno:
 *   -EINVAL      unsupported flag, or either inode is a system file
 *   -EOPNOTSUPP  ocfs2_refcount_tree() is false for this superblock
 *   -EROFS       the filesystem is hard or soft read-only
 */
static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
				     struct file *file_out, loff_t pos_out,
				     loff_t len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
	struct buffer_head *in_bh = NULL;
	struct buffer_head *out_bh = NULL;
	bool same_inode = (inode_in == inode_out);
	loff_t remapped = 0;
	ssize_t ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;
	if (!ocfs2_refcount_tree(osb))
		return -EOPNOTSUPP;
	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	/* Lock both files against IO */
	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
	if (ret)
		return ret;

	/* Check file eligibility and prepare for block sharing. */
	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					    &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	/* Lock out changes to the allocation maps and remap. */
	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
	if (!same_inode)
		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
				  SINGLE_DEPTH_NESTING);

	/* Zap any page cache for the destination file's range. */
	truncate_inode_pages_range(&inode_out->i_data,
				   round_down(pos_out, PAGE_SIZE),
				   round_up(pos_out + len, PAGE_SIZE) - 1);

	remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
					      inode_out, out_bh, pos_out, len);

	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
	if (!same_inode)
		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);

	if (remapped < 0) {
		ret = remapped;
		mlog_errno(ret);
		goto out_unlock;
	}

	/*
	 * Empty the extent map so that we may get the right extent
	 * record from the disk.
	 */
	ocfs2_extent_map_trunc(inode_in, 0);
	ocfs2_extent_map_trunc(inode_out, 0);

	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
	if (ret)
		mlog_errno(ret);

out_unlock:
	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);

	/* A positive remap count takes precedence over ret. */
	if (remapped > 0)
		return remapped;
	return ret;
}
2007-02-12 11:55:39 +03:00
const struct inode_operations ocfs2_file_iops = {
2005-12-16 01:31:24 +03:00
. setattr = ocfs2_setattr ,
. getattr = ocfs2_getattr ,
2006-11-27 04:59:21 +03:00
. permission = ocfs2_permission ,
2008-08-18 13:11:00 +04:00
. listxattr = ocfs2_listxattr ,
2008-10-04 01:32:11 +04:00
. fiemap = ocfs2_fiemap ,
2011-07-23 19:37:31 +04:00
. get_acl = ocfs2_iop_get_acl ,
2013-12-20 17:16:48 +04:00
. set_acl = ocfs2_iop_set_acl ,
2021-04-07 15:36:44 +03:00
. fileattr_get = ocfs2_fileattr_get ,
. fileattr_set = ocfs2_fileattr_set ,
2005-12-16 01:31:24 +03:00
} ;
2007-02-12 11:55:39 +03:00
const struct inode_operations ocfs2_special_file_iops = {
2005-12-16 01:31:24 +03:00
. setattr = ocfs2_setattr ,
. getattr = ocfs2_getattr ,
2006-11-27 04:59:21 +03:00
. permission = ocfs2_permission ,
2011-07-23 19:37:31 +04:00
. get_acl = ocfs2_iop_get_acl ,
2013-12-20 17:16:48 +04:00
. set_acl = ocfs2_iop_set_acl ,
2005-12-16 01:31:24 +03:00
} ;
2008-07-22 01:29:16 +04:00
/*
 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
 */
2006-03-28 13:56:42 +04:00
const struct file_operations ocfs2_fops = {
2011-07-26 01:58:15 +04:00
. llseek = ocfs2_file_llseek ,
2005-12-16 01:31:24 +03:00
. mmap = ocfs2_mmap ,
. fsync = ocfs2_sync_file ,
. release = ocfs2_file_release ,
. open = ocfs2_file_open ,
2014-04-02 22:44:18 +04:00
. read_iter = ocfs2_file_read_iter ,
2014-04-03 22:25:22 +04:00
. write_iter = ocfs2_file_write_iter ,
2008-01-27 05:17:17 +03:00
. unlocked_ioctl = ocfs2_ioctl ,
2007-03-10 02:56:28 +03:00
# ifdef CONFIG_COMPAT
. compat_ioctl = ocfs2_compat_ioctl ,
# endif
2008-07-22 01:29:16 +04:00
. lock = ocfs2_lock ,
2007-12-21 03:49:04 +03:00
. flock = ocfs2_flock ,
2016-09-23 06:35:42 +03:00
. splice_read = generic_file_splice_read ,
2014-04-05 12:34:23 +04:00
. splice_write = iter_file_splice_write ,
2011-01-14 15:07:43 +03:00
. fallocate = ocfs2_fallocate ,
2018-10-30 02:41:21 +03:00
. remap_file_range = ocfs2_remap_file_range ,
2005-12-16 01:31:24 +03:00
} ;
2006-03-28 13:56:42 +04:00
const struct file_operations ocfs2_dops = {
2007-12-19 17:24:52 +03:00
. llseek = generic_file_llseek ,
2005-12-16 01:31:24 +03:00
. read = generic_read_dir ,
2013-05-23 05:06:00 +04:00
. iterate = ocfs2_readdir ,
2005-12-16 01:31:24 +03:00
. fsync = ocfs2_sync_file ,
2007-12-21 03:49:04 +03:00
. release = ocfs2_dir_release ,
. open = ocfs2_dir_open ,
2008-01-27 05:17:17 +03:00
. unlocked_ioctl = ocfs2_ioctl ,
2007-03-10 02:56:28 +03:00
# ifdef CONFIG_COMPAT
. compat_ioctl = ocfs2_compat_ioctl ,
2008-07-22 01:29:16 +04:00
# endif
. lock = ocfs2_lock ,
. flock = ocfs2_flock ,
} ;
/*
 * POSIX-lockless variants of our file_operations.
 *
 * These will be used if the underlying cluster stack does not support
 * posix file locking, if the user passes the "localflocks" mount
 * option, or if we have a local-only fs.
 *
 * ocfs2_flock is in here because all stacks handle UNIX file locks,
 * so we still want it in the case of no stack support for
 * plocks.  Internally, it will do the right thing when asked to ignore
 * the cluster.
 */
const struct file_operations ocfs2_fops_no_plocks = {
2011-07-26 01:58:15 +04:00
. llseek = ocfs2_file_llseek ,
2008-07-22 01:29:16 +04:00
. mmap = ocfs2_mmap ,
. fsync = ocfs2_sync_file ,
. release = ocfs2_file_release ,
. open = ocfs2_file_open ,
2014-04-02 22:44:18 +04:00
. read_iter = ocfs2_file_read_iter ,
2014-04-03 22:25:22 +04:00
. write_iter = ocfs2_file_write_iter ,
2008-07-22 01:29:16 +04:00
. unlocked_ioctl = ocfs2_ioctl ,
# ifdef CONFIG_COMPAT
. compat_ioctl = ocfs2_compat_ioctl ,
# endif
. flock = ocfs2_flock ,
2016-09-23 06:35:42 +03:00
. splice_read = generic_file_splice_read ,
2014-04-05 12:34:23 +04:00
. splice_write = iter_file_splice_write ,
2011-05-23 11:57:26 +04:00
. fallocate = ocfs2_fallocate ,
2018-10-30 02:41:21 +03:00
. remap_file_range = ocfs2_remap_file_range ,
2008-07-22 01:29:16 +04:00
} ;
const struct file_operations ocfs2_dops_no_plocks = {
. llseek = generic_file_llseek ,
. read = generic_read_dir ,
2013-05-23 05:06:00 +04:00
. iterate = ocfs2_readdir ,
2008-07-22 01:29:16 +04:00
. fsync = ocfs2_sync_file ,
. release = ocfs2_dir_release ,
. open = ocfs2_dir_open ,
. unlocked_ioctl = ocfs2_ioctl ,
# ifdef CONFIG_COMPAT
. compat_ioctl = ocfs2_compat_ioctl ,
2007-03-10 02:56:28 +03:00
# endif
2007-12-21 03:49:04 +03:00
. flock = ocfs2_flock ,
2005-12-16 01:31:24 +03:00
} ;