2019-05-24 13:04:05 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-12-16 01:31:24 +03:00
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * sysfile.c
 *
 * Initialize, read, write, etc. system files.
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */
# include <linux/fs.h>
# include <linux/types.h>
# include <linux/highmem.h>
# include <cluster/masklog.h>
2006-09-09 01:43:18 +04:00
# include "ocfs2.h"
2005-12-16 01:31:24 +03:00
# include "alloc.h"
# include "dir.h"
# include "inode.h"
# include "journal.h"
# include "sysfile.h"
# include "buffer_head_io.h"
/*
 * Forward declaration: the helper that looks a system inode up by name and
 * reads it in is defined later in this file, after its only visible caller.
 */
static struct inode * _ocfs2_get_system_file_inode ( struct ocfs2_super * osb ,
int type ,
u32 slot ) ;
2009-06-18 09:12:06 +04:00
# ifdef CONFIG_DEBUG_LOCK_ALLOC
2009-06-04 17:26:50 +04:00
/*
 * One lockdep class key per system inode type; indexed by type and passed to
 * lockdep_init_map() when a system inode's cluster lock map is initialised.
 */
static struct lock_class_key ocfs2_sysfile_cluster_lock_key [ NUM_SYSTEM_INODES ] ;
2009-06-18 09:12:06 +04:00
# endif
2009-06-04 17:26:50 +04:00
2005-12-16 01:31:24 +03:00
static inline int is_global_system_inode ( int type )
{
return type > = OCFS2_FIRST_ONLINE_SYSTEM_INODE & &
type < = OCFS2_LAST_GLOBAL_SYSTEM_INODE ;
}
ocfs2: Cache system inodes of other slots.
During orphan scan, if we are slot 0, and we are replaying
orphan_dir:0001, the general process is that for every file
in this dir:
1. we will iget orphan_dir:0001, since there is no inode for it.
we will have to create an inode and read it from the disk.
2. do the normal work, such as delete_inode and remove it from
the dir if it is allowed.
3. call iput orphan_dir:0001 when we are done. In this case,
since we have no dcache for this inode, i_count will
reach 0, and VFS will have to call clear_inode and in
ocfs2_clear_inode we will checkpoint the inode which will let
ocfs2_cmt and journald begin to work.
4. We loop back to 1 for the next file.
So you see, actually for every deleted file, we have to read the
orphan dir from the disk and checkpoint the journal. It is very
time consuming and causes a lot of journal checkpoint I/O.
A better solution is that we can have another reference for these
inodes in ocfs2_super. So if there is no other race among
nodes(which will let dlmglue to checkpoint the inode), for step 3,
clear_inode won't be called and for step 1, we may only need to
read the inode for the 1st time. This is a big win for us.
So this patch will try to cache system inodes of other slots so
that we will have one more reference for these inodes and avoid
the extra inode read and journal checkpoint.
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-08-16 12:58:21 +04:00
/*
 * Find the cache entry for a per-slot (local) system inode.
 *
 * @osb:  super block info that owns the local_system_inodes table
 * @type: local system inode type; BUG()s unless it lies within
 *        [OCFS2_FIRST_LOCAL_SYSTEM_INODE, OCFS2_LAST_LOCAL_SYSTEM_INODE]
 * @slot: slot the inode belongs to; BUG()s on OCFS2_INVALID_SLOT
 *
 * The table holds NUM_LOCAL_SYSTEM_INODES entries for each of
 * osb->max_slots slots and is allocated on first use.  Caching the system
 * inodes of other slots keeps one more reference on them in ocfs2_super,
 * which (e.g. during orphan scan) avoids re-reading the inode from disk and
 * checkpointing the journal for every processed file.
 *
 * Returns the address of the table entry for (@type, @slot), or NULL if the
 * table could not be allocated.
 */
static struct inode **get_local_system_inode(struct ocfs2_super *osb,
					     int type,
					     u32 slot)
{
	struct inode **table, **loser = NULL;
	int idx;

	BUG_ON(slot == OCFS2_INVALID_SLOT);
	BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
	       type > OCFS2_LAST_LOCAL_SYSTEM_INODE);

	/* Snapshot the table pointer under the lock. */
	spin_lock(&osb->osb_lock);
	table = osb->local_system_inodes;
	spin_unlock(&osb->osb_lock);

	if (unlikely(!table)) {
		struct inode **fresh;

		/* Allocate with osb_lock dropped, then publish under it. */
		fresh = kzalloc(array3_size(sizeof(struct inode *),
					    NUM_LOCAL_SYSTEM_INODES,
					    osb->max_slots),
				GFP_NOFS);
		if (!fresh) {
			mlog_errno(-ENOMEM);
			/*
			 * Hand back NULL so ocfs2_get_system_file_inode()
			 * still creates an inode and uses it, just without
			 * caching.  Allocation is retried on the next call.
			 */
			return NULL;
		}

		spin_lock(&osb->osb_lock);
		table = osb->local_system_inodes;
		if (table) {
			/* Another caller installed a table first; drop ours. */
			loser = fresh;
		} else {
			table = fresh;
			osb->local_system_inodes = fresh;
		}
		spin_unlock(&osb->osb_lock);

		kfree(loser);
	}

	/* Entries are laid out slot-major, then by type within the slot. */
	idx = (slot * NUM_LOCAL_SYSTEM_INODES) +
	      (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);

	return &table[idx];
}
/*
 * Get a referenced system file inode, consulting the per-superblock caches
 * before falling back to a lookup.
 *
 * @osb:  super block info holding the global and local system inode caches
 * @type: system inode type
 * @slot: slot number, used for non-global (per-slot) types
 *
 * On a cache hit the caller receives an extra reference taken with igrab().
 * On a miss the inode is looked up and, when a cache entry is available,
 * a second reference is stored there so later calls hit the cache.
 *
 * Returns the inode with a reference for the caller, or NULL if the lookup
 * failed.
 */
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
					  int type,
					  u32 slot)
{
	struct inode **cache_slot;
	struct inode *cached = NULL;
	struct inode *inode;

	/* Pick the cache entry; may be NULL if the local table is missing. */
	cache_slot = is_global_system_inode(type) ?
			&(osb->global_system_inodes[type]) :
			get_local_system_inode(osb, type, slot);

	mutex_lock(&osb->system_file_mutex);

	if (cache_slot)
		cached = *cache_slot;

	if (cached) {
		/* Cache hit: take the caller's ref on top of the array's. */
		inode = igrab(cached);
		mutex_unlock(&osb->system_file_mutex);
		BUG_ON(!inode);

		return inode;
	}

	/* Cache miss: the lookup returns one reference via iget. */
	inode = _ocfs2_get_system_file_inode(osb, type, slot);

	/* First time in the array: the array keeps its own reference. */
	if (cache_slot && inode) {
		*cache_slot = igrab(inode);
		BUG_ON(!*cache_slot);
	}

	mutex_unlock(&osb->system_file_mutex);

	return inode;
}
/*
 * Look a system file up by name in the system directory and read its inode.
 *
 * @osb:  super block info; osb->sys_root_inode is the directory searched
 * @type: system inode type, used to build the name and passed to ocfs2_iget()
 * @slot: slot number, used to build the name
 *
 * With CONFIG_DEBUG_LOCK_ALLOC the inode lock's lockdep map is either
 * disabled (quota and journal inodes) or given a per-type class key.
 *
 * Returns the inode from ocfs2_iget(), or NULL if the name lookup or the
 * inode read failed.
 */
static struct inode *_ocfs2_get_system_file_inode(struct ocfs2_super *osb,
						  int type,
						  u32 slot)
{
	char name[40];
	struct inode *inode;
	u64 blkno;
	int ret;

	ocfs2_sprintf_system_inode_name(name, sizeof(name), type, slot);

	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, name,
					 strlen(name), &blkno);
	if (ret < 0)
		return NULL;

	inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
	if (IS_ERR(inode)) {
		mlog_errno(PTR_ERR(inode));
		return NULL;
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
	    type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
	    type == JOURNAL_SYSTEM_INODE) {
		/*
		 * The inode lock on these inodes does not really belong to
		 * any process and lockdep cannot handle that, so ignore it.
		 */
		OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
	} else {
		lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map,
				 ocfs2_system_inodes[type].si_name,
				 &ocfs2_sysfile_cluster_lock_key[type], 0);
	}
#endif

	return inode;
}