2009-12-18 05:24:27 +03:00
/*
* Copyright ( C ) 2008 Red Hat , Inc . , Eric Paris < eparis @ redhat . com >
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 , or ( at your option )
* any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; see the file COPYING . If not , write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* fsnotify inode mark locking / lifetime / and refcnting
*
* REFCNT :
2013-07-09 02:59:46 +04:00
* The group->refcnt and mark->refcnt tell how many "things" in the kernel
* currently are referencing the objects. Both kinds of objects typically will
* live inside the kernel with a refcnt of 2, one for its creation and one for
* the reference a group and a mark hold to each other.
* If you are holding the appropriate locks, you can take a reference and the
* object itself is guaranteed to survive until the reference is dropped.
2009-12-18 05:24:27 +03:00
*
* LOCKING :
2013-07-09 02:59:46 +04:00
* There are 3 locks involved with fsnotify inode marks and they MUST be taken
* in order as follows :
2009-12-18 05:24:27 +03:00
*
2013-07-09 02:59:46 +04:00
* group - > mark_mutex
2009-12-18 05:24:27 +03:00
* mark - > lock
* inode - > i_lock
*
2013-07-09 02:59:46 +04:00
* group->mark_mutex protects the marks_list anchored inside a given group and
* each mark is hooked via the g_list. It also protects the group's private
* data (i.e. group limits).
* mark->lock protects the mark's attributes like its masks and flags.
* Furthermore it protects the access to a reference of the group that the mark
* is assigned to as well as the access to a reference of the inode/vfsmount
* that is being watched by the mark.
2009-12-18 05:24:27 +03:00
*
* inode - > i_lock protects the i_fsnotify_marks list anchored inside a
* given inode and each mark is hooked via the i_list . ( and sorta the
* free_i_list )
*
*
* LIFETIME :
* Inode marks survive between when they are added to an inode and when their
2016-12-16 12:53:32 +03:00
* refcnt = = 0. Marks are also protected by fsnotify_mark_srcu .
2009-12-18 05:24:27 +03:00
*
* The inode mark can be cleared for a number of different reasons including :
* - The inode is unlinked for the last time . ( fsnotify_inode_remove )
* - The inode is being evicted from cache . ( fsnotify_inode_delete )
* - The fs the inode is on is unmounted . ( fsnotify_inode_delete / fsnotify_unmount_inodes )
* - Something explicitly requests that it be removed . ( fsnotify_destroy_mark )
* - The fsnotify_group associated with the mark is going away and all such marks
* need to be cleaned up . ( fsnotify_clear_marks_by_group )
*
* This has the very interesting property of being able to run concurrently with
* any ( or all ) other directions .
*/
# include <linux/fs.h>
# include <linux/init.h>
# include <linux/kernel.h>
2010-07-28 18:18:38 +04:00
# include <linux/kthread.h>
2009-12-18 05:24:27 +03:00
# include <linux/module.h>
# include <linux/mutex.h>
# include <linux/slab.h>
# include <linux/spinlock.h>
2010-07-28 18:18:38 +04:00
# include <linux/srcu.h>
2009-12-18 05:24:27 +03:00
2011-07-27 03:09:06 +04:00
# include <linux/atomic.h>
2009-12-18 05:24:27 +03:00
# include <linux/fsnotify_backend.h>
# include "fsnotify.h"
2016-02-18 00:11:21 +03:00
# define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
2010-07-28 18:18:38 +04:00
struct srcu_struct fsnotify_mark_srcu ;
2017-03-14 14:31:02 +03:00
struct kmem_cache * fsnotify_mark_connector_cachep ;
2016-02-18 00:11:18 +03:00
static DEFINE_SPINLOCK ( destroy_lock ) ;
static LIST_HEAD ( destroy_list ) ;
2016-02-18 00:11:21 +03:00
2016-05-20 03:08:59 +03:00
static void fsnotify_mark_destroy_workfn ( struct work_struct * work ) ;
static DECLARE_DELAYED_WORK ( reaper_work , fsnotify_mark_destroy_workfn ) ;
2010-07-28 18:18:38 +04:00
2009-12-18 05:24:27 +03:00
void fsnotify_get_mark ( struct fsnotify_mark * mark )
{
atomic_inc ( & mark - > refcnt ) ;
}
void fsnotify_put_mark ( struct fsnotify_mark * mark )
{
2011-06-14 19:29:47 +04:00
if ( atomic_dec_and_test ( & mark - > refcnt ) ) {
if ( mark - > group )
fsnotify_put_group ( mark - > group ) ;
2009-12-18 05:24:27 +03:00
mark - > free_mark ( mark ) ;
2011-06-14 19:29:47 +04:00
}
2009-12-18 05:24:27 +03:00
}
2014-12-13 03:58:36 +03:00
/* Calculate mask of events for a list of marks */
2017-03-14 14:31:02 +03:00
u32 fsnotify_recalc_mask ( struct fsnotify_mark_connector * conn )
2014-12-13 03:58:36 +03:00
{
u32 new_mask = 0 ;
struct fsnotify_mark * mark ;
2017-03-14 14:31:02 +03:00
if ( ! conn )
return 0 ;
hlist_for_each_entry ( mark , & conn - > list , obj_list )
2014-12-13 03:58:36 +03:00
new_mask | = mark - > mask ;
return new_mask ;
}
2009-12-18 05:24:27 +03:00
/*
2015-09-05 01:43:12 +03:00
* Remove mark from inode / vfsmount list , group list , drop inode reference
* if we got one .
*
* Must be called with group - > mark_mutex held .
2009-12-18 05:24:27 +03:00
*/
2015-09-05 01:43:12 +03:00
void fsnotify_detach_mark ( struct fsnotify_mark * mark )
2009-12-18 05:24:27 +03:00
{
2009-12-18 05:24:27 +03:00
struct inode * inode = NULL ;
2015-09-05 01:43:12 +03:00
struct fsnotify_group * group = mark - > group ;
2009-12-18 05:24:27 +03:00
2011-06-14 19:29:52 +04:00
BUG_ON ( ! mutex_is_locked ( & group - > mark_mutex ) ) ;
2011-06-14 19:29:48 +04:00
spin_lock ( & mark - > lock ) ;
2009-12-18 05:24:27 +03:00
2010-07-28 18:18:38 +04:00
/* something else already called this function on this mark */
2015-09-05 01:43:12 +03:00
if ( ! ( mark - > flags & FSNOTIFY_MARK_FLAG_ATTACHED ) ) {
2009-12-18 05:24:27 +03:00
spin_unlock ( & mark - > lock ) ;
2011-06-14 19:29:51 +04:00
return ;
2009-12-18 05:24:27 +03:00
}
2015-09-05 01:43:12 +03:00
mark - > flags & = ~ FSNOTIFY_MARK_FLAG_ATTACHED ;
2010-07-28 18:18:38 +04:00
2017-03-14 16:48:00 +03:00
if ( mark - > connector - > flags & FSNOTIFY_OBJ_TYPE_INODE )
inode = fsnotify_destroy_inode_mark ( mark ) ;
else if ( mark - > connector - > flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT )
2009-12-18 05:24:27 +03:00
fsnotify_destroy_vfsmount_mark ( mark ) ;
2009-12-18 05:24:27 +03:00
else
BUG ( ) ;
2015-09-05 01:43:12 +03:00
/*
* Note that we didn ' t update flags telling whether inode cares about
* what ' s happening with children . We update these flags from
* __fsnotify_parent ( ) lazily when next event happens on one of our
* children .
*/
2009-12-18 05:24:27 +03:00
list_del_init ( & mark - > g_list ) ;
2015-07-22 02:06:53 +03:00
2009-12-18 05:24:27 +03:00
spin_unlock ( & mark - > lock ) ;
2011-06-14 19:29:52 +04:00
2017-03-14 16:48:00 +03:00
if ( inode )
fsnotify: change locking order
On Mon, Aug 01, 2011 at 04:38:22PM -0400, Eric Paris wrote:
>
> I finally built and tested a v3.0 kernel with these patches (I know I'm
> SOOOOOO far behind). Not what I hoped for:
>
> > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day...
> > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070
> > [ 150.946012] IP: [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0
> > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
> > [ 150.946012] CPU 0
> > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan]
> > [ 150.946012]
> > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM
> > [ 150.946012] RIP: 0010:[<ffffffff810ffd58>] [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282
> > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438
> > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240
> > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000
> > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a029428
> > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610
> > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000
> > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0
> > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760)
> > [ 150.946012] Stack:
> > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76
> > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250
> > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438
> > [ 150.946012] Call Trace:
> > [ 150.946012] [<ffffffff81102f76>] shmem_evict_inode+0x76/0x130
> > [ 150.946012] [<ffffffff8115e9be>] evict+0x7e/0x170
> > [ 150.946012] [<ffffffff8115ee40>] iput_final+0xd0/0x190
> > [ 150.946012] [<ffffffff8115ef33>] iput+0x33/0x40
> > [ 150.946012] [<ffffffff81180205>] fsnotify_destroy_mark_locked+0x145/0x160
> > [ 150.946012] [<ffffffff81180316>] fsnotify_destroy_mark+0x36/0x50
> > [ 150.946012] [<ffffffff81181937>] sys_inotify_rm_watch+0x77/0xd0
> > [ 150.946012] [<ffffffff815aca52>] system_call_fastpath+0x16/0x1b
> > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00
> > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a
> > [ 150.946012] RIP [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP <ffff88002c2e5df8>
> > [ 150.946012] CR2: 0000000000000070
>
> Looks at aweful lot like the problem from:
> http://www.spinics.net/lists/linux-fsdevel/msg46101.html
>
I tried to reproduce this bug with your test program, but without success.
However, if I understand correctly, this occurs since we dont hold any locks when
we call iput() in mark_destroy(), right?
With the patches you tested, iput() is also not called within any lock, since the
groups mark_mutex is released temporarily before iput() is called. This is, since
the original codes behaviour is similar.
However since we now have a mutex as the biggest lock, we can do what you
suggested (http://www.spinics.net/lists/linux-fsdevel/msg46107.html) and
call iput() with the mutex held to avoid the race.
The patch below implements this. It uses nested locking to avoid deadlock in case
we do the final iput() on an inode which still holds marks and thus would take
the mutex again when calling fsnotify_inode_delete() in destroy_inode().
Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
2011-08-12 03:13:31 +04:00
iput ( inode ) ;
2015-09-05 01:43:12 +03:00
atomic_dec ( & group - > num_marks ) ;
}
/*
2016-05-20 03:08:59 +03:00
* Prepare mark for freeing and add it to the list of marks prepared for
* freeing . The actual freeing must happen after SRCU period ends and the
* caller is responsible for this .
*
* The function returns true if the mark was added to the list of marks for
* freeing . The function returns false if someone else has already called
* __fsnotify_free_mark ( ) for the mark .
2015-09-05 01:43:12 +03:00
*/
2016-05-20 03:08:59 +03:00
static bool __fsnotify_free_mark ( struct fsnotify_mark * mark )
2015-09-05 01:43:12 +03:00
{
struct fsnotify_group * group = mark - > group ;
spin_lock ( & mark - > lock ) ;
/* something else already called this function on this mark */
if ( ! ( mark - > flags & FSNOTIFY_MARK_FLAG_ALIVE ) ) {
spin_unlock ( & mark - > lock ) ;
2016-05-20 03:08:59 +03:00
return false ;
2015-09-05 01:43:12 +03:00
}
mark - > flags & = ~ FSNOTIFY_MARK_FLAG_ALIVE ;
spin_unlock ( & mark - > lock ) ;
2009-12-18 05:24:27 +03:00
2015-07-22 02:06:53 +03:00
/*
* Some groups like to know that marks are being freed . This is a
* callback to the group function to let it know that this mark
* is being freed .
*/
if ( group - > ops - > freeing_mark )
group - > ops - > freeing_mark ( mark , group ) ;
2016-05-20 03:08:59 +03:00
spin_lock ( & destroy_lock ) ;
list_add ( & mark - > g_list , & destroy_list ) ;
spin_unlock ( & destroy_lock ) ;
return true ;
}
/*
 * Free an fsnotify mark.
 *
 * The mark is queued via __fsnotify_free_mark() and, if this call was the
 * one that queued it, the reaper work item is scheduled on
 * system_unbound_wq after FSNOTIFY_REAPER_DELAY; the freeing itself is done
 * from that work item once the SRCU period has ended. The caller must hold
 * a reference to the mark or be protected by fsnotify_mark_srcu.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
	/* Already queued by someone else: the reaper has been kicked. */
	if (!__fsnotify_free_mark(mark))
		return;

	queue_delayed_work(system_unbound_wq, &reaper_work,
			   FSNOTIFY_REAPER_DELAY);
}
void fsnotify_destroy_mark ( struct fsnotify_mark * mark ,
struct fsnotify_group * group )
{
fsnotify: change locking order
On Mon, Aug 01, 2011 at 04:38:22PM -0400, Eric Paris wrote:
>
> I finally built and tested a v3.0 kernel with these patches (I know I'm
> SOOOOOO far behind). Not what I hoped for:
>
> > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day...
> > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070
> > [ 150.946012] IP: [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0
> > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
> > [ 150.946012] CPU 0
> > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan]
> > [ 150.946012]
> > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM
> > [ 150.946012] RIP: 0010:[<ffffffff810ffd58>] [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282
> > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438
> > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240
> > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000
> > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a029428
> > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610
> > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000
> > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0
> > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760)
> > [ 150.946012] Stack:
> > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76
> > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250
> > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438
> > [ 150.946012] Call Trace:
> > [ 150.946012] [<ffffffff81102f76>] shmem_evict_inode+0x76/0x130
> > [ 150.946012] [<ffffffff8115e9be>] evict+0x7e/0x170
> > [ 150.946012] [<ffffffff8115ee40>] iput_final+0xd0/0x190
> > [ 150.946012] [<ffffffff8115ef33>] iput+0x33/0x40
> > [ 150.946012] [<ffffffff81180205>] fsnotify_destroy_mark_locked+0x145/0x160
> > [ 150.946012] [<ffffffff81180316>] fsnotify_destroy_mark+0x36/0x50
> > [ 150.946012] [<ffffffff81181937>] sys_inotify_rm_watch+0x77/0xd0
> > [ 150.946012] [<ffffffff815aca52>] system_call_fastpath+0x16/0x1b
> > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00
> > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a
> > [ 150.946012] RIP [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP <ffff88002c2e5df8>
> > [ 150.946012] CR2: 0000000000000070
>
> Looks at aweful lot like the problem from:
> http://www.spinics.net/lists/linux-fsdevel/msg46101.html
>
I tried to reproduce this bug with your test program, but without success.
However, if I understand correctly, this occurs since we dont hold any locks when
we call iput() in mark_destroy(), right?
With the patches you tested, iput() is also not called within any lock, since the
groups mark_mutex is released temporarily before iput() is called. This is, since
the original codes behaviour is similar.
However since we now have a mutex as the biggest lock, we can do what you
suggested (http://www.spinics.net/lists/linux-fsdevel/msg46107.html) and
call iput() with the mutex held to avoid the race.
The patch below implements this. It uses nested locking to avoid deadlock in case
we do the final iput() on an inode which still holds marks and thus would take
the mutex again when calling fsnotify_inode_delete() in destroy_inode().
Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
2011-08-12 03:13:31 +04:00
mutex_lock_nested ( & group - > mark_mutex , SINGLE_DEPTH_NESTING ) ;
2015-09-05 01:43:12 +03:00
fsnotify_detach_mark ( mark ) ;
2011-06-14 19:29:52 +04:00
mutex_unlock ( & group - > mark_mutex ) ;
2015-09-05 01:43:12 +03:00
fsnotify_free_mark ( mark ) ;
2009-12-18 05:24:27 +03:00
}
2017-03-14 14:31:02 +03:00
/*
 * Destroy every mark hooked on @conn.
 *
 * @lock is the spinlock protecting @conn's list. A NULL @conn means there is
 * nothing to do.
 */
void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn,
			    spinlock_t *lock)
{
	struct fsnotify_mark *victim;

	if (!conn)
		return;

	for (;;) {
		/*
		 * Be careful: we can race with e.g.
		 * fsnotify_clear_marks_by_group(), and once 'lock' is
		 * dropped the mark may be removed from the obj_list and
		 * destroyed. We hold our own reference, so the mark cannot
		 * be freed under us, and calling fsnotify_destroy_mark()
		 * more than once is fine.
		 */
		spin_lock(lock);
		if (hlist_empty(&conn->list))
			break;

		victim = hlist_entry(conn->list.first, struct fsnotify_mark,
				     obj_list);
		/*
		 * i_fsnotify_mask / mnt_fsnotify_mask are not updated here
		 * since the inode / mount is going away anyway; just take
		 * the mark off the list.
		 */
		hlist_del_init_rcu(&victim->obj_list);
		fsnotify_get_mark(victim);
		spin_unlock(lock);

		fsnotify_destroy_mark(victim, victim->group);
		fsnotify_put_mark(victim);
	}
	/* The loop is only left with 'lock' held and the list empty. */
	spin_unlock(lock);
}
2017-03-14 14:31:02 +03:00
/*
 * Return the connector pointed to by @connp to
 * fsnotify_mark_connector_cachep and reset *@connp to NULL. Does nothing
 * when *@connp is already NULL.
 */
void fsnotify_connector_free(struct fsnotify_mark_connector **connp)
{
	struct fsnotify_mark_connector *conn = *connp;

	if (!conn)
		return;

	kmem_cache_free(fsnotify_mark_connector_cachep, conn);
	*connp = NULL;
}
2009-12-18 05:24:33 +03:00
/*
 * Replace the event mask of @mark with @mask.
 *
 * The caller must already hold mark->lock; this is asserted.
 */
void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->mask = mask;
}
2009-12-18 05:24:33 +03:00
/*
 * Replace the ignored-event mask of @mark with @mask.
 *
 * The caller must already hold mark->lock; this is asserted.
 */
void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark,
					   __u32 mask)
{
	assert_spin_locked(&mark->lock);

	mark->ignored_mask = mask;
}
2009-12-18 05:24:33 +03:00
2014-11-14 02:19:33 +03:00
/*
* Sorting function for lists of fsnotify marks .
*
* Fanotify supports different notification classes ( reflected as priority of
* notification group ) . Events shall be passed to notification groups in
* decreasing priority order . To achieve this marks in notification lists for
* inodes and vfsmounts are sorted so that priorities of corresponding groups
* are descending .
*
* Furthermore correct handling of the ignore mask requires processing inode
* and vfsmount marks of each group together . Using the group address as
* further sort criterion provides a unique sorting order and thus we can
* merge inode and vfsmount lists of marks in linear time and find groups
* present in both lists .
*
* A return value of 1 signifies that b has priority over a .
* A return value of 0 signifies that the two marks have to be handled together .
* A return value of - 1 signifies that a has priority over b .
*/
int fsnotify_compare_groups ( struct fsnotify_group * a , struct fsnotify_group * b )
{
if ( a = = b )
return 0 ;
if ( ! a )
return 1 ;
if ( ! b )
return - 1 ;
if ( a - > priority < b - > priority )
return 1 ;
if ( a - > priority > b - > priority )
return - 1 ;
if ( a < b )
return 1 ;
return - 1 ;
}
2017-03-14 14:31:02 +03:00
static int fsnotify_attach_connector_to_object (
2017-03-14 16:29:35 +03:00
struct fsnotify_mark_connector * * connp ,
2017-03-14 18:11:23 +03:00
spinlock_t * lock ,
2017-03-14 16:29:35 +03:00
struct inode * inode ,
struct vfsmount * mnt )
2017-03-14 14:31:02 +03:00
{
struct fsnotify_mark_connector * conn ;
2017-03-14 18:11:23 +03:00
conn = kmem_cache_alloc ( fsnotify_mark_connector_cachep , GFP_KERNEL ) ;
2017-03-14 14:31:02 +03:00
if ( ! conn )
return - ENOMEM ;
INIT_HLIST_HEAD ( & conn - > list ) ;
2017-03-14 16:29:35 +03:00
if ( inode ) {
conn - > flags = FSNOTIFY_OBJ_TYPE_INODE ;
conn - > inode = inode ;
} else {
conn - > flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT ;
conn - > mnt = mnt ;
}
2017-03-14 14:31:02 +03:00
/*
* Make sure ' conn ' initialization is visible . Matches
* lockless_dereference ( ) in fsnotify ( ) .
*/
smp_wmb ( ) ;
2017-03-14 18:11:23 +03:00
spin_lock ( lock ) ;
if ( ! * connp )
* connp = conn ;
else
kmem_cache_free ( fsnotify_mark_connector_cachep , conn ) ;
spin_unlock ( lock ) ;
2017-03-14 14:31:02 +03:00
return 0 ;
}
/*
* Add mark into proper place in given list of marks . These marks may be used
* for the fsnotify backend to determine which event types should be delivered
* to which group and for which inodes . These marks are ordered according to
* priority , highest number first , and then by the group ' s location in memory .
*/
2017-03-14 18:11:23 +03:00
static int fsnotify_add_mark_list ( struct fsnotify_mark * mark ,
struct inode * inode , struct vfsmount * mnt ,
int allow_dups )
2014-12-13 03:58:36 +03:00
{
struct fsnotify_mark * lmark , * last = NULL ;
2017-03-14 14:31:02 +03:00
struct fsnotify_mark_connector * conn ;
2017-03-14 18:11:23 +03:00
struct fsnotify_mark_connector * * connp ;
spinlock_t * lock ;
2014-12-13 03:58:36 +03:00
int cmp ;
2017-03-14 18:11:23 +03:00
int err = 0 ;
if ( WARN_ON ( ! inode & & ! mnt ) )
return - EINVAL ;
if ( inode ) {
connp = & inode - > i_fsnotify_marks ;
lock = & inode - > i_lock ;
} else {
connp = & real_mount ( mnt ) - > mnt_fsnotify_marks ;
lock = & mnt - > mnt_root - > d_lock ;
}
2017-03-14 14:31:02 +03:00
if ( ! * connp ) {
2017-03-14 18:11:23 +03:00
err = fsnotify_attach_connector_to_object ( connp , lock ,
inode , mnt ) ;
2017-03-14 14:31:02 +03:00
if ( err )
return err ;
}
2017-03-14 18:11:23 +03:00
spin_lock ( & mark - > lock ) ;
spin_lock ( lock ) ;
2017-03-14 14:31:02 +03:00
conn = * connp ;
2014-12-13 03:58:36 +03:00
/* is mark the first mark? */
2017-03-14 14:31:02 +03:00
if ( hlist_empty ( & conn - > list ) ) {
hlist_add_head_rcu ( & mark - > obj_list , & conn - > list ) ;
2017-03-14 16:48:00 +03:00
if ( inode )
__iget ( inode ) ;
2017-03-14 16:29:35 +03:00
goto added ;
2014-12-13 03:58:36 +03:00
}
/* should mark be in the middle of the current list? */
2017-03-14 14:31:02 +03:00
hlist_for_each_entry ( lmark , & conn - > list , obj_list ) {
2014-12-13 03:58:36 +03:00
last = lmark ;
2017-03-14 18:11:23 +03:00
if ( ( lmark - > group = = mark - > group ) & & ! allow_dups ) {
err = - EEXIST ;
goto out_err ;
}
2014-12-13 03:58:36 +03:00
cmp = fsnotify_compare_groups ( lmark - > group , mark - > group ) ;
if ( cmp > = 0 ) {
hlist_add_before_rcu ( & mark - > obj_list , & lmark - > obj_list ) ;
2017-03-14 16:29:35 +03:00
goto added ;
2014-12-13 03:58:36 +03:00
}
}
BUG_ON ( last = = NULL ) ;
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu ( & mark - > obj_list , & last - > obj_list ) ;
2017-03-14 16:29:35 +03:00
added :
mark - > connector = conn ;
2017-03-14 18:11:23 +03:00
out_err :
spin_unlock ( lock ) ;
spin_unlock ( & mark - > lock ) ;
return err ;
2014-12-13 03:58:36 +03:00
}
2009-12-18 05:24:27 +03:00
/*
* Attach an initialized mark to a given group and fs object .
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group .
*/
2011-06-14 19:29:52 +04:00
int fsnotify_add_mark_locked ( struct fsnotify_mark * mark ,
struct fsnotify_group * group , struct inode * inode ,
struct vfsmount * mnt , int allow_dups )
2009-12-18 05:24:27 +03:00
{
int ret = 0 ;
BUG_ON ( inode & & mnt ) ;
BUG_ON ( ! inode & & ! mnt ) ;
2011-06-14 19:29:52 +04:00
BUG_ON ( ! mutex_is_locked ( & group - > mark_mutex ) ) ;
2009-12-18 05:24:27 +03:00
/*
* LOCKING ORDER ! ! ! !
2011-06-14 19:29:50 +04:00
* group - > mark_mutex
2011-06-14 19:29:48 +04:00
* mark - > lock
2009-12-18 05:24:27 +03:00
* inode - > i_lock
*/
2011-06-14 19:29:48 +04:00
spin_lock ( & mark - > lock ) ;
2015-09-05 01:43:12 +03:00
mark - > flags | = FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED ;
2010-07-28 18:18:38 +04:00
2011-06-14 19:29:47 +04:00
fsnotify_get_group ( group ) ;
2009-12-18 05:24:27 +03:00
mark - > group = group ;
list_add ( & mark - > g_list , & group - > marks_list ) ;
atomic_inc ( & group - > num_marks ) ;
fsnotify_get_mark ( mark ) ; /* for i_list and g_list */
spin_unlock ( & mark - > lock ) ;
2017-03-14 18:11:23 +03:00
ret = fsnotify_add_mark_list ( mark , inode , mnt , allow_dups ) ;
if ( ret )
goto err ;
2009-12-18 05:24:27 +03:00
if ( inode )
2017-03-14 18:11:23 +03:00
fsnotify_recalc_inode_mask ( inode ) ;
else
fsnotify_recalc_vfsmount_mask ( mnt ) ;
2009-12-18 05:24:27 +03:00
return ret ;
err :
2010-07-28 18:18:38 +04:00
mark - > flags & = ~ FSNOTIFY_MARK_FLAG_ALIVE ;
2009-12-18 05:24:27 +03:00
list_del_init ( & mark - > g_list ) ;
2011-06-14 19:29:47 +04:00
fsnotify_put_group ( group ) ;
2010-07-28 18:18:38 +04:00
mark - > group = NULL ;
2009-12-18 05:24:27 +03:00
atomic_dec ( & group - > num_marks ) ;
spin_unlock ( & mark - > lock ) ;
2016-02-18 00:11:18 +03:00
spin_lock ( & destroy_lock ) ;
list_add ( & mark - > g_list , & destroy_list ) ;
spin_unlock ( & destroy_lock ) ;
2016-02-18 00:11:21 +03:00
queue_delayed_work ( system_unbound_wq , & reaper_work ,
FSNOTIFY_REAPER_DELAY ) ;
2016-02-18 00:11:18 +03:00
2009-12-18 05:24:27 +03:00
return ret ;
}
2011-06-14 19:29:52 +04:00
int fsnotify_add_mark ( struct fsnotify_mark * mark , struct fsnotify_group * group ,
struct inode * inode , struct vfsmount * mnt , int allow_dups )
{
int ret ;
mutex_lock ( & group - > mark_mutex ) ;
ret = fsnotify_add_mark_locked ( mark , group , inode , mnt , allow_dups ) ;
mutex_unlock ( & group - > mark_mutex ) ;
return ret ;
}
2014-12-13 03:58:36 +03:00
/*
* Given a list of marks , find the mark associated with given group . If found
* take a reference to that mark and return it , else return NULL .
*/
2017-03-14 14:31:02 +03:00
struct fsnotify_mark * fsnotify_find_mark ( struct fsnotify_mark_connector * conn ,
2014-12-13 03:58:36 +03:00
struct fsnotify_group * group )
{
struct fsnotify_mark * mark ;
2017-03-14 14:31:02 +03:00
if ( ! conn )
return NULL ;
hlist_for_each_entry ( mark , & conn - > list , obj_list ) {
2014-12-13 03:58:36 +03:00
if ( mark - > group = = group ) {
fsnotify_get_mark ( mark ) ;
return mark ;
}
}
return NULL ;
}
2009-12-18 05:24:27 +03:00
/*
2015-07-22 02:06:53 +03:00
* clear any marks in a group in which mark - > flags & flags is true
2009-12-18 05:24:27 +03:00
*/
2009-12-18 05:24:34 +03:00
void fsnotify_clear_marks_by_group_flags ( struct fsnotify_group * group ,
unsigned int flags )
2009-12-18 05:24:27 +03:00
{
struct fsnotify_mark * lmark , * mark ;
2015-08-07 01:46:42 +03:00
LIST_HEAD ( to_free ) ;
2009-12-18 05:24:27 +03:00
2015-08-07 01:46:42 +03:00
/*
* We have to be really careful here . Anytime we drop mark_mutex , e . g .
* fsnotify_clear_marks_by_inode ( ) can come and free marks . Even in our
* to_free list so we have to use mark_mutex even when accessing that
* list . And freeing mark requires us to drop mark_mutex . So we can
* reliably free only the first mark in the list . That ' s why we first
* move marks to free to to_free list in one go and then free marks in
* to_free list one by one .
*/
fsnotify: change locking order
On Mon, Aug 01, 2011 at 04:38:22PM -0400, Eric Paris wrote:
>
> I finally built and tested a v3.0 kernel with these patches (I know I'm
> SOOOOOO far behind). Not what I hoped for:
>
> > [ 150.937798] VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day...
> > [ 150.945290] BUG: unable to handle kernel NULL pointer dereference at 0000000000000070
> > [ 150.946012] IP: [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] PGD 2bf9e067 PUD 2bf9f067 PMD 0
> > [ 150.946012] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
> > [ 150.946012] CPU 0
> > [ 150.946012] Modules linked in: nfs lockd fscache auth_rpcgss nfs_acl sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ext4 jbd2 crc16 joydev ata_piix i2c_piix4 pcspkr uinput ipv6 autofs4 usbhid [last unloaded: scsi_wait_scan]
> > [ 150.946012]
> > [ 150.946012] Pid: 2764, comm: syscall_thrash Not tainted 3.0.0+ #1 Red Hat KVM
> > [ 150.946012] RIP: 0010:[<ffffffff810ffd58>] [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP: 0018:ffff88002c2e5df8 EFLAGS: 00010282
> > [ 150.946012] RAX: 000000004e370d9f RBX: 0000000000000000 RCX: ffff88003a029438
> > [ 150.946012] RDX: 0000000033630a5f RSI: 0000000000000000 RDI: ffff88003491c240
> > [ 150.946012] RBP: ffff88002c2e5e08 R08: 0000000000000000 R09: 0000000000000000
> > [ 150.946012] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a029428
> > [ 150.946012] R13: ffff88003a029428 R14: ffff88003a029428 R15: ffff88003499a610
> > [ 150.946012] FS: 00007f5a05420700(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000
> > [ 150.946012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > [ 150.946012] CR2: 0000000000000070 CR3: 000000002a662000 CR4: 00000000000006f0
> > [ 150.946012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [ 150.946012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> > [ 150.946012] Process syscall_thrash (pid: 2764, threadinfo ffff88002c2e4000, task ffff88002bfbc760)
> > [ 150.946012] Stack:
> > [ 150.946012] ffff88003a029438 ffff88003a029428 ffff88002c2e5e38 ffffffff81102f76
> > [ 150.946012] ffff88003a029438 ffff88003a029598 ffffffff8160f9c0 ffff88002c221250
> > [ 150.946012] ffff88002c2e5e68 ffffffff8115e9be ffff88002c2e5e68 ffff88003a029438
> > [ 150.946012] Call Trace:
> > [ 150.946012] [<ffffffff81102f76>] shmem_evict_inode+0x76/0x130
> > [ 150.946012] [<ffffffff8115e9be>] evict+0x7e/0x170
> > [ 150.946012] [<ffffffff8115ee40>] iput_final+0xd0/0x190
> > [ 150.946012] [<ffffffff8115ef33>] iput+0x33/0x40
> > [ 150.946012] [<ffffffff81180205>] fsnotify_destroy_mark_locked+0x145/0x160
> > [ 150.946012] [<ffffffff81180316>] fsnotify_destroy_mark+0x36/0x50
> > [ 150.946012] [<ffffffff81181937>] sys_inotify_rm_watch+0x77/0xd0
> > [ 150.946012] [<ffffffff815aca52>] system_call_fastpath+0x16/0x1b
> > [ 150.946012] Code: 67 4a 00 b8 e4 ff ff ff eb aa 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 8b 9f 40 05 00 00
> > [ 150.946012] 83 7b 70 00 74 1c 4c 8d a3 80 00 00 00 4c 89 e7 e8 d2 5d 4a
> > [ 150.946012] RIP [<ffffffff810ffd58>] shmem_free_inode+0x18/0x50
> > [ 150.946012] RSP <ffff88002c2e5df8>
> > [ 150.946012] CR2: 0000000000000070
>
> Looks at aweful lot like the problem from:
> http://www.spinics.net/lists/linux-fsdevel/msg46101.html
>
I tried to reproduce this bug with your test program, but without success.
However, if I understand correctly, this occurs since we dont hold any locks when
we call iput() in mark_destroy(), right?
With the patches you tested, iput() is also not called within any lock, since the
groups mark_mutex is released temporarily before iput() is called. This is, since
the original codes behaviour is similar.
However since we now have a mutex as the biggest lock, we can do what you
suggested (http://www.spinics.net/lists/linux-fsdevel/msg46107.html) and
call iput() with the mutex held to avoid the race.
The patch below implements this. It uses nested locking to avoid deadlock in case
we do the final iput() on an inode which still holds marks and thus would take
the mutex again when calling fsnotify_inode_delete() in destroy_inode().
Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
2011-08-12 03:13:31 +04:00
mutex_lock_nested ( & group - > mark_mutex , SINGLE_DEPTH_NESTING ) ;
2009-12-18 05:24:27 +03:00
list_for_each_entry_safe ( mark , lmark , & group - > marks_list , g_list ) {
2017-03-14 16:29:35 +03:00
if ( mark - > connector - > flags & flags )
2015-08-07 01:46:42 +03:00
list_move ( & mark - > g_list , & to_free ) ;
2009-12-18 05:24:27 +03:00
}
2011-06-14 19:29:50 +04:00
mutex_unlock ( & group - > mark_mutex ) ;
2015-08-07 01:46:42 +03:00
while ( 1 ) {
mutex_lock_nested ( & group - > mark_mutex , SINGLE_DEPTH_NESTING ) ;
if ( list_empty ( & to_free ) ) {
mutex_unlock ( & group - > mark_mutex ) ;
break ;
}
mark = list_first_entry ( & to_free , struct fsnotify_mark , g_list ) ;
fsnotify_get_mark ( mark ) ;
2015-09-05 01:43:12 +03:00
fsnotify_detach_mark ( mark ) ;
2015-08-07 01:46:42 +03:00
mutex_unlock ( & group - > mark_mutex ) ;
2015-09-05 01:43:12 +03:00
fsnotify_free_mark ( mark ) ;
2015-08-07 01:46:42 +03:00
fsnotify_put_mark ( mark ) ;
}
2009-12-18 05:24:27 +03:00
}
2009-12-18 05:24:34 +03:00
/*
2016-05-20 03:08:59 +03:00
* Given a group , prepare for freeing all the marks associated with that group .
* The marks are attached to the list of marks prepared for destruction , the
* caller is responsible for freeing marks in that list after SRCU period has
* ended .
2009-12-18 05:24:34 +03:00
*/
2016-05-20 03:08:59 +03:00
void fsnotify_detach_group_marks ( struct fsnotify_group * group )
2009-12-18 05:24:34 +03:00
{
2016-05-20 03:08:59 +03:00
struct fsnotify_mark * mark ;
while ( 1 ) {
mutex_lock_nested ( & group - > mark_mutex , SINGLE_DEPTH_NESTING ) ;
if ( list_empty ( & group - > marks_list ) ) {
mutex_unlock ( & group - > mark_mutex ) ;
break ;
}
mark = list_first_entry ( & group - > marks_list ,
struct fsnotify_mark , g_list ) ;
fsnotify_get_mark ( mark ) ;
fsnotify_detach_mark ( mark ) ;
mutex_unlock ( & group - > mark_mutex ) ;
__fsnotify_free_mark ( mark ) ;
fsnotify_put_mark ( mark ) ;
}
2009-12-18 05:24:34 +03:00
}
2009-12-18 05:24:27 +03:00
/*
* Nothing fancy , just initialize lists and locks and counters .
*/
void fsnotify_init_mark ( struct fsnotify_mark * mark ,
void ( * free_mark ) ( struct fsnotify_mark * mark ) )
{
2009-12-18 05:24:27 +03:00
memset ( mark , 0 , sizeof ( * mark ) ) ;
2009-12-18 05:24:27 +03:00
spin_lock_init ( & mark - > lock ) ;
atomic_set ( & mark - > refcnt , 1 ) ;
mark - > free_mark = free_mark ;
}
2016-02-18 00:11:18 +03:00
2016-05-20 03:08:59 +03:00
/*
* Destroy all marks in destroy_list , waits for SRCU period to finish before
* actually freeing marks .
*/
void fsnotify_mark_destroy_list ( void )
2016-02-18 00:11:18 +03:00
{
struct fsnotify_mark * mark , * next ;
struct list_head private_destroy_list ;
2016-02-18 00:11:21 +03:00
spin_lock ( & destroy_lock ) ;
/* exchange the list head */
list_replace_init ( & destroy_list , & private_destroy_list ) ;
spin_unlock ( & destroy_lock ) ;
2016-02-18 00:11:18 +03:00
2016-02-18 00:11:21 +03:00
synchronize_srcu ( & fsnotify_mark_srcu ) ;
2016-02-18 00:11:18 +03:00
2016-02-18 00:11:21 +03:00
list_for_each_entry_safe ( mark , next , & private_destroy_list , g_list ) {
list_del_init ( & mark - > g_list ) ;
fsnotify_put_mark ( mark ) ;
2016-02-18 00:11:18 +03:00
}
}
2016-05-20 03:08:59 +03:00
/*
 * Work item callback: @work itself is not used, the function simply calls
 * fsnotify_mark_destroy_list ( ) .
 * NOTE(review): presumably the handler behind reaper_work - confirm at the
 * point where reaper_work is declared.
 */
static void fsnotify_mark_destroy_workfn ( struct work_struct * work )
{
fsnotify_mark_destroy_list ( ) ;
}