2009-08-23 00:56:45 +04:00
/*
* Read - Copy Update mechanism for mutual exclusion ( tree - based version )
* Internal non - public definitions .
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
2013-12-03 22:02:52 +04:00
* along with this program ; if not , you can access it online at
* http : //www.gnu.org/licenses/gpl-2.0.html.
2009-08-23 00:56:45 +04:00
*
* Copyright IBM Corporation , 2008
*
* Author : Ingo Molnar < mingo @ elte . hu >
* Paul E . McKenney < paulmck @ linux . vnet . ibm . com >
*/
# include <linux/cache.h>
# include <linux/spinlock.h>
# include <linux/threads.h>
# include <linux/cpumask.h>
# include <linux/seqlock.h>
2015-06-25 21:27:10 +03:00
# include <linux/stop_machine.h>
2009-08-23 00:56:45 +04:00
/*
2012-04-19 23:20:14 +04:00
* Define shape of hierarchy based on NR_CPUS , CONFIG_RCU_FANOUT , and
* CONFIG_RCU_FANOUT_LEAF .
2009-08-23 00:56:45 +04:00
* In theory , it should be possible to add more levels straightforwardly .
2010-12-15 03:07:52 +03:00
* In practice , this did work well going from three levels to four .
* Of course , your mileage may vary .
2009-08-23 00:56:45 +04:00
*/
2015-04-21 00:27:43 +03:00
/*
 * RCU_FANOUT: multiplier applied for each level above the leaves (see
 * RCU_FANOUT_2..RCU_FANOUT_4).  Taken from Kconfig when provided,
 * otherwise 64 on 64-bit builds and 32 elsewhere.
 */
#if defined(CONFIG_RCU_FANOUT)
# define RCU_FANOUT CONFIG_RCU_FANOUT
#elif defined(CONFIG_64BIT)
# define RCU_FANOUT 64
#else
# define RCU_FANOUT 32
#endif /* #if defined(CONFIG_RCU_FANOUT) */
2015-04-21 19:12:13 +03:00
/*
 * RCU_FANOUT_LEAF: fanout of the leaf level.  Taken from Kconfig when
 * provided, otherwise 64 on 64-bit builds and 32 elsewhere.
 * RCU_FANOUT_1 is the capacity of a single-level tree, which is just
 * the leaf fanout.
 */
#if defined(CONFIG_RCU_FANOUT_LEAF)
# define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
#elif defined(CONFIG_64BIT)
# define RCU_FANOUT_LEAF 64
#else
# define RCU_FANOUT_LEAF 32
#endif /* #if defined(CONFIG_RCU_FANOUT_LEAF) */
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
2015-04-21 00:27:43 +03:00
/*
 * Cumulative capacity of a tree with 2, 3 and 4 levels: each level added
 * above the leaves multiplies the previous capacity by RCU_FANOUT.
 * NR_CPUS is compared against these values to pick RCU_NUM_LVLS.
 */
# define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
# define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
# define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
2009-08-23 00:56:45 +04:00
2010-12-15 03:07:52 +03:00
# if NR_CPUS <= RCU_FANOUT_1
2012-04-24 02:52:53 +04:00
# define RCU_NUM_LVLS 1
2009-08-23 00:56:45 +04:00
# define NUM_RCU_LVL_0 1
2015-06-03 09:18:31 +03:00
# define NUM_RCU_NODES NUM_RCU_LVL_0
2015-06-03 09:18:30 +03:00
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
2015-06-25 00:20:08 +03:00
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
2015-07-20 01:13:40 +03:00
/*
 * Per-level expedited-sched names for a one-level tree.  The literal
 * carries no padding spaces, matching the other *_NAME_INIT tables.
 */
# define RCU_EXP_SCHED_NAME_INIT \
		{ "rcu_node_exp_sched_0" }
2010-12-15 03:07:52 +03:00
# elif NR_CPUS <= RCU_FANOUT_2
2012-04-24 02:52:53 +04:00
# define RCU_NUM_LVLS 2
2009-08-23 00:56:45 +04:00
# define NUM_RCU_LVL_0 1
2010-12-15 03:07:52 +03:00
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
2015-06-03 09:18:31 +03:00
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
2015-06-03 09:18:30 +03:00
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
2015-06-25 00:20:08 +03:00
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
2015-07-20 01:13:40 +03:00
/*
 * Per-level expedited-sched names for a two-level tree.  The literals
 * carry no padding spaces, matching the other *_NAME_INIT tables.
 */
# define RCU_EXP_SCHED_NAME_INIT \
		{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
2010-12-15 03:07:52 +03:00
# elif NR_CPUS <= RCU_FANOUT_3
2012-04-24 02:52:53 +04:00
# define RCU_NUM_LVLS 3
2009-08-23 00:56:45 +04:00
# define NUM_RCU_LVL_0 1
2010-12-15 03:07:52 +03:00
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
2015-06-03 09:18:31 +03:00
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
2015-06-03 09:18:30 +03:00
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
2015-06-25 00:20:08 +03:00
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
2015-07-20 01:13:40 +03:00
/*
 * Per-level expedited-sched names for a three-level tree.  The literals
 * carry no padding spaces, matching the other *_NAME_INIT tables.
 */
# define RCU_EXP_SCHED_NAME_INIT \
		{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
2010-12-15 03:07:52 +03:00
# elif NR_CPUS <= RCU_FANOUT_4
2012-04-24 02:52:53 +04:00
# define RCU_NUM_LVLS 4
2009-12-02 23:10:14 +03:00
# define NUM_RCU_LVL_0 1
2010-12-15 03:07:52 +03:00
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
2015-06-03 09:18:31 +03:00
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
2015-06-03 09:18:30 +03:00
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
2015-06-25 00:20:08 +03:00
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
2015-07-20 01:13:40 +03:00
/*
 * Per-level expedited-sched names for a four-level tree.  The literals
 * carry no padding spaces, matching the other *_NAME_INIT tables.
 */
# define RCU_EXP_SCHED_NAME_INIT \
		{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
2009-08-23 00:56:45 +04:00
# else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
2010-12-15 03:07:52 +03:00
# endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
2009-08-23 00:56:45 +04:00
2012-04-24 02:52:53 +04:00
/*
 * Tree geometry variables, defined elsewhere.  The rcu_for_each_*()
 * iterators in this file use rcu_num_nodes as the bound on ->node[] and
 * rcu_num_lvls - 1 as the index of the leaf level in ->level[].
 */
extern int rcu_num_lvls ;
extern int rcu_num_nodes ;
2009-08-23 00:56:45 +04:00
/*
* Dynticks per - CPU state .
*/
struct rcu_dynticks {
rcu: Track idleness independent of idle tasks
Earlier versions of RCU used the scheduling-clock tick to detect idleness
by checking for the idle task, but handled idleness differently for
CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side
critical sections in the idle task, for example, for tracing. A more
fine-grained detection of idleness is therefore required.
This commit presses the old dyntick-idle code into full-time service,
so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is
always invoked at the beginning of an idle loop iteration. Similarly,
rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked
at the end of an idle-loop iteration. This allows the idle task to
use RCU everywhere except between consecutive rcu_idle_enter() and
rcu_idle_exit() calls, in turn allowing architecture maintainers to
specify exactly where in the idle loop that RCU may be used.
Because some of the userspace upcall uses can result in what looks
to RCU like half of an interrupt, it is not possible to expect that
the irq_enter() and irq_exit() hooks will give exact counts. This
patch therefore expands the ->dynticks_nesting counter to 64 bits
and uses two separate bitfields to count process/idle transitions
and interrupt entry/exit transitions. It is presumed that userspace
upcalls do not happen in the idle loop or from usermode execution
(though usermode might do a system call that results in an upcall).
The counter is hard-reset on each process/idle transition, which
avoids the interrupt entry/exit error from accumulating. Overflow
is avoided by the 64-bitness of the ->dyntick_nesting counter.
This commit also adds warnings if a non-idle task asks RCU to enter
idle state (and these checks will need some adjustment before applying
Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246).
In addition, validation of ->dynticks and ->dynticks_nesting is added.
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-09-30 23:10:22 +04:00
long long dynticks_nesting ; /* Track irq/process nesting level. */
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting ; /* Track NMI nesting level. */
atomic_t dynticks ; /* Even value for idle, else odd. */
nohz_full: Add rcu_dyntick data for scalable detection of all-idle state
This commit adds fields to the rcu_dyntick structure that are used to
detect idle CPUs. These new fields differ from the existing ones in
that the existing ones consider a CPU executing in user mode to be idle,
where the new ones consider CPUs executing in user mode to be busy.
The handling of these new fields is otherwise quite similar to that for
the existing fields. This commit also adds the initialization required
for these fields.
So, why is usermode execution treated differently, with RCU considering
it a quiescent state equivalent to idle, while in contrast the new
full-system idle state detection considers usermode execution to be
non-idle?
It turns out that although one of RCU's quiescent states is usermode
execution, it is not a full-system idle state. This is because the
purpose of the full-system idle state is not RCU, but rather determining
when accurate timekeeping can safely be disabled. Whenever accurate
timekeeping is required in a CONFIG_NO_HZ_FULL kernel, at least one
CPU must keep the scheduling-clock tick going. If even one CPU is
executing in user mode, accurate timekeeping is required, particularly for
architectures where gettimeofday() and friends do not enter the kernel.
Only when all CPUs are really and truly idle can accurate timekeeping be
disabled, allowing all CPUs to turn off the scheduling clock interrupt,
thus greatly improving energy efficiency.
This naturally raises the question "Why is this code in RCU rather than in
timekeeping?", and the answer is that RCU has the data and infrastructure
to efficiently make this determination.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2013-06-21 23:34:33 +04:00
# ifdef CONFIG_NO_HZ_FULL_SYSIDLE
long long dynticks_idle_nesting ;
/* irq/process nesting level from idle. */
atomic_t dynticks_idle ; /* Even value for idle, else odd. */
/* "Idle" excludes userspace execution. */
unsigned long dynticks_idle_jiffies ;
/* End of last non-NMI non-idle period. */
# endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2012-05-09 23:07:05 +04:00
# ifdef CONFIG_RCU_FAST_NO_HZ
2012-12-28 23:30:36 +04:00
bool all_lazy ; /* Are all CPU's CBs lazy? */
2012-05-09 23:07:05 +04:00
unsigned long nonlazy_posted ;
/* # times non-lazy CBs posted to CPU. */
unsigned long nonlazy_posted_snap ;
/* idle-period nonlazy_posted snapshot. */
2012-12-28 23:30:36 +04:00
unsigned long last_accelerate ;
/* Last jiffy CBs were accelerated. */
2013-08-26 08:20:47 +04:00
unsigned long last_advance_all ;
/* Last jiffy CBs were all advanced. */
2012-06-24 21:15:02 +04:00
int tick_nohz_enabled_snap ; /* Previously seen value from sysfs. */
2012-05-09 23:07:05 +04:00
# endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2009-08-23 00:56:45 +04:00
} ;
2011-03-30 04:48:28 +04:00
/*
 * RCU's kthread states for tracing.  RCU_KTHREAD_MAX equals the largest
 * state value defined here (RCU_KTHREAD_YIELDING).
 */
# define RCU_KTHREAD_STOPPED 0
# define RCU_KTHREAD_RUNNING 1
# define RCU_KTHREAD_WAITING 2
2011-04-07 03:01:16 +04:00
# define RCU_KTHREAD_OFFCPU 3
# define RCU_KTHREAD_YIELDING 4
# define RCU_KTHREAD_MAX 4
2011-03-30 04:48:28 +04:00
2009-08-23 00:56:45 +04:00
/*
* Definition for node within the RCU grace - period - detection hierarchy .
*/
struct rcu_node {
2010-02-23 04:05:02 +03:00
raw_spinlock_t lock ; /* Root rcu_node's lock protects some */
2009-09-23 20:50:42 +04:00
/* rcu_state fields as well as following. */
2010-02-23 04:05:01 +03:00
unsigned long gpnum ; /* Current grace period for this node. */
2009-08-28 02:00:12 +04:00
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
2010-02-23 04:05:01 +03:00
unsigned long completed ; /* Last GP completed for this node. */
2009-11-03 00:52:28 +03:00
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
2009-08-23 00:56:45 +04:00
unsigned long qsmask ; /* CPUs or groups that need to switch in */
/* order for current grace period to proceed.*/
2009-09-23 20:50:42 +04:00
/* In leaf rcu_node, each bit corresponds to */
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
2010-11-30 08:56:39 +03:00
unsigned long expmask ; /* Groups that have ->blkd_tasks */
2009-12-02 23:10:15 +03:00
/* elements that need to drain to allow the */
/* current expedited grace period to */
2014-09-22 22:00:48 +04:00
/* complete (only for PREEMPT_RCU). */
2009-08-23 00:56:45 +04:00
unsigned long qsmaskinit ;
2009-12-02 23:10:15 +03:00
/* Per-GP initial value for qsmask & expmask. */
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 08:52:37 +03:00
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext ;
/* Online CPUs for next grace period. */
2009-08-23 00:56:45 +04:00
unsigned long grpmask ; /* Mask to apply to parent qsmask. */
2009-09-23 20:50:42 +04:00
/* Only one bit will be set in this mask. */
2009-08-23 00:56:45 +04:00
int grplo ; /* lowest-numbered CPU or group here. */
int grphi ; /* highest-numbered CPU or group here. */
u8 grpnum ; /* CPU/group number for next level up. */
u8 level ; /* root is at level 0. */
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 08:52:37 +03:00
bool wait_blkd_tasks ; /* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
/* before propagating offline up the */
/* rcu_node tree? */
2009-08-23 00:56:45 +04:00
struct rcu_node * parent ;
2010-11-30 08:56:39 +03:00
struct list_head blkd_tasks ;
/* Tasks blocked in RCU read-side critical */
/* section. Tasks are placed at the head */
/* of this list and age towards the tail. */
struct list_head * gp_tasks ;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
/* is no such task. */
struct list_head * exp_tasks ;
/* Pointer to the first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there cannot be any such task. */
2011-02-07 23:47:15 +03:00
struct list_head * boost_tasks ;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
/* boosting is needed for this rcu_node */
/* structure. If there are no tasks */
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
2014-06-13 00:30:25 +04:00
struct rt_mutex boost_mtx ;
/* Used only for the priority-boosting */
/* side effect, not as a lock. */
2011-02-07 23:47:15 +03:00
unsigned long boost_time ;
/* When to start boosting (jiffies). */
struct task_struct * boost_kthread_task ;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
2011-03-30 04:48:28 +04:00
unsigned int boost_kthread_status ;
/* State of boost_kthread_task for tracing. */
2011-02-23 00:42:43 +03:00
unsigned long n_tasks_boosted ;
/* Total number of tasks boosted. */
unsigned long n_exp_boosts ;
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts ;
/* Number of tasks boosted for normal GP. */
unsigned long n_balk_blkd_tasks ;
/* Refused to boost: no blocked tasks. */
unsigned long n_balk_exp_gp_tasks ;
/* Refused to boost: nothing blocking GP. */
unsigned long n_balk_boost_tasks ;
/* Refused to boost: already boosting. */
unsigned long n_balk_notblocked ;
/* Refused to boost: RCU RS CS still running. */
unsigned long n_balk_notyet ;
/* Refused to boost: not yet time. */
unsigned long n_balk_nos ;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
2013-02-11 08:48:58 +04:00
# ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq [ 2 ] ;
/* Place for rcu_nocb_kthread() to wait GP. */
# endif /* #ifdef CONFIG_RCU_NOCB_CPU */
2012-12-31 01:06:35 +04:00
int need_future_gp [ 2 ] ;
/* Counts of upcoming no-CB GP requests. */
2012-06-27 04:00:35 +04:00
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp ;
2015-06-25 00:20:08 +03:00
struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp ;
2009-08-23 00:56:45 +04:00
} ____cacheline_internodealigned_in_smp ;
2009-09-28 18:46:33 +04:00
/*
 * Do a full breadth-first scan of the rcu_node structures for the
 * specified rcu_state structure.
 *
 * @rsp: pointer to the structure whose ->node[] array is scanned.
 * @rnp: pointer lvalue used as the cursor; it is assigned each element
 *       of ->node[] in index order, from 0 up to rcu_num_nodes - 1.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define rcu_for_each_node_breadth_first(rsp, rnp) \
	for ((rnp) = &(rsp)->node[0]; \
	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
2009-09-28 18:46:33 +04:00
2009-12-02 23:10:15 +03:00
/*
 * Do a breadth-first scan of the non-leaf rcu_node structures for the
 * specified rcu_state structure.  Note that if there is a singleton
 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
 *
 * @rsp: pointer to the structure whose ->node[] array is scanned.
 * @rnp: pointer lvalue used as the cursor; the scan starts at ->node[0]
 *       and stops before ->level[rcu_num_lvls - 1], the first leaf.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
	for ((rnp) = &(rsp)->node[0]; \
	     (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
2009-12-02 23:10:15 +03:00
/*
 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
 * structure.  Note that if there is a singleton rcu_node tree with but
 * one rcu_node structure, this loop -will- visit the rcu_node structure.
 * It is still a leaf node, even if it is also the root node.
 *
 * @rsp: pointer to the structure whose leaf nodes are scanned.
 * @rnp: pointer lvalue used as the cursor; the scan starts at
 *       ->level[rcu_num_lvls - 1] and stops at the end of ->node[],
 *       i.e. before ->node[rcu_num_nodes].
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define rcu_for_each_leaf_node(rsp, rnp) \
	for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
2009-09-28 18:46:33 +04:00
2009-08-23 00:56:45 +04:00
/*
 * Index values for the nxttail[] array in struct rcu_data.  Each tail
 * pointer also serves as the head of the sublist that follows it, and
 * RCU_NEXT_SIZE is the number of elements in the array.
 */
# define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
# define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
# define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
# define RCU_NEXT_TAIL 3
# define RCU_NEXT_SIZE 4
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
2010-02-23 04:05:01 +03:00
unsigned long completed ; /* Track rsp->completed gp number */
2009-08-23 00:56:45 +04:00
/* in order to detect GP end. */
2010-02-23 04:05:01 +03:00
unsigned long gpnum ; /* Highest gp number that this CPU */
2009-08-23 00:56:45 +04:00
/* is aware of having started. */
rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors
Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used
in places where it would be useful for it to apply to the normal RCU
flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the
case for workloads that aggressively overload the system, particularly
those that generate large numbers of RCU updates on systems running
NO_HZ_FULL CPUs. This commit therefore communicates quiescent states
from cond_resched_rcu_qs() to the normal RCU flavors.
Note that it is unfortunately necessary to leave the old ->passed_quiesce
mechanism in place to allow quiescent states that apply to only one
flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in
that case, but that is not so good for debugging of RCU internals.)
In addition, if one of the RCU flavor's grace period has stalled, this
will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight
quiescent state visible from other CPUs.
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu()
was used in preemptible code. ]
2014-12-14 07:32:04 +03:00
unsigned long rcu_qs_ctr_snap ; /* Snapshot of rcu_qs_ctr to check */
/* for rcu_all_qs() invocations. */
rcu: Simplify quiescent-state accounting
There is often a delay between the time that a CPU passes through a
quiescent state and the time that this quiescent state is reported to the
RCU core. It is quite possible that the grace period ended before the
quiescent state could be reported, for example, some other CPU might have
deduced that this CPU passed through dyntick-idle mode. It is critically
important that quiescent state be counted only against the grace period
that was in effect at the time that the quiescent state was detected.
Previously, this was handled by recording the number of the last grace
period to complete when passing through a quiescent state. The RCU
core then checks this number against the current value, and rejects
the quiescent state if there is a mismatch. However, one additional
possibility must be accounted for, namely that the quiescent state was
recorded after the prior grace period completed but before the current
grace period started. In this case, the RCU core must reject the
quiescent state, but the recorded number will match. This is handled
when the CPU becomes aware of a new grace period -- at that point,
it invalidates any prior quiescent state.
This works, but is a bit indirect. The new approach records the current
grace period, and the RCU core checks to see (1) that this is still the
current grace period and (2) that this grace period has not yet ended.
This approach simplifies reasoning about correctness, and this commit
changes over to this new approach.
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-06-27 11:17:43 +04:00
bool passed_quiesce ; /* User-mode/idle loop etc. */
2009-08-23 00:56:45 +04:00
bool qs_pending ; /* Core waits for quiesc state. */
bool beenonline ; /* CPU online at least once. */
rcu: Handle gpnum/completed wrap while dyntick idle
Subtle race conditions can result if a CPU stays in dyntick-idle mode
long enough for the ->gpnum and ->completed fields to wrap. For
example, consider the following sequence of events:
o CPU 1 encounters a quiescent state while waiting for grace period
5 to complete, but then enters dyntick-idle mode.
o While CPU 1 is in dyntick-idle mode, the grace-period counters
wrap around so that the grace period number is now 4.
o Just as CPU 1 exits dyntick-idle mode, grace period 4 completes
and grace period 5 begins.
o The quiescent state that CPU 1 passed through during the old
grace period 5 looks like it applies to the new grace period
5. Therefore, the new grace period 5 completes without CPU 1
having passed through a quiescent state.
This could clearly be a fatal surprise to any long-running RCU read-side
critical section that happened to be running on CPU 1 at the time. At one
time, this was not a problem, given that it takes significant time for
the grace-period counters to overflow even on 32-bit systems. However,
with the advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long
idle periods are now becoming quite feasible. It is therefore time to
close this race.
This commit therefore avoids this race condition by having the
quiescent-state forcing code detect when a CPU is falling too far
behind, and setting a new rcu_data field ->gpwrap when this happens.
Whenever this new ->gpwrap field is set, the CPU's ->gpnum and ->completed
fields are known to be untrustworthy, and can be ignored, along with
any associated quiescent states.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-12-09 07:26:55 +03:00
bool gpwrap ; /* Possible gpnum/completed wrap. */
2009-08-23 00:56:45 +04:00
struct rcu_node * mynode ; /* This CPU's leaf of hierarchy */
unsigned long grpmask ; /* Mask to apply to leaf qsmask. */
2012-01-17 01:29:10 +04:00
unsigned long ticks_this_gp ; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
2015-06-25 21:27:10 +03:00
struct cpu_stop_work exp_stop_work ;
/* Expedited grace-period control */
/* for CPU stopping. */
2009-08-23 00:56:45 +04:00
/* 2) batch handling */
/*
* If nxtlist is not NULL , it is partitioned as follows .
* Any of the partitions might be empty , in which case the
* pointer to that partition will be equal to the pointer for
* the following partition . When the list is empty , all of
2009-09-23 20:50:42 +04:00
* the nxttail elements point to the ->nxtlist pointer itself,
* which in that case is NULL.
2009-08-23 00:56:45 +04:00
*
* [nxtlist, *nxttail[RCU_DONE_TAIL]):
* Entries that batch # <= ->completed
* The grace period for these entries has completed, and
* the other grace-period-completed entries may be moved
* here temporarily in rcu_process_callbacks().
2009-09-23 20:50:42 +04:00
* [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
* Entries that batch # <= ->completed - 1: waiting for current GP
* [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
* Entries known to have arrived before current GP ended
* [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
* Entries that might have arrived after current GP ended
* Note that the value of *nxttail[RCU_NEXT_TAIL] will
* always be NULL, as this is the end of the list.
2009-08-23 00:56:45 +04:00
*/
struct rcu_head * nxtlist ;
struct rcu_head * * nxttail [ RCU_NEXT_SIZE ] ;
2012-12-04 01:52:00 +04:00
unsigned long nxtcompleted [ RCU_NEXT_SIZE ] ;
/* grace periods for sublists. */
2012-01-07 02:11:30 +04:00
long qlen_lazy ; /* # of lazy queued callbacks */
long qlen ; /* # of queued callbacks, incl lazy */
2009-10-14 21:15:55 +04:00
long qlen_last_fqs_check ;
/* qlen at last check for QS forcing */
2010-09-08 01:23:09 +04:00
unsigned long n_cbs_invoked ; /* count of RCU cbs invoked. */
2012-10-29 18:29:20 +04:00
unsigned long n_nocbs_invoked ; /* count of no-CBs RCU cbs invoked. */
2010-10-20 10:13:06 +04:00
unsigned long n_cbs_orphaned ; /* RCU cbs orphaned by dying CPU */
unsigned long n_cbs_adopted ; /* RCU cbs adopted from dying CPU */
2009-10-14 21:15:55 +04:00
unsigned long n_force_qs_snap ;
/* did other CPU force QS recently? */
2009-08-23 00:56:45 +04:00
long blimit ; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
struct rcu_dynticks * dynticks ; /* Shared per-CPU dynticks state. */
int dynticks_snap ; /* Per-GP tracking for dynticks. */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs ; /* Kicked due to dynticks idle. */
unsigned long offline_fqs ; /* Kicked due to being offline. */
2014-06-21 03:49:01 +04:00
unsigned long cond_resched_completed ;
/* Grace period that needs help */
/* from cond_resched(). */
2009-08-23 00:56:45 +04:00
/* 5) __rcu_pending() statistics. */
2010-02-23 04:05:01 +03:00
unsigned long n_rcu_pending ; /* rcu_pending() calls since boot. */
unsigned long n_rp_qs_pending ;
2010-04-15 04:39:26 +04:00
unsigned long n_rp_report_qs ;
2010-02-23 04:05:01 +03:00
unsigned long n_rp_cb_ready ;
unsigned long n_rp_cpu_needs_gp ;
unsigned long n_rp_gp_completed ;
unsigned long n_rp_gp_started ;
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat:
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.
One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:
1. Determine when it is likely that a relevant scheduler lock is held.
2. Defer the wakeup in such cases.
3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.
We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.
The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.
This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.
This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2013-10-05 01:33:34 +04:00
unsigned long n_rp_nocb_defer_wakeup ;
2010-02-23 04:05:01 +03:00
unsigned long n_rp_need_nothing ;
2009-08-23 00:56:45 +04:00
2015-06-30 03:06:39 +03:00
/* 6) _rcu_barrier(), OOM callbacks, and expediting. */
2012-05-29 10:57:46 +04:00
struct rcu_head barrier_head ;
2012-06-12 04:39:43 +04:00
# ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head ;
# endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2015-06-30 03:06:39 +03:00
struct mutex exp_funnel_mutex ;
2015-06-30 21:14:32 +03:00
bool exp_done ; /* Expedited QS for this CPU? */
2012-05-29 10:57:46 +04:00
2012-08-20 08:35:53 +04:00
/* 7) Callback offloading. */
# ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head * nocb_head ; /* CBs waiting for kthread. */
struct rcu_head * * nocb_tail ;
2014-12-18 23:31:27 +03:00
atomic_long_t nocb_q_count ; /* # CBs waiting for nocb */
atomic_long_t nocb_q_count_lazy ; /* invocation (all stages). */
rcu: Parallelize and economize NOCB kthread wakeups
An 80-CPU system with a context-switch-heavy workload can require so
many NOCB kthread wakeups that the RCU grace-period kthreads spend several
tens of percent of a CPU just awakening things. This clearly will not
scale well: If you add enough CPUs, the RCU grace-period kthreads would
get behind, increasing grace-period latency.
To avoid this problem, this commit divides the NOCB kthreads into leaders
and followers, where the grace-period kthreads awaken the leaders each of
whom in turn awakens its followers. By default, the number of groups of
kthreads is the square root of the number of CPUs, but this default may
be overridden using the rcutree.rcu_nocb_leader_stride boot parameter.
This reduces the number of wakeups done per grace period by the RCU
grace-period kthread by the square root of the number of CPUs, but of
course by shifting those wakeups to the leaders. In addition, because
the leaders do grace periods on behalf of their respective followers,
the number of wakeups of the followers decreases by up to a factor of two.
Instead of being awakened once when new callbacks arrive and again
at the end of the grace period, the followers are awakened only at
the end of the grace period.
For a numerical example, in a 4096-CPU system, the grace-period kthread
would awaken 64 leaders, each of which would awaken its 63 followers
at the end of the grace period. This compares favorably with the 79
wakeups for the grace-period kthread on an 80-CPU system.
Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-06-24 20:26:11 +04:00
struct rcu_head * nocb_follower_head ; /* CBs ready to invoke. */
struct rcu_head * * nocb_follower_tail ;
2012-08-20 08:35:53 +04:00
wait_queue_head_t nocb_wq ; /* For nocb kthreads to sleep on. */
struct task_struct * nocb_kthread ;
2014-07-30 01:50:47 +04:00
int nocb_defer_wakeup ; /* Defer wakeup of nocb_kthread. */
rcu: Parallelize and economize NOCB kthread wakeups
An 80-CPU system with a context-switch-heavy workload can require so
many NOCB kthread wakeups that the RCU grace-period kthreads spend several
tens of percent of a CPU just awakening things. This clearly will not
scale well: If you add enough CPUs, the RCU grace-period kthreads would
get behind, increasing grace-period latency.
To avoid this problem, this commit divides the NOCB kthreads into leaders
and followers, where the grace-period kthreads awaken the leaders each of
whom in turn awakens its followers. By default, the number of groups of
kthreads is the square root of the number of CPUs, but this default may
be overridden using the rcutree.rcu_nocb_leader_stride boot parameter.
This reduces the number of wakeups done per grace period by the RCU
grace-period kthread by the square root of the number of CPUs, but of
course by shifting those wakeups to the leaders. In addition, because
the leaders do grace periods on behalf of their respective followers,
the number of wakeups of the followers decreases by up to a factor of two.
Instead of being awakened once when new callbacks arrive and again
at the end of the grace period, the followers are awakened only at
the end of the grace period.
For a numerical example, in a 4096-CPU system, the grace-period kthread
would awaken 64 leaders, each of which would awaken its 63 followers
at the end of the grace period. This compares favorably with the 79
wakeups for the grace-period kthread on an 80-CPU system.
Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-06-24 20:26:11 +04:00
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head * nocb_gp_head ____cacheline_internodealigned_in_smp ;
/* CBs waiting for GP. */
struct rcu_head * * nocb_gp_tail ;
2014-08-28 00:43:40 +04:00
bool nocb_leader_sleep ; /* Is the nocb leader thread asleep? */
rcu: Parallelize and economize NOCB kthread wakeups
An 80-CPU system with a context-switch-heavy workload can require so
many NOCB kthread wakeups that the RCU grace-period kthreads spend several
tens of percent of a CPU just awakening things. This clearly will not
scale well: If you add enough CPUs, the RCU grace-period kthreads would
get behind, increasing grace-period latency.
To avoid this problem, this commit divides the NOCB kthreads into leaders
and followers, where the grace-period kthreads awaken the leaders each of
whom in turn awakens its followers. By default, the number of groups of
kthreads is the square root of the number of CPUs, but this default may
be overridden using the rcutree.rcu_nocb_leader_stride boot parameter.
This reduces the number of wakeups done per grace period by the RCU
grace-period kthread by the square root of the number of CPUs, but of
course by shifting those wakeups to the leaders. In addition, because
the leaders do grace periods on behalf of their respective followers,
the number of wakeups of the followers decreases by up to a factor of two.
Instead of being awakened once when new callbacks arrive and again
at the end of the grace period, the followers are awakened only at
the end of the grace period.
For a numerical example, in a 4096-CPU system, the grace-period kthread
would awaken 64 leaders, each of which would awaken its 63 followers
at the end of the grace period. This compares favorably with the 79
wakeups for the grace-period kthread on an 80-CPU system.
Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-06-24 20:26:11 +04:00
struct rcu_data * nocb_next_follower ;
/* Next follower in wakeup chain. */
/* The following fields are used by the follower, hence new cacheline. */
struct rcu_data * nocb_leader ____cacheline_internodealigned_in_smp ;
/* Leader CPU takes GP-end wakeups. */
2012-08-20 08:35:53 +04:00
# endif /* #ifdef CONFIG_RCU_NOCB_CPU */
2013-03-07 01:37:09 +04:00
/* 8) RCU CPU stall data. */
unsigned int softirq_snap ; /* Snapshot of softirq activity. */
2009-08-23 00:56:45 +04:00
int cpu ;
rcu: Add grace-period, quiescent-state, and call_rcu trace events
Add trace events to record grace-period start and end, quiescent states,
CPUs noticing grace-period start and end, grace-period initialization,
call_rcu() invocation, tasks blocking in RCU read-side critical sections,
tasks exiting those same critical sections, force_quiescent_state()
detection of dyntick-idle and offline CPUs, CPUs entering and leaving
dyntick-idle mode (except from NMIs), CPUs coming online and going
offline, and CPUs being kicked for staying in dyntick-idle mode for too
long (as in many weeks, even on 32-bit systems).
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
rcu: Add the rcu flavor to callback trace events
The earlier trace events for registering RCU callbacks and for invoking
them did not include the RCU flavor (rcu_bh, rcu_preempt, or rcu_sched).
This commit adds the RCU flavor to those trace events.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-06-25 17:36:56 +04:00
struct rcu_state * rsp ;
2009-08-23 00:56:45 +04:00
} ;
2011-09-11 08:54:08 +04:00
/* Values for fqs_state field in struct rcu_state. */
rcu: Fix long-grace-period race between forcing and initialization
Very long RCU read-side critical sections (50 milliseconds or
so) can cause a race between force_quiescent_state() and
rcu_start_gp() as follows on kernel builds with multi-level
rcu_node hierarchies:
1. CPU 0 calls force_quiescent_state(), sees that there is a
grace period in progress, and acquires ->fsqlock.
2. CPU 1 detects the end of the grace period, and so
cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum.
This operation is carried out under the root rnp->lock,
but CPU 0 has not yet acquired that lock. Note that
rsp->signaled is still RCU_SAVE_DYNTICK from the last
grace period.
3. CPU 1 calls rcu_start_gp(), but no one wants a new grace
period, so it drops the root rnp->lock and returns.
4. CPU 0 acquires the root rnp->lock and picks up rsp->completed
and rsp->signaled, then drops rnp->lock. It then enters the
RCU_SAVE_DYNTICK leg of the switch statement.
5. CPU 2 invokes call_rcu(), and now needs a new grace period.
It calls rcu_start_gp(), which acquires the root rnp->lock, sets
rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in
the RCU_SAVE_DYNTICK leg of the switch statement!) and starts
initializing the rcu_node hierarchy. If there are multiple
levels to the hierarchy, it will drop the root rnp->lock and
initialize the lower levels of the hierarchy.
6. CPU 0 notes that rsp->completed has not changed, which permits
both CPU 2 and CPU 0 to try updating it concurrently. If CPU 0's
update prevails, later calls to force_quiescent_state() can
count old quiescent states against the new grace period, which
can in turn result in premature ending of grace periods.
Not good.
This patch adds an RCU_GP_IDLE state for rsp->signaled that is
set initially at boot time and any time a grace period ends.
This prevents CPU 0 from getting into the workings of
force_quiescent_state() in step 4. Additional locking and
checks prevent the concurrent update of rsp->signaled in step 6.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1256742889199-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-28 18:14:49 +03:00
# define RCU_GP_IDLE 0 /* No grace period in progress. */
# define RCU_GP_INIT 1 /* Grace period being initialized. */
# define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
2010-01-05 02:09:07 +03:00
# define RCU_FORCE_QS 3 /* Need to force quiescent state. */
2009-08-23 00:56:45 +04:00
# define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
2014-07-30 01:50:47 +04:00
/* Values for nocb_defer_wakeup field in struct rcu_data. */
# define RCU_NOGP_WAKE_NOT 0
# define RCU_NOGP_WAKE 1
# define RCU_NOGP_WAKE_FORCE 2
2013-04-04 09:14:11 +04:00
# define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs */
/* and jiffies_till_next_fqs. */
2010-03-06 02:03:26 +03:00
2013-04-04 09:14:11 +04:00
# define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
/* delay between bouts of */
/* quiescent-state forcing. */
# define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
/* at least one scheduling clock */
/* irq before ratting on them. */
2009-08-23 00:56:45 +04:00
2011-05-21 03:06:29 +04:00
/*
 * rcu_wait(cond): put the current task to sleep until cond is true.
 *
 * Each pass of the loop first marks the task TASK_INTERRUPTIBLE, then
 * tests cond, and only calls schedule() if cond is still false.  The
 * state is set before the test rather than after it -- presumably so
 * that a wakeup arriving between the test and schedule() is not lost
 * (NOTE(review): confirm against the wakers of each call site).
 *
 * On exit the task state is put back to TASK_RUNNING.
 *
 * cond is expanded directly inside the if () and may be evaluated any
 * number of times, so it should be free of side effects.
 */
# define rcu_wait(cond) \
do { \
for ( ; ; ) { \
set_current_state ( TASK_INTERRUPTIBLE ) ; \
if ( cond ) \
break ; \
schedule ( ) ; \
} \
__set_current_state ( TASK_RUNNING ) ; \
} while ( 0 )
2009-08-23 00:56:45 +04:00
/*
* RCU global state , including node hierarchy . This hierarchy is
* represented in " heap " form in a dense array . The root ( first level )
* of the hierarchy is in - > node [ 0 ] ( referenced by - > level [ 0 ] ) , the second
* level in - > node [ 1 ] through - > node [ m ] ( - > node [ 1 ] referenced by - > level [ 1 ] ) ,
* and the third level in - > node [ m + 1 ] and following ( - > node [ m + 1 ] referenced
* by - > level [ 2 ] ) . The number of levels is determined by the number of
* CPUs and by CONFIG_RCU_FANOUT . Small systems will have a " hierarchy "
* consisting of a single rcu_node .
*/
struct rcu_state {
struct rcu_node node [ NUM_RCU_NODES ] ; /* Hierarchy. */
2015-07-09 16:34:23 +03:00
struct rcu_node * level [ RCU_NUM_LVLS + 1 ] ;
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
2014-06-21 03:49:01 +04:00
u8 flavor_mask ; /* bit in flavor mask. */
2010-06-28 12:25:04 +04:00
struct rcu_data __percpu * rda ; /* pointer of percu rcu_data. */
2012-05-29 10:26:01 +04:00
void ( * call ) ( struct rcu_head * head , /* call_rcu() flavor. */
void ( * func ) ( struct rcu_head * head ) ) ;
2009-08-23 00:56:45 +04:00
/* The following fields are guarded by the root rcu_node's lock. */
2011-09-11 08:54:08 +04:00
u8 fqs_state ____cacheline_internodealigned_in_smp ;
2009-08-23 00:56:45 +04:00
/* Force QS state. */
2011-06-16 02:47:09 +04:00
u8 boost ; /* Subject to priority boost. */
2010-02-23 04:05:01 +03:00
unsigned long gpnum ; /* Current gp number. */
unsigned long completed ; /* # of last completed gp. */
2012-06-19 05:36:08 +04:00
struct task_struct * gp_kthread ; /* Task for grace periods. */
wait_queue_head_t gp_wq ; /* Where GP task waits. */
2014-03-12 18:10:41 +04:00
short gp_flags ; /* Commands for GP task. */
short gp_state ; /* GP kthread sleep state. */
2009-09-23 20:50:42 +04:00
2009-12-02 23:10:15 +03:00
/* End of fields guarded by root rcu_node's lock. */
2009-09-23 20:50:42 +04:00
2012-10-08 21:54:03 +04:00
raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp ;
/* Protect following fields. */
2012-03-02 01:18:08 +04:00
struct rcu_head * orphan_nxtlist ; /* Orphaned callbacks that */
/* need a grace period. */
struct rcu_head * * orphan_nxttail ; /* Tail of above. */
struct rcu_head * orphan_donelist ; /* Orphaned callbacks that */
/* are ready to invoke. */
struct rcu_head * * orphan_donetail ; /* Tail of above. */
long qlen_lazy ; /* Number of lazy callbacks. */
long qlen ; /* Total number of callbacks. */
2012-10-08 21:54:03 +04:00
/* End of fields guarded by orphan_lock. */
2012-10-07 19:36:12 +04:00
2012-05-29 16:18:53 +04:00
struct mutex barrier_mutex ; /* Guards barrier fields. */
2012-05-29 11:34:56 +04:00
atomic_t barrier_cpu_count ; /* # CPUs waiting on. */
2012-05-29 14:03:37 +04:00
struct completion barrier_completion ; /* Wake at barrier end. */
2015-06-26 21:20:00 +03:00
unsigned long barrier_sequence ; /* ++ at start and end of */
2012-05-30 01:56:46 +04:00
/* _rcu_barrier(). */
2012-10-07 19:36:12 +04:00
/* End of fields guarded by barrier_mutex. */
2015-06-24 20:46:30 +03:00
unsigned long expedited_sequence ; /* Take a ticket. */
2015-07-12 02:24:45 +03:00
atomic_long_t expedited_workdone0 ; /* # done by others #0. */
2012-10-12 03:18:09 +04:00
atomic_long_t expedited_workdone1 ; /* # done by others #1. */
atomic_long_t expedited_workdone2 ; /* # done by others #2. */
2015-06-30 03:06:39 +03:00
atomic_long_t expedited_workdone3 ; /* # done by others #3. */
2012-10-12 03:18:09 +04:00
atomic_long_t expedited_normal ; /* # fallbacks to normal. */
2015-06-25 21:27:10 +03:00
atomic_t expedited_need_qs ; /* # CPUs left to check in. */
wait_queue_head_t expedited_wq ; /* Wait for check-ins. */
2012-10-12 02:24:03 +04:00
2009-08-23 00:56:45 +04:00
unsigned long jiffies_force_qs ; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs ; /* Number of calls to */
/* force_quiescent_state(). */
unsigned long n_force_qs_lh ; /* ~Number of calls leaving */
/* due to lock unavailable. */
unsigned long n_force_qs_ngp ; /* Number of calls leaving */
/* due to no GP active. */
unsigned long gp_start ; /* Time at which GP started, */
/* but in jiffies. */
2014-12-11 21:20:59 +03:00
unsigned long gp_activity ; /* Time of last GP kthread */
/* activity in jiffies. */
2009-08-23 00:56:45 +04:00
unsigned long jiffies_stall ; /* Time at which to check */
/* for CPU stalls. */
2013-09-24 00:57:18 +04:00
unsigned long jiffies_resched ; /* Time at which to resched */
/* a reluctant CPU. */
2014-12-08 20:57:48 +03:00
unsigned long n_force_qs_gpstart ; /* Snapshot of n_force_qs at */
/* GP start. */
2011-04-07 03:01:16 +04:00
unsigned long gp_max ; /* Maximum GP duration in */
/* jiffies. */
2013-07-13 00:50:28 +04:00
const char * name ; /* Name of structure. */
rcu: Distinguish "rcuo" kthreads by RCU flavor
Currently, the per-no-CBs-CPU kthreads are named "rcuo" followed by
the CPU number, for example, "rcuo". This is problematic given that
there are either two or three RCU flavors, each of which gets a per-CPU
kthread with exactly the same name. This commit therefore introduces
a one-letter abbreviation for each RCU flavor, namely 'b' for RCU-bh,
'p' for RCU-preempt, and 's' for RCU-sched. This abbreviation is used
to distinguish the "rcuo" kthreads, for example, for CPU 0 we would have
"rcuob/0", "rcuop/0", and "rcuos/0".
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
2012-12-03 20:16:28 +04:00
char abbr ; /* Abbreviated name. */
2012-06-12 22:01:13 +04:00
struct list_head flavors ; /* List of RCU flavors. */
2009-08-23 00:56:45 +04:00
} ;
2012-06-23 04:06:26 +04:00
/* Values for rcu_state structure's gp_flags field. */
# define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
# define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
2014-03-12 18:10:41 +04:00
/* Values for rcu_state structure's gp_state field. */
# define RCU_GP_WAIT_INIT 0 /* Initial state. */
# define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
2015-05-20 00:16:52 +03:00
# define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
# define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
2015-07-02 22:27:31 +03:00
# define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
2015-05-20 00:16:52 +03:00
# define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
# define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
2014-03-12 18:10:41 +04:00
2012-06-12 22:01:13 +04:00
extern struct list_head rcu_struct_flavors ;
2012-08-20 08:35:53 +04:00
/* Sequence through rcu_state structures for each RCU flavor. */
2012-06-12 22:01:13 +04:00
/*
 * for_each_rcu_flavor(rsp): loop header that walks the global
 * rcu_struct_flavors list, pointing rsp at each entry in turn.
 * Entries are linked through their ->flavors list_head member.
 *
 * This expands to a bare list_for_each_entry() header, so it must be
 * followed by the statement or block to run for each entry.
 */
# define for_each_rcu_flavor(rsp) \
list_for_each_entry ( ( rsp ) , & rcu_struct_flavors , flavors )
2009-03-25 18:42:24 +03:00
/*
* RCU implementation internal declarations :
*/
2009-08-23 00:56:46 +04:00
extern struct rcu_state rcu_sched_state ;
2009-03-25 18:42:24 +03:00
extern struct rcu_state rcu_bh_state ;
2014-09-22 22:00:48 +04:00
# ifdef CONFIG_PREEMPT_RCU
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 00:56:52 +04:00
extern struct rcu_state rcu_preempt_state ;
2014-09-22 22:00:48 +04:00
# endif /* #ifdef CONFIG_PREEMPT_RCU */
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 00:56:52 +04:00
2011-06-21 12:59:33 +04:00
# ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU ( unsigned int , rcu_cpu_kthread_status ) ;
DECLARE_PER_CPU ( int , rcu_cpu_kthread_cpu ) ;
DECLARE_PER_CPU ( unsigned int , rcu_cpu_kthread_loops ) ;
DECLARE_PER_CPU ( char , rcu_cpu_has_work ) ;
# endif /* #ifdef CONFIG_RCU_BOOST */
2010-01-15 03:10:58 +03:00
# ifndef RCU_TREE_NONCORE
2009-08-23 00:56:45 +04:00
2009-09-23 20:50:43 +04:00
/* Forward declarations for rcutree_plugin.h */
2009-11-11 00:37:19 +03:00
static void rcu_bootup_announce ( void ) ;
2014-10-21 23:50:04 +04:00
static void rcu_preempt_note_context_switch ( void ) ;
2011-02-07 23:47:15 +03:00
static int rcu_preempt_blocked_readers_cgp ( struct rcu_node * rnp ) ;
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->blocked_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_unlock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-22 19:53:48 +03:00
# ifdef CONFIG_HOTPLUG_CPU
2014-10-31 21:22:37 +03:00
static bool rcu_preempt_has_tasks ( struct rcu_node * rnp ) ;
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure passes through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs goes offline. (The events in steps
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->blocked_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_unlock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-22 19:53:48 +03:00
# endif /* #ifdef CONFIG_HOTPLUG_CPU */
2010-02-23 04:05:05 +03:00
static void rcu_print_detail_task_stall ( struct rcu_state * rsp ) ;
2011-08-14 00:31:47 +04:00
static int rcu_print_task_stall ( struct rcu_node * rnp ) ;
2009-09-23 20:50:43 +04:00
static void rcu_preempt_check_blocked_tasks ( struct rcu_node * rnp ) ;
2014-10-21 19:12:00 +04:00
static void rcu_preempt_check_callbacks ( void ) ;
2009-09-23 20:50:43 +04:00
/*
 * Declared without "static", unlike the neighbouring prototypes, so this
 * one has external linkage.  It takes an rcu_head pointer plus a callback
 * that itself receives an rcu_head pointer.  The definition is not visible
 * in this part of the file.
 */
void call_rcu ( struct rcu_head * head , void ( * func ) ( struct rcu_head * rcu ) ) ;
static void __init __rcu_init_preempt ( void ) ;
2011-05-05 08:43:49 +04:00
static void rcu_initiate_boost ( struct rcu_node * rnp , unsigned long flags ) ;
2011-06-16 02:47:09 +04:00
static void rcu_preempt_boost_start_gp ( struct rcu_node * rnp ) ;
static void invoke_rcu_callbacks_kthread ( void ) ;
2011-11-30 03:57:13 +04:00
static bool rcu_is_callbacks_kthread ( void ) ;
2011-06-16 02:47:09 +04:00
# ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks ( void ) ;
2013-06-19 22:52:21 +04:00
static int rcu_spawn_one_boost_kthread ( struct rcu_state * rsp ,
2012-07-16 14:42:35 +04:00
struct rcu_node * rnp ) ;
2011-06-16 02:47:09 +04:00
# endif /* #ifdef CONFIG_RCU_BOOST */
2014-07-13 23:00:53 +04:00
static void __init rcu_spawn_boost_kthreads ( void ) ;
2013-06-19 22:52:21 +04:00
static void rcu_prepare_kthreads ( int cpu ) ;
2014-10-23 02:07:37 +04:00
static void rcu_cleanup_after_idle ( void ) ;
2014-10-23 02:03:43 +04:00
static void rcu_prepare_for_idle ( void ) ;
2012-02-28 23:02:21 +04:00
static void rcu_idle_count_callbacks_posted ( void ) ;
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 08:52:37 +03:00
/*
 * Unconditional forward declaration.  An identical prototype also appears
 * earlier in this file under #ifdef CONFIG_HOTPLUG_CPU; with this one in
 * place, that conditional copy is redundant.
 */
static bool rcu_preempt_has_tasks ( struct rcu_node * rnp ) ;
2012-01-17 01:29:10 +04:00
/*
 * Forward declarations for the stall-reporting helpers named
 * print_cpu_stall_info*() and *_cpu_stall_ticks().  All have internal
 * linkage; their definitions are not visible in this part of the file.
 */
static void print_cpu_stall_info_begin ( void ) ;
static void print_cpu_stall_info ( struct rcu_state * rsp , int cpu ) ;
static void print_cpu_stall_info_end ( void ) ;
static void zero_cpu_stall_ticks ( struct rcu_data * rdp ) ;
static void increment_cpu_stall_ticks ( void ) ;
rcu: Make rcu_barrier() understand about missing rcuo kthreads
Commit 35ce7f29a44a (rcu: Create rcuo kthreads only for onlined CPUs)
avoids creating rcuo kthreads for CPUs that never come online. This
fixes a bug in many instances of firmware: Instead of lying about their
age, these systems lie about the number of CPUs that they have.
Before commit 35ce7f29a44a, this could result in huge numbers of useless
rcuo kthreads being created.
It appears that experience indicates that I should have told the
people suffering from this problem to fix their broken firmware, but
I instead produced what turned out to be a partial fix. The missing
piece supplied by this commit makes sure that rcu_barrier() knows not to
post callbacks for no-CBs CPUs that have not yet come online, because
otherwise rcu_barrier() will hang on systems having firmware that lies
about the number of CPUs.
It is tempting to simply have rcu_barrier() refuse to post a callback on
any no-CBs CPU that does not have an rcuo kthread. This unfortunately
does not work because rcu_barrier() is required to wait for all pending
callbacks. It is therefore required to wait even for those callbacks
that cannot possibly be invoked. Even if doing so hangs the system.
Given that posting a callback to a no-CBs CPU that does not yet have an
rcuo kthread can hang rcu_barrier(), it is tempting to report an error
in this case. Unfortunately, this will result in false positives at
boot time, when it is perfectly legal to post callbacks to the boot CPU
before the scheduler has started, in other words, before it is legal
to invoke rcu_barrier().
So this commit instead has rcu_barrier() avoid posting callbacks to
CPUs having neither rcuo kthread nor pending callbacks, and has it
complain bitterly if it finds CPUs having no rcuo kthread but some
pending callbacks. And when rcu_barrier() does find CPUs having no rcuo
kthread but pending callbacks, as noted earlier, it has no choice but
to hang indefinitely.
Reported-by: Yanko Kaneti <yaneti@declera.com>
Reported-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Reported-by: Meelis Roos <mroos@linux.ee>
Reported-by: Eric B Munson <emunson@akamai.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Eric B Munson <emunson@akamai.com>
Tested-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Tested-by: Yanko Kaneti <yaneti@declera.com>
Tested-by: Kevin Fenzi <kevin@scrye.com>
Tested-by: Meelis Roos <mroos@linux.ee>
2014-10-27 19:15:54 +03:00
static bool rcu_nocb_cpu_needs_barrier ( struct rcu_state * rsp , int cpu ) ;
2013-02-11 08:48:58 +04:00
static void rcu_nocb_gp_set ( struct rcu_node * rnp , int nrq ) ;
2012-12-31 03:21:01 +04:00
static void rcu_nocb_gp_cleanup ( struct rcu_state * rsp , struct rcu_node * rnp ) ;
2013-02-11 08:48:58 +04:00
static void rcu_init_one_nocb ( struct rcu_node * rnp ) ;
2012-08-20 08:35:53 +04:00
static bool __call_rcu_nocb ( struct rcu_data * rdp , struct rcu_head * rhp ,
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat:
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.
One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:
1. Determine when it is likely that a relevant scheduler lock is held.
2. Defer the wakeup in such cases.
3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.
We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.
The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.
This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.
This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2013-10-05 01:33:34 +04:00
bool lazy , unsigned long flags ) ;
2012-08-20 08:35:53 +04:00
static bool rcu_nocb_adopt_orphan_cbs ( struct rcu_state * rsp ,
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat:
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.
One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:
1. Determine when it is likely that a relevant scheduler lock is held.
2. Defer the wakeup in such cases.
3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.
We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.
The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.
This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.
This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2013-10-05 01:33:34 +04:00
struct rcu_data * rdp ,
unsigned long flags ) ;
2014-07-30 01:50:47 +04:00
static int rcu_nocb_need_deferred_wakeup ( struct rcu_data * rdp ) ;
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat:
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.
One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:
1. Determine when it is likely that a relevant scheduler lock is held.
2. Defer the wakeup in such cases.
3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.
We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.
The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.
This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.
This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2013-10-05 01:33:34 +04:00
static void do_nocb_deferred_wakeup ( struct rcu_data * rdp ) ;
2012-08-20 08:35:53 +04:00
static void rcu_boot_init_nocb_percpu_data ( struct rcu_data * rdp ) ;
2014-07-11 22:30:24 +04:00
static void rcu_spawn_all_nocb_kthreads ( int cpu ) ;
static void __init rcu_spawn_nocb_kthreads ( void ) ;
# ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads ( struct rcu_state * rsp ) ;
# endif /* #ifdef CONFIG_RCU_NOCB_CPU */
2014-06-21 03:49:01 +04:00
static void __maybe_unused rcu_kick_nohz_cpu ( int cpu ) ;
2013-01-08 01:37:42 +04:00
static bool init_nocb_callback_list ( struct rcu_data * rdp ) ;
2014-09-03 01:13:44 +04:00
static void rcu_sysidle_enter ( int irq ) ;
static void rcu_sysidle_exit ( int irq ) ;
2013-06-22 03:37:22 +04:00
static void rcu_sysidle_check_cpu ( struct rcu_data * rdp , bool * isidle ,
unsigned long * maxj ) ;
static bool is_sysidle_rcu_state ( struct rcu_state * rsp ) ;
static void rcu_sysidle_report_gp ( struct rcu_state * rsp , int isidle ,
unsigned long maxj ) ;
2013-06-22 04:10:40 +04:00
static void rcu_bind_gp_kthread ( void ) ;
nohz_full: Add rcu_dyntick data for scalable detection of all-idle state
This commit adds fields to the rcu_dyntick structure that are used to
detect idle CPUs. These new fields differ from the existing ones in
that the existing ones consider a CPU executing in user mode to be idle,
where the new ones consider CPUs executing in user mode to be busy.
The handling of these new fields is otherwise quite similar to that for
the existing fields. This commit also adds the initialization required
for these fields.
So, why is usermode execution treated differently, with RCU considering
it a quiescent state equivalent to idle, while in contrast the new
full-system idle state detection considers usermode execution to be
non-idle?
It turns out that although one of RCU's quiescent states is usermode
execution, it is not a full-system idle state. This is because the
purpose of the full-system idle state is not RCU, but rather determining
when accurate timekeeping can safely be disabled. Whenever accurate
timekeeping is required in a CONFIG_NO_HZ_FULL kernel, at least one
CPU must keep the scheduling-clock tick going. If even one CPU is
executing in user mode, accurate timekeeping is required, particularly for
architectures where gettimeofday() and friends do not enter the kernel.
Only when all CPUs are really and truly idle can accurate timekeeping be
disabled, allowing all CPUs to turn off the scheduling clock interrupt,
thus greatly improving energy efficiency.
This naturally raises the question "Why is this code in RCU rather than in
timekeeping?", and the answer is that RCU has the data and infrastructure
to efficiently make this determination.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2013-06-21 23:34:33 +04:00
static void rcu_sysidle_init_percpu_data ( struct rcu_dynticks * rdtp ) ;
2013-11-08 21:03:10 +04:00
static bool rcu_nohz_full_cpu ( struct rcu_state * rsp ) ;
2014-08-05 04:43:50 +04:00
static void rcu_dynticks_task_enter ( void ) ;
static void rcu_dynticks_task_exit ( void ) ;
2009-09-23 20:50:43 +04:00
2010-01-15 03:10:58 +03:00
# endif /* #ifndef RCU_TREE_NONCORE */
2012-08-20 08:35:53 +04:00
# ifdef CONFIG_RCU_TRACE
2014-12-18 23:31:27 +03:00
/* Read out queue lengths for tracing. */
2012-08-20 08:35:53 +04:00
/*
 * Report the queue lengths recorded in @rdp through two output pointers.
 *
 * @rdp: rcu_data structure whose counters are read.
 * @ql:  receives the value of rdp->nocb_q_count.
 * @qll: receives the value of rdp->nocb_q_count_lazy.
 *
 * When CONFIG_RCU_NOCB_CPU is defined, both values are fetched with
 * atomic_long_read().  Otherwise those fields are not consulted and
 * both outputs are set to zero.
 */
static inline void rcu_nocb_q_lengths ( struct rcu_data * rdp , long * ql , long * qll )
{
2014-12-18 23:31:27 +03:00
# ifdef CONFIG_RCU_NOCB_CPU
* ql = atomic_long_read ( & rdp - > nocb_q_count ) ;
* qll = atomic_long_read ( & rdp - > nocb_q_count_lazy ) ;
2012-08-20 08:35:53 +04:00
# else /* #ifdef CONFIG_RCU_NOCB_CPU */
/* CONFIG_RCU_NOCB_CPU not set: report zero for both lengths. */
* ql = 0 ;
* qll = 0 ;
# endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2014-12-18 23:31:27 +03:00
}
2012-08-20 08:35:53 +04:00
# endif /* #ifdef CONFIG_RCU_TRACE */
2015-07-15 04:35:23 +03:00
/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * With CONFIG_PPC defined this expands to smp_mb(); in every other
 * configuration it expands to an empty do { } while (0) statement.
 */
# ifdef CONFIG_PPC
# define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
# else /* #ifdef CONFIG_PPC */
# define smp_mb__after_unlock_lock() do { } while (0)
# endif /* #else #ifdef CONFIG_PPC */