rcu: Remove nohz_full full-system-idle state machine
The NO_HZ_FULL_SYSIDLE full-system-idle capability was added in 2013 by
commit 0edd1b1784 ("nohz_full: Add full-system-idle state machine"), but
has not been used.  This commit therefore removes it.  If it turns out
to be needed later, this commit can always be reverted.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
commit fe5ac724d8
parent f7a10a9750
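For context, the removed capability exposed exactly two hooks to adaptive-ticks
code, rcu_sys_is_idle() and rcu_sysidle_force_exit() (see the declarations
removed from the header hunk below), and nothing in the tree ever called them.
A minimal sketch of the kind of caller the API anticipated, assuming the
timekeeping-CPU polling model described in the removed comments; the helper
name timekeeping_can_stop_tick() is hypothetical, invented here for
illustration:

	/*
	 * Hypothetical caller sketch only -- no such user ever existed,
	 * which is precisely why this commit removes the machinery.  The
	 * intent was that the timekeeping CPU (tick_do_timer_cpu) could
	 * poll rcu_sys_is_idle() with interrupts disabled and stop its
	 * own scheduling-clock tick once all other CPUs had been idle
	 * long enough, invoking rcu_sysidle_force_exit() when it next
	 * did more than take a scheduling-clock interrupt.
	 */
	static bool timekeeping_can_stop_tick(void)	/* hypothetical */
	{
		if (!rcu_sys_is_idle())
			return false;	/* Some non-timekeeping CPU is busy. */
		return true;		/* Full-system idle: tick may stop. */
	}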
@@ -2520,11 +2520,7 @@ It is similarly socially unacceptable to interrupt an
 <tt>nohz_full</tt> CPU running in userspace.
 RCU must therefore track <tt>nohz_full</tt> userspace
 execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
+RCU must therefore be able to sample state at two points in
 time, and be able to determine whether or not some other CPU spent
 any time idle and/or executing in userspace.
 
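The two-point sampling that this documentation describes survives in
dyntick_save_progress_counter() below: a per-CPU counter is even while the
CPU is in an extended quiescent state (EQS) and odd otherwise, so comparing
two snapshots reveals whether the CPU was idle, or passed through idle, in
between.  A standalone sketch of that idea, assuming only the even/odd
convention (the removed dynticks_idle field below documents it as "Even
value for idle, else odd."):

	#include <stdbool.h>

	/*
	 * Illustration only: the even-while-idle/odd-while-busy counter
	 * convention.  snap1 and snap2 are two samples of one CPU's
	 * dynticks counter taken at different times.
	 */
	static bool cpu_spent_time_in_eqs(unsigned int snap1, unsigned int snap2)
	{
		if (!(snap1 & 1))	/* Even first sample: in EQS then. */
			return true;
		return snap1 != snap2;	/* Counter moved: passed through EQS. */
	}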
@@ -854,15 +854,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-/* Only for use by adaptive-ticks code. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-bool rcu_sys_is_idle(void);
-void rcu_sysidle_force_exit(void);
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static inline bool rcu_sys_is_idle(void) { return false; }
-static inline void rcu_sysidle_force_exit(void) { }
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
 
 /*
  * Place this after a lock-acquisition primitive to guarantee that
@@ -270,10 +270,6 @@ void rcu_bh_qs(void)
 static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
-	.dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
 
 /*
@@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
 
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
-			 int (*f)(struct rcu_data *rsp, bool *isidle,
-				  unsigned long *maxj),
-			 bool *isidle, unsigned long *maxj);
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
 static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
@@ -854,7 +847,6 @@ void rcu_idle_enter(void)
 
 	local_irq_save(flags);
 	rcu_eqs_enter(false);
-	rcu_sysidle_enter(0);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -904,7 +896,6 @@ void rcu_irq_exit(void)
 		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
 		rdtp->dynticks_nesting--;
 	}
-	rcu_sysidle_enter(1);
 }
 
 /*
@@ -986,7 +977,6 @@ void rcu_idle_exit(void)
 
 	local_irq_save(flags);
 	rcu_eqs_exit(false);
-	rcu_sysidle_exit(0);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1038,7 +1028,6 @@ void rcu_irq_enter(void)
 		trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
 	else
 		rcu_eqs_exit_common(oldval, true);
-	rcu_sysidle_exit(1);
 }
 
 /*
@@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
  * credit them with an implicit quiescent state.  Return 1 if this CPU
  * is in dynticks idle mode, which is an extended quiescent state.
  */
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
-					 bool *isidle, unsigned long *maxj)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
 	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
-	rcu_sysidle_check_cpu(rdp, isidle, maxj);
 	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
  * idle state since the last call to dyntick_save_progress_counter()
  * for this same CPU, or by virtue of having been offline.
  */
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
-				    bool *isidle, unsigned long *maxj)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
 	unsigned long jtsq;
 	bool *rnhqp;
@@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
  */
 static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
 {
-	bool isidle = false;
-	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
 	if (first_time) {
 		/* Collect dyntick-idle snapshots. */
-		if (is_sysidle_rcu_state(rsp)) {
-			isidle = true;
-			maxj = jiffies - ULONG_MAX / 4;
-		}
-		force_qs_rnp(rsp, dyntick_save_progress_counter,
-			     &isidle, &maxj);
-		rcu_sysidle_report_gp(rsp, isidle, maxj);
+		force_qs_rnp(rsp, dyntick_save_progress_counter);
 	} else {
 		/* Handle dyntick-idle and offline CPUs. */
-		isidle = true;
-		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+		force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
 	}
 	/* Clear flag to prevent immediate re-entry. */
 	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
  *
  * The caller must have suppressed start of new grace periods.
  */
-static void force_qs_rnp(struct rcu_state *rsp,
-			 int (*f)(struct rcu_data *rsp, bool *isidle,
-				  unsigned long *maxj),
-			 bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
 {
 	int cpu;
 	unsigned long flags;
@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		for_each_leaf_node_possible_cpu(rnp, cpu) {
 			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
 			if ((rnp->qsmask & bit) != 0) {
-				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+				if (f(per_cpu_ptr(rsp->rda, cpu)))
 					mask |= bit;
 			}
 		}
@@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	    !init_nocb_callback_list(rdp))
 		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
 	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-	rcu_sysidle_init_percpu_data(rdp->dynticks);
 	rcu_dynticks_eqs_online();
 	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */
 
@@ -45,14 +45,6 @@ struct rcu_dynticks {
 	bool rcu_need_heavy_qs;		/* GP old, need heavy quiescent state. */
 	unsigned long rcu_qs_ctr;	/* Light universal quiescent state ctr. */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	long long dynticks_idle_nesting;
-					/* irq/process nesting level from idle. */
-	atomic_t dynticks_idle;		/* Even value for idle, else odd. */
-					/* "Idle" excludes userspace execution. */
-	unsigned long dynticks_idle_jiffies;
-					/* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	bool all_lazy;			/* Are all CPU's CBs lazy? */
 	unsigned long nonlazy_posted;
@@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj);
 static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
 static void rcu_dynticks_task_enter(void);
 static void rcu_dynticks_task_exit(void);
@@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
 }
 
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state;		/* Current system-idle state. */
-#define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL	3	/* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED	4	/* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle.  Note that
- * usermode execution does -not- count as idle here!  After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods.  The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
-	unsigned long j;
-	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/* Adjust nesting, check for fully idle. */
-	if (irq) {
-		rdtp->dynticks_idle_nesting--;
-		WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
-		if (rdtp->dynticks_idle_nesting != 0)
-			return;	/* Still not fully idle. */
-	} else {
-		if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
-		    DYNTICK_TASK_NEST_VALUE) {
-			rdtp->dynticks_idle_nesting = 0;
-		} else {
-			rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
-			WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
-			return;	/* Still not fully idle. */
-		}
-	}
-
-	/* Record start of fully idle period. */
-	j = jiffies;
-	WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
-	smp_mb__before_atomic();
-	atomic_inc(&rdtp->dynticks_idle);
-	smp_mb__after_atomic();
-	WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state.  This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
-	int oldstate = READ_ONCE(full_sysidle_state);
-	int newoldstate;
-
-	/*
-	 * Each pass through the following loop attempts to exit full
-	 * system-idle state.  If contention proves to be a problem,
-	 * a trylock-based contention tree could be used here.
-	 */
-	while (oldstate > RCU_SYSIDLE_SHORT) {
-		newoldstate = cmpxchg(&full_sysidle_state,
-				      oldstate, RCU_SYSIDLE_NOT);
-		if (oldstate == newoldstate &&
-		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
-			rcu_kick_nohz_cpu(tick_do_timer_cpu);
-			return; /* We cleared it, done! */
-		}
-		oldstate = newoldstate;
-	}
-	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle.  Note that
- * usermode execution does -not- count as idle here!  The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
-	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/* Adjust nesting, check for already non-idle. */
-	if (irq) {
-		rdtp->dynticks_idle_nesting++;
-		WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
-		if (rdtp->dynticks_idle_nesting != 1)
-			return; /* Already non-idle. */
-	} else {
-		/*
-		 * Allow for irq misnesting.  Yes, it really is possible
-		 * to enter an irq handler then never leave it, and maybe
-		 * also vice versa.  Handle both possibilities.
-		 */
-		if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
-			rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
-			WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
-			return; /* Already non-idle. */
-		} else {
-			rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
-		}
-	}
-
-	/* Record end of idle period. */
-	smp_mb__before_atomic();
-	atomic_inc(&rdtp->dynticks_idle);
-	smp_mb__after_atomic();
-	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
-	/*
-	 * If we are the timekeeping CPU, we are permitted to be non-idle
-	 * during a system-idle state.  This must be the case, because
-	 * the timekeeping CPU has to take scheduling-clock interrupts
-	 * during the time that the system is transitioning to full
-	 * system-idle state.  This means that the timekeeping CPU must
-	 * invoke rcu_sysidle_force_exit() directly if it does anything
-	 * more than take a scheduling-clock interrupt.
-	 */
-	if (smp_processor_id() == tick_do_timer_cpu)
-		return;
-
-	/* Update system-idle state: We are clearly no longer fully idle! */
-	rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle.  Note that usermode execution
- * does not count as idle.  The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj)
-{
-	int cur;
-	unsigned long j;
-	struct rcu_dynticks *rdtp = rdp->dynticks;
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, don't check system-wide idleness. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/*
-	 * If some other CPU has already reported non-idle, if this is
-	 * not the flavor of RCU that tracks sysidle state, or if this
-	 * is an offline or the timekeeping CPU, nothing to do.
-	 */
-	if (!*isidle || rdp->rsp != rcu_state_p ||
-	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
-		return;
-	/* Verify affinity of current kthread. */
-	WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
-	/* Pick up current idle and NMI-nesting counter and check. */
-	cur = atomic_read(&rdtp->dynticks_idle);
-	if (cur & 0x1) {
-		*isidle = false; /* We are not idle! */
-		return;
-	}
-	smp_mb(); /* Read counters before timestamps. */
-
-	/* Pick up timestamps. */
-	j = READ_ONCE(rdtp->dynticks_idle_jiffies);
-	/* If this CPU entered idle more recently, update maxj timestamp. */
-	if (ULONG_CMP_LT(*maxj, j))
-		*maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
-	return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate.  The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
-	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
-		return 0;
-	return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state.  This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
-	/* Check the current state. */
-	switch (READ_ONCE(full_sysidle_state)) {
-	case RCU_SYSIDLE_NOT:
-
-		/* First time all are idle, so note a short idle period. */
-		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
-		break;
-
-	case RCU_SYSIDLE_SHORT:
-
-		/*
-		 * Idle for a bit, time to advance to next state?
-		 * cmpxchg failure means race with non-idle, let them win.
-		 */
-		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
-			(void)cmpxchg(&full_sysidle_state,
-				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
-		break;
-
-	case RCU_SYSIDLE_LONG:
-
-		/*
-		 * Do an additional check pass before advancing to full.
-		 * cmpxchg failure means race with non-idle, let them win.
-		 */
-		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
-			(void)cmpxchg(&full_sysidle_state,
-				      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
-		break;
-
-	default:
-		break;
-	}
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
-	smp_mb();
-	if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
-			       unsigned long maxj, bool gpkt)
-{
-	if (rsp != rcu_state_p)
-		return;  /* Wrong flavor, ignore. */
-	if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
-		return;  /* Running state machine from timekeeping CPU. */
-	if (isidle)
-		rcu_sysidle(maxj);    /* More idle! */
-	else
-		rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj)
-{
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
-	struct rcu_head rh;
-	int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
-	struct rcu_sysidle_head *rshp;
-
-	/*
-	 * The following memory barrier is needed to replace the
-	 * memory barriers that would normally be in the memory
-	 * allocator.
-	 */
-	smp_mb();  /* grace period precedes setting inuse. */
-
-	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-	WRITE_ONCE(rshp->inuse, 0);
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts.  This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
-	static struct rcu_sysidle_head rsh;
-	int rss = READ_ONCE(full_sysidle_state);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!");
-
-	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
-		return false;
-
-	/* Handle small-system case by doing a full scan of CPUs. */
-	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
-		int oldrss = rss - 1;
-
-		/*
-		 * One pass to advance to each state up to _FULL.
-		 * Give up if any pass fails to advance the state.
-		 */
-		while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
-			int cpu;
-			bool isidle = true;
-			unsigned long maxj = jiffies - ULONG_MAX / 4;
-			struct rcu_data *rdp;
-
-			/* Scan all the CPUs looking for nonidle CPUs. */
-			for_each_possible_cpu(cpu) {
-				rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
-				rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
-				if (!isidle)
-					break;
-			}
-			rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
-			oldrss = rss;
-			rss = READ_ONCE(full_sysidle_state);
-		}
-	}
-
-	/* If this is the first observation of an idle period, record it. */
-	if (rss == RCU_SYSIDLE_FULL) {
-		rss = cmpxchg(&full_sysidle_state,
-			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
-		return rss == RCU_SYSIDLE_FULL;
-	}
-
-	smp_mb(); /* ensure rss load happens before later caller actions. */
-
-	/* If already fully idle, tell the caller (in case of races). */
-	if (rss == RCU_SYSIDLE_FULL_NOTED)
-		return true;
-
-	/*
-	 * If we aren't there yet, and a grace period is not in flight,
-	 * initiate a grace period.  Either way, tell the caller that
-	 * we are not there yet.  We use an xchg() rather than an assignment
-	 * to make up for the memory barriers that would otherwise be
-	 * provided by the memory allocator.
-	 */
-	if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
-	    !rcu_gp_in_progress(rcu_state_p) &&
-	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
-		call_rcu(&rsh.rh, rcu_sysidle_cb);
-	return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-	rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
-	return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
 /*
  * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
  * grace-period kthread will do force_quiescent_state() processing?
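The state machine deleted above is small enough to model in isolation.  A
user-space sketch, for illustration only, of the NOT -> SHORT -> LONG ->
FULL -> FULL_NOTED progression and its cmpxchg-mediated races; the
rcu_sysidle_delay() pacing and the FULL -> FULL_NOTED step taken by
rcu_sys_is_idle() are folded into one advance function here, and the reset
is unconditional, a simplification of rcu_sysidle_cancel() above:

	#include <stdatomic.h>

	enum { SYSIDLE_NOT, SYSIDLE_SHORT, SYSIDLE_LONG,
	       SYSIDLE_FULL, SYSIDLE_FULL_NOTED };

	static _Atomic int sysidle_state = SYSIDLE_NOT;

	/* A scan found every non-timekeeping CPU idle: try to advance. */
	static void scan_saw_all_idle(void)
	{
		int s = atomic_load(&sysidle_state);

		/* Losing the race to a newly non-idle CPU is fine: it wins. */
		if (s < SYSIDLE_FULL_NOTED)
			atomic_compare_exchange_strong(&sysidle_state, &s, s + 1);
	}

	/* A scan found a non-idle, non-timekeeping CPU: reset the machine. */
	static void scan_saw_nonidle(void)
	{
		atomic_store(&sysidle_state, SYSIDLE_NOT);
	}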
@@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void)
 
 	if (!tick_nohz_full_enabled())
 		return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	cpu = tick_do_timer_cpu;
-	if (cpu >= 0 && cpu < nr_cpu_ids)
-		set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 	housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }
 
 /* Record the current task on dyntick-idle entry. */
@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
 	 Note the boot CPU will still be kept outside the range to
 	 handle the timekeeping duty.
 
-config NO_HZ_FULL_SYSIDLE
-	bool "Detect full-system idle state for full dynticks system"
-	depends on NO_HZ_FULL
-	default n
-	help
-	 At least one CPU must keep the scheduling-clock tick running for
-	 timekeeping purposes whenever there is a non-idle CPU, where
-	 "non-idle" also includes dynticks CPUs as long as they are
-	 running non-idle tasks.  Because the underlying adaptive-tick
-	 support cannot distinguish between all CPUs being idle and
-	 all CPUs each running a single task in dynticks mode, the
-	 underlying support simply ensures that there is always a CPU
-	 handling the scheduling-clock tick, whether or not all CPUs
-	 are idle.  This Kconfig option enables scalable detection of
-	 the all-CPUs-idle state, thus allowing the scheduling-clock
-	 tick to be disabled when all CPUs are idle.  Note that scalable
-	 detection of the all-CPUs-idle state means that larger systems
-	 will be slower to declare the all-CPUs-idle state.
-
-	 Say Y if you would like to help debug all-CPUs-idle detection.
-
-	 Say N if you are unsure.
-
-config NO_HZ_FULL_SYSIDLE_SMALL
-	int "Number of CPUs above which large-system approach is used"
-	depends on NO_HZ_FULL_SYSIDLE
-	range 1 NR_CPUS
-	default 8
-	help
-	 The full-system idle detection mechanism takes a lazy approach
-	 on large systems, as is required to attain decent scalability.
-	 However, on smaller systems, scalability is not anywhere near as
-	 large a concern as is energy efficiency.  The sysidle subsystem
-	 therefore uses a fast but non-scalable algorithm for small
-	 systems and a lazier but scalable algorithm for large systems.
-	 This Kconfig parameter defines the number of CPUs in the largest
-	 system that will be considered to be "small".
-
-	 The default value will be fine in most cases.  Battery-powered
-	 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
-	 numbers of CPUs, and (3) are suffering from battery-lifetime
-	 problems due to long sysidle latencies might wish to experiment
-	 with larger values for this Kconfig parameter.  On the other
-	 hand, they might be even better served by disabling NO_HZ_FULL
-	 entirely, given that NO_HZ_FULL is intended for HPC and
-	 real-time workloads that at present do not tend to be run on
-	 battery-powered systems.
-
-	 Take the default if you are unsure.
-
 config NO_HZ
 	bool "Old Idle dynticks config"
 	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
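A worked example of the small/large split this removed help text describes,
using the rcu_sysidle_delay() formula deleted earlier in this patch and
assuming nr_cpu_ids = 64, HZ = 1000, rcu_fanout_leaf = 16, and the default
NO_HZ_FULL_SYSIDLE_SMALL of 8:

	/*
	 * 64 CPUs > SYSIDLE_SMALL (8), so the lazy large-system path applies:
	 *
	 *	delay = DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000)
	 *	      = DIV_ROUND_UP(64 * 1000, 16 * 1000) = 4 jiffies
	 *
	 * so the state machine waited at least 4 jiffies between advances.
	 * An 8-CPU system instead got delay = 0 and took the fast path in
	 * rcu_sys_is_idle(), scanning all CPUs directly.
	 */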
@@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
 CONFIG_NO_HZ_FULL_ALL=n
-CONFIG_NO_HZ_FULL_SYSIDLE=y
 CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
@@ -18,7 +18,6 @@ CONFIG_PROVE_RCU
 
 In common code tested by TREE_RCU test cases.
 
-CONFIG_NO_HZ_FULL_SYSIDLE
 CONFIG_RCU_NOCB_CPU
 
 Meaningless for TINY_RCU.
@@ -9,8 +9,7 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
 CONFIG_HOTPLUG_CPU -- Do half.  (Every second.)
 CONFIG_HZ_PERIODIC -- Do one.
 CONFIG_NO_HZ_IDLE -- Do those not otherwise specified.  (Groups of two.)
-CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE.
-CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
+CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
 CONFIG_PREEMPT -- Do half.  (First three and #8.)
 CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
 CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
@@ -48,10 +47,6 @@ CONFIG_64BIT
 
 	Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
 
-CONFIG_NO_HZ_FULL_SYSIDLE_SMALL
-
-	Defer until Frederic uses this.
-
 CONFIG_PREEMPT_COUNT
 CONFIG_PREEMPT_RCU